# Tennis Game

## Step 1: Data reading


1- Importing necessary libraries

In [1]:
import pandas as pd
import os
import zipfile

2- Unzipping data files

In [2]:
folder = '/Users/mahlagha/TennisGame_Repository/data/raw/202405'
extension = ".zip"
# Kept from the original cell: switches the process working directory to the
# data folder. The extraction below no longer depends on it, but other cells
# may rely on relative paths -- TODO confirm before removing.
os.chdir(folder)

# Extract every archive that sits directly inside `folder` into `folder`.
for item in os.listdir(folder):
    if item.endswith(extension):
        # Build the full path so opening the archive does not depend on the
        # current working directory.
        archive_path = os.path.join(folder, item)
        # The context manager closes the archive after extraction; previously
        # one file handle per archive was left open.
        with zipfile.ZipFile(archive_path) as zip_ref:
            zip_ref.extractall(folder)

3- Making a dictionary of empty dataframes

In [3]:
# One empty DataFrame per table name; every key gets its own distinct frame
# object, in the order listed here.
dataframes = {
    table_name: pd.DataFrame()
    for table_name in (
        'match_time_info',
        'match_season_info',
        'match_round_info',
        'match_venue_info',
        'match_event_info',
        'match_tournament_info',
        'match_awayteam_info',
        'match_awayscore_info',
        'match_hometeam_info',
        'match_homescore_info',
        'odds_info',
        'match_votes_info',
        'game_info',
        'power_info',
        'period_info',
    )
}

4- Function for loading and concatenating dataframes

In [4]:
def load_and_concat(df, file_path):
    """Append the rows of a parquet file to an existing DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows collected so far; it is not modified.
    file_path : str
        Location of the parquet file, passed to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` followed by the rows read from
        ``file_path``; the index is renumbered (``ignore_index=True``).
    """
    pieces = [df, pd.read_parquet(file_path)]
    return pd.concat(pieces, ignore_index=True)


5- Mapping DataFrame names to file-name prefixes

In [6]:
# Despite the variable name, each key is a table name used in `dataframes`
# and each value is the leading part of the file names that feed that table
# (the loading loop compares file names with `str.startswith`).
prefix_to_key = dict(
    match_time_info='time',
    match_season_info='season',
    match_round_info='round',
    match_venue_info='venue',
    match_event_info='event',
    match_tournament_info='tournament',
    match_awayteam_info='away_team_1',
    match_awayscore_info='away_team_score',
    match_hometeam_info='home_team_1',
    match_homescore_info='home_team_score',
    odds_info='odds',
    match_votes_info='votes',
    game_info='pbp',
    power_info='power',
    period_info='statistics',
)


6- Loop over each directory and load the matching parquet files

In [7]:
path = "/Users/mahlagha/TennisGame_Repository/data/raw/202405/data/raw"

# Collect the frames per table and concatenate once at the end. Calling
# pd.concat after every file copies all previously loaded rows each time,
# which is quadratic in the number of files.
frames_by_table = {prefix: [] for prefix in prefix_to_key}

# Directory listings are sorted so the row order of the result is the same on
# every run (os.listdir returns entries in arbitrary order).
for f in sorted(os.listdir(path)):
    entry_path = os.path.join(path, f)
    if not os.path.isdir(entry_path):
        continue
    for s in sorted(os.listdir(entry_path)):
        for prefix, key in prefix_to_key.items():
            if s.startswith(key):
                file_path = os.path.join(entry_path, s)
                frames_by_table[prefix].append(pd.read_parquet(file_path))
                # A file feeds at most one table: stop at the first match.
                break

# Each table is rebuilt from scratch, so re-running this cell does not append
# duplicate rows to frames loaded by a previous run.
for prefix, frames in frames_by_table.items():
    if frames:
        dataframes[prefix] = pd.concat(frames, ignore_index=True)
    else:
        dataframes[prefix] = pd.DataFrame()

print('Completed')

Completed


7- Saving each dataframe as a CSV file and printing its shape

In [8]:
csv_path = "/Users/mahlagha/TennisGame_Repository/data/processed"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(csv_path, exist_ok=True)

# Write one CSV per table (without the index column) and report its shape.
for key, df in dataframes.items():
    csv_file_path = os.path.join(csv_path, f"{key}.csv")
    df.to_csv(csv_file_path, index=False)
    print(f'Shape of {key} is: {df.shape}')

Shape of match_time_info is: (9319, 7)
Shape of match_season_info is: (9319, 4)
Shape of match_round_info is: (5790, 5)
Shape of match_venue_info is: (9286, 5)
Shape of match_event_info is: (9319, 10)
Shape of match_tournament_info is: (9319, 16)
Shape of match_awayteam_info is: (6143, 18)
Shape of match_awayscore_info is: (9319, 14)
Shape of match_hometeam_info is: (6670, 18)
Shape of match_homescore_info is: (9319, 14)
Shape of odds_info is: (15774, 11)
Shape of match_votes_info is: (9319, 3)
Shape of game_info is: (751256, 13)
Shape of power_info is: (136637, 5)
Shape of period_info is: (402853, 13)
