In [1]:
import sqlite3
import pandas as pd
# pd.set_option('display.max_rows', None)

In [2]:
# path to SQLite db
db_path = '../Resources/database.sqlite'

# connect to the SQLite db
conn = sqlite3.connect(db_path)

# get a list of all tables in the db
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(query, conn)

# create a dictionary to hold the dfs
df_dict = {}

# load the tables into a df
for table_name in tables['name']:
    query = f'SELECT * FROM {table_name}'
    
    # convert database table to df, add to df_dict, and export df to csv
    df = pd.read_sql_query(query, conn)
    df_dict[table_name] = df
    # df.to_csv(f'../Resources/{table_name}.csv', index=False, header=True)

# close the db connection
conn.close()

In [3]:
# check matches table info
matches_df = df_dict['matches']

matches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935359 entries, 0 to 935358
Data columns (total 81 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   tourney_id           932936 non-null  object 
 1   tourney_name         935359 non-null  object 
 2   surface              928966 non-null  object 
 3   draw_size            934524 non-null  object 
 4   tourney_level        935359 non-null  object 
 5   tourney_date         932387 non-null  float64
 6   match_num            935359 non-null  object 
 7   winner_id            908960 non-null  float64
 8   winner_seed          398676 non-null  object 
 9   winner_entry         173132 non-null  object 
 10  winner_name          908960 non-null  object 
 11  winner_hand          889495 non-null  object 
 12  winner_ht            550391 non-null  float64
 13  winner_ioc           908862 non-null  object 
 14  winner_age           895775 non-null  float64
 15  loser_id         

In [4]:
# lists for filtering winner_entry and tourney_level columns
tourney_levels = ['G', 'M', 'A', 'F']
rounds = ['R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']

# filter df by tourney_levels and rounds lists
tourney_entries_df = matches_df.loc[(matches_df['round'].isin(rounds)) & (matches_df['tourney_level'].isin(tourney_levels))]
tourney_entries_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194618 entries, 0 to 929454
Data columns (total 81 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   tourney_id           194618 non-null  object 
 1   tourney_name         194618 non-null  object 
 2   surface              192777 non-null  object 
 3   draw_size            193798 non-null  object 
 4   tourney_level        194618 non-null  object 
 5   tourney_date         194618 non-null  float64
 6   match_num            194618 non-null  object 
 7   winner_id            194618 non-null  float64
 8   winner_seed          70630 non-null   object 
 9   winner_entry         16715 non-null   object 
 10  winner_name          194618 non-null  object 
 11  winner_hand          178899 non-null  object 
 12  winner_ht            163715 non-null  float64
 13  winner_ioc           194610 non-null  object 
 14  winner_age           187847 non-null  float64
 15  loser_id             1

In [5]:
# create year column to filter and change dtype to int
tourney_entries_df['year'] = tourney_entries_df['tourney_id'].str.split('-', expand=True)[0]
tourney_entries_df = tourney_entries_df.astype({'year': 'int64'})

# change tourney_date to date type
tourney_entries_df['tourney_date'] = pd.to_datetime(tourney_entries_df['tourney_date'], format='%Y%m%d')

# check dtypes to confirm changes
display(tourney_entries_df['year'].dtype)
display(tourney_entries_df.dtypes[:6])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_df['year'] = tourney_entries_df['tourney_id'].str.split('-', expand=True)[0]


dtype('int64')

tourney_id               object
tourney_name             object
surface                  object
draw_size                object
tourney_level            object
tourney_date     datetime64[ns]
dtype: object

In [6]:
# filter by matches during 2009 season and after
tourney_entries_2009s_df = tourney_entries_df.loc[tourney_entries_df['year'] >= 2009]

In [7]:
# split sets to add to new columns in the df
split_sets = tourney_entries_2009s_df['score'].str.split(' ', expand=True)

for i in range(split_sets.shape[1]):
    tourney_entries_2009s_df[f'set_{i + 1}'] = split_sets[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'set_{i + 1}'] = split_sets[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'set_{i + 1}'] = split_sets[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'set_{i + 1}'] = split_sets[i]
A value is trying to be set on a c

In [8]:
# split every set column into the sets won for each player and make new columns for each player

for i in range(1, 6):
    split = tourney_entries_2009s_df[f'set_{i}'].str.split('-', expand=True)
    tourney_entries_2009s_df[f'set_{i}_p1'] = split[0].replace('\((\d+)\)', '', regex=True)
    tourney_entries_2009s_df[f'set_{i}_p2'] = split[1].replace('\((\d+)\)', '', regex=True)
    
tourney_entries_2009s_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'set_{i}_p1'] = split[0].replace('\((\d+)\)', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'set_{i}_p2'] = split[1].replace('\((\d+)\)', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df[f'se

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,set_1_p1,set_1_p2,set_2_p1,set_2_p2,set_3_p1,set_3_p2,set_4_p1,set_4_p2,set_5_p1,set_5_p2
31832,2020-8888,Atp Cup,Hard,24,A,2020-01-06,300,104925.0,,,...,6,2,7,6,,,,,,
31833,2020-8888,Atp Cup,Hard,24,A,2020-01-06,299,105138.0,,,...,7,5,6,1,,,,,,
31834,2020-8888,Atp Cup,Hard,24,A,2020-01-06,298,104925.0,,,...,6,1,5,7,6.0,4.0,,,,
31835,2020-8888,Atp Cup,Hard,24,A,2020-01-06,297,105583.0,,,...,7,5,7,6,,,,,,
31836,2020-8888,Atp Cup,Hard,24,A,2020-01-06,296,104745.0,,,...,4,6,7,5,6.0,1.0,,,,


In [9]:
# drop columns that won't be used as features
cols_to_drop = ["winner1_id","winner2_id","loser1_id","loser2_id","winner1_name" ,"winner1_hand","winner1_ht","winner1_ioc","winner1_age","winner2_name","winner2_hand",
                  "winner2_ht","winner2_ioc","winner2_age","loser1_name","loser1_hand","loser1_ht","loser1_ioc","loser1_age","loser2_name","loser2_hand","loser2_ht",
                  "loser2_ioc","loser2_age","winner1_rank", "winner1_rank_points", "winner2_rank","winner2_rank_points","loser1_rank","loser1_rank_points","loser2_rank","loser2_rank_points",
                  "winner_entry", "loser_entry", "set_1", "set_2", "set_3", "set_4", "set_5", "set_6"]
tourney_entries_2009s_df.drop(columns=cols_to_drop, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourney_entries_2009s_df.drop(columns=cols_to_drop, inplace=True)


In [10]:
# rename columns w/ winner or w to player
tourney_entries_2009s_winners_df = tourney_entries_2009s_df.copy()
tourney_entries_2009s_winners_df = tourney_entries_2009s_winners_df.rename(columns={'winner_id': 'player_id', 'winner_seed': 'player_seed', 'winner_name': 'player_name',
                                                                                    'winner_hand': 'player_hand', 'winner_ht': 'player_ht', 'winner_ioc': 'player_ioc', 'winner_age': 'player_age',
                                                                                    'w_ace': 'player_ace', 'w_df': 'player_df', 'w_svpt': 'player_svpt', 'w_1stIn': 'player_1stIn', 
                                                                                    'w_1stWon': 'player_1stWon', 'w_2ndWon': 'player_2ndWon', 'w_SvGms': 'player_SvGms', 'w_bpSaved': 'player_bpSaved',
                                                                                    'w_bpFaced': 'player_bpFaced', 'winner_rank': 'player_rank', 'winner_rank_points': 'player_rank_points',
                                                                                    'set_1_p1': 'set_1_score', 'set_2_p1': 'set_2_score', 'set_3_p1': 'set_3_score', 'set_4_p1': 'set_4_score',
                                                                                    'set_5_p1': 'set_5_score'})

# drop all loser info
cols_to_drop_2 = ['loser_id', 'loser_seed', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
       'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'loser_rank', 'loser_rank_points', 'year', 'set_1_p2', 'set_2_p2', 'set_3_p2', 'set_4_p2', 'set_5_p2']
tourney_entries_2009s_winners_df.drop(columns=cols_to_drop_2, inplace=True)

# add column for if player won. all winners will get 1
tourney_entries_2009s_winners_df['winner'] = 1

In [11]:
# rename columns w/ loser or l to player 
tourney_entries_2009s_losers_df = tourney_entries_2009s_df.copy()
tourney_entries_2009s_losers_df = tourney_entries_2009s_losers_df.rename(columns={'loser_id': 'player_id', 'loser_seed': 'player_seed', 'loser_name': 'player_name',
                                                                                    'loser_hand': 'player_hand', 'loser_ht': 'player_ht', 'loser_ioc': 'player_ioc', 'loser_age': 'player_age',
                                                                                    'l_ace': 'player_ace', 'l_df': 'player_df', 'l_svpt': 'player_svpt', 'l_1stIn': 'player_1stIn', 
                                                                                    'l_1stWon': 'player_1stWon', 'l_2ndWon': 'player_2ndWon', 'l_SvGms': 'player_SvGms', 'l_bpSaved': 'player_bpSaved',
                                                                                    'l_bpFaced': 'player_bpFaced', 'loser_rank': 'player_rank', 'loser_rank_points': 'player_rank_points',
                                                                                    'set_1_p2': 'set_1_score', 'set_2_p2': 'set_2_score', 'set_3_p2': 'set_3_score', 'set_4_p2': 'set_4_score',
                                                                                    'set_5_p2': 'set_5_score'})

# drop all winner columns
cols_to_drop_3 = ['winner_id', 'winner_seed', 'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'winner_rank', 'winner_rank_points', 'year', 'set_1_p1', 'set_2_p1', 'set_3_p1', 'set_4_p1', 'set_5_p1']
tourney_entries_2009s_losers_df.drop(columns=cols_to_drop_3, inplace=True)

# add column for if the player won. all losers will be assigned 0
tourney_entries_2009s_losers_df['winner'] = 0

In [12]:
tourney_entries_2009s_winners_df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
31832,2020-8888,Atp Cup,Hard,24,A,2020-01-06,300,104925.0,,Novak Djokovic,...,5.0,5.0,2.0,9055.0,6,7,,,,1
31833,2020-8888,Atp Cup,Hard,24,A,2020-01-06,299,105138.0,,Roberto Bautista Agut,...,3.0,5.0,10.0,2335.0,7,6,,,,1


In [13]:
tourney_entries_2009s_losers_df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
31832,2020-8888,Atp Cup,Hard,24,A,2020-01-06,300,104745.0,,Rafael Nadal,...,6.0,8.0,1.0,9985.0,2,6,,,,0
31833,2020-8888,Atp Cup,Hard,24,A,2020-01-06,299,105583.0,,Dusan Lajovic,...,5.0,10.0,34.0,1251.0,5,1,,,,0


In [14]:
# concat dfs and interleave the rows
winners_losers_df = pd.concat([tourney_entries_2009s_winners_df, tourney_entries_2009s_losers_df]).sort_index().reset_index(drop=True)

# sort by date and match num to keep winners and losers of matches together
winners_losers_df.sort_values(by=['tourney_date', 'match_num'], ascending=False, inplace=True)
display(winners_losers_df.head(4), winners_losers_df.tail(4))

# change dtype of sets to int
for i in range(1, 6):
    winners_losers_df[f'set_{i}_score'] = pd.to_numeric(winners_losers_df[f'set_{i}_score'], errors='coerce')

winners_losers_df.info()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
39312,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209098.0,6.0,Hamad Medjedovic,...,4.0,4.0,110.0,582.0,3,4,4,3,4.0,1
39313,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209950.0,1.0,Arthur Fils,...,1.0,4.0,36.0,1158.0,4,1,2,4,1.0,0
39314,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209950.0,1.0,Arthur Fils,...,2.0,3.0,36.0,1158.0,2,4,4,4,,1
39315,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209414.0,2.0,Luca Van Assche,...,2.0,3.0,70.0,756.0,4,1,3,3,,0


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
61602,2009-339,Brisbane,Hard,32,A,2009-01-04,10,103566.0,,Michael Llodra,...,3.0,5.0,54.0,1400.0,5,7,6.0,,,1
61603,2009-339,Brisbane,Hard,32,A,2009-01-04,10,102512.0,,Joseph Sirianni,...,4.0,7.0,263.0,282.0,7,6,2.0,,,0
61584,2009-339,Brisbane,Hard,32,A,2009-01-04,1,105208.0,,Ernests Gulbis,...,2.0,4.0,53.0,1408.0,6,6,,,,1
61585,2009-339,Brisbane,Hard,32,A,2009-01-04,1,104925.0,1.0,Novak Djokovic,...,9.0,13.0,3.0,10590.0,4,4,,,,0


<class 'pandas.core.frame.DataFrame'>
Index: 76434 entries, 39312 to 61585
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tourney_id          76434 non-null  object        
 1   tourney_name        76434 non-null  object        
 2   surface             76434 non-null  object        
 3   draw_size           76434 non-null  object        
 4   tourney_level       76434 non-null  object        
 5   tourney_date        76434 non-null  datetime64[ns]
 6   match_num           76434 non-null  object        
 7   player_id           76434 non-null  float64       
 8   player_seed         27936 non-null  object        
 9   player_name         76434 non-null  object        
 10  player_hand         76434 non-null  object        
 11  player_ht           75798 non-null  float64       
 12  player_ioc          76434 non-null  object        
 13  player_age          76432 non-null  float64    

In [15]:
import numpy as np

# drop score values over 7 (only occurs in sets 3 and 5)

# testing_copy = testing_copy[(testing_copy['set_3_score'] <= 7) | (testing_copy['set_5_score'] <= 7)]
winners_losers_df.loc[winners_losers_df['set_3_score'] >= 8, 'set_3_score'] = np.nan
winners_losers_df.loc[winners_losers_df['set_5_score'] >= 8, 'set_5_score'] = np.nan

winners_losers_df['set_5_score'].value_counts(dropna=False)
winners_losers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76434 entries, 39312 to 61585
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tourney_id          76434 non-null  object        
 1   tourney_name        76434 non-null  object        
 2   surface             76434 non-null  object        
 3   draw_size           76434 non-null  object        
 4   tourney_level       76434 non-null  object        
 5   tourney_date        76434 non-null  datetime64[ns]
 6   match_num           76434 non-null  object        
 7   player_id           76434 non-null  float64       
 8   player_seed         27936 non-null  object        
 9   player_name         76434 non-null  object        
 10  player_hand         76434 non-null  object        
 11  player_ht           75798 non-null  float64       
 12  player_ioc          76434 non-null  object        
 13  player_age          76432 non-null  float64    

In [16]:
# check unique values of 
display(winners_losers_df['surface'].value_counts())
display(winners_losers_df['player_ioc'].value_counts())
display(winners_losers_df['draw_size'].value_counts())
display(winners_losers_df['tourney_name'].value_counts())

surface
Hard     43944
Clay     23638
Grass     8852
Name: count, dtype: int64

player_ioc
ESP    8747
FRA    7846
USA    7108
GER    4756
ARG    4289
       ... 
UAE       3
ESA       2
ZIM       2
MAS       2
KUW       2
Name: count, Length: 75, dtype: int64

draw_size
32     29262
128    18216
28      9828
64      7848
56      5610
48      2820
96      2660
8        120
24        28
16        18
18        12
12        12
Name: count, dtype: int64

tourney_name
Australian Open         3810
Roland Garros           3810
Wimbledon               3556
US Open                 2794
Indian Wells Masters    2660
                        ... 
Cologne 1                 54
Sardinia                  54
Atp Cup                   52
NextGen Finals            36
United Cup                12
Name: count, Length: 126, dtype: int64

In [17]:
winners_losers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76434 entries, 39312 to 61585
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tourney_id          76434 non-null  object        
 1   tourney_name        76434 non-null  object        
 2   surface             76434 non-null  object        
 3   draw_size           76434 non-null  object        
 4   tourney_level       76434 non-null  object        
 5   tourney_date        76434 non-null  datetime64[ns]
 6   match_num           76434 non-null  object        
 7   player_id           76434 non-null  float64       
 8   player_seed         27936 non-null  object        
 9   player_name         76434 non-null  object        
 10  player_hand         76434 non-null  object        
 11  player_ht           75798 non-null  float64       
 12  player_ioc          76434 non-null  object        
 13  player_age          76432 non-null  float64    

In [181]:
winners_losers_df.to_csv('../Resources/matches_cleaned.csv')