In [622]:
# Importing libraries for data manipulation
import sqlite3
import pandas as pd

# Importing libraries for data visualization
import matplotlib.pyplot as plt
import numpy as np

# Importing libraries for data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Importing libraries for classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report

# Importing libraries for classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

from py_helpers.pipeline_helper import *
from py_helpers.db_helper import *

In [623]:
# Notebook-wide configuration
data = './db/WNBA_filtered.db'  # database path handed to retrieve_data / insert_dataframe
tables = [
    'Awards_Players',
    'Coaches',
    'Players',
    'Players_Teams',
    'Teams',
    'Teams_Year',
]  # table names; the loading cell indexes this list by position
target = 'playoff'  # name of the label column
seed = 42  # random seed (not referenced by the cells shown in this notebook section)

In [624]:
# Load every table listed in `tables`, in order, into its own DataFrame.
# The unpacking relies on `tables` holding exactly these six names in this order.
(
    awards_players_df,
    coaches_df,
    players_df,
    players_teams_df,
    teams_df,
    teams_year_df,
) = (retrieve_data(data, table_name) for table_name in tables)

In [625]:
# Build the per-player, per-season frame: one row per (team, year, player),
# with the team's coach record attached. teams_year_df and players_df are
# not joined here; they are merged in later when df_all is built.
df = (
    teams_df
    .merge(players_teams_df, on=['year', 'tmID'], how='left')
    .merge(coaches_df, on=['year', 'tmID'], how='left')
)

df.columns.tolist()

['tmID',
 'year',
 'playoff',
 'principal_component_1',
 'principal_component_2',
 'playerID',
 'GP',
 'GS',
 'minutes',
 'points',
 'oRebounds',
 'dRebounds',
 'rebounds',
 'assists',
 'steals',
 'blocks',
 'turnovers',
 'PF',
 'fgRatio',
 'ftRatio',
 'threeRatio',
 'dq',
 'p_stint',
 'coachID',
 'won',
 'lost',
 'c_stint']

In [626]:
# Data from the eleventh year
coaches_csv = '../season_11/coaches.csv'
players_teams_csv = '../season_11/players_teams.csv'
teams_csv = '../season_11/teams.csv'

# `stint` exists in both the coaches and the players_teams files; each copy gets
# its own prefixed name (appended as the last column) so the two can coexist
# after the merge.
coaches_df_11 = (
    pd.read_csv(coaches_csv)
    .assign(c_stint=lambda frame: frame['stint'])
    .drop(columns=['stint'])
)

players_teams_df_11 = (
    pd.read_csv(players_teams_csv)
    .assign(p_stint=lambda frame: frame['stint'])
    .drop(columns=['stint'])
)

teams_df_11 = pd.read_csv(teams_csv)

In [627]:
# Restrict each season-11 frame to the columns that also exist in its
# database counterpart. The database frame may have extra columns that the
# season-11 file lacks; those are simply absent from the season-11 frame.

# coaches_df_11 -> columns shared with coaches_df
coaches_df_11_cols = coaches_df_11.columns[
    coaches_df_11.columns.isin(coaches_df.columns)
].tolist()
coaches_df_11 = coaches_df_11[coaches_df_11_cols]

# players_teams_df_11 -> columns shared with players_teams_df
players_teams_df_11_cols = players_teams_df_11.columns[
    players_teams_df_11.columns.isin(players_teams_df.columns)
].tolist()
players_teams_df_11 = players_teams_df_11[players_teams_df_11_cols]

# teams_df_11 -> columns shared with teams_df
teams_df_11_cols = teams_df_11.columns[
    teams_df_11.columns.isin(teams_df.columns)
].tolist()
teams_df_11 = teams_df_11[teams_df_11_cols]

In [628]:
# Season-11 counterpart of `df`: teams joined with their players and coaches
df_11 = (
    teams_df_11
    .merge(players_teams_df_11, on=['year', 'tmID'], how='left')
    .merge(coaches_df_11, on=['year', 'tmID'], how='left')
)
df_11.columns

Index(['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint'], dtype='object')

In [629]:
# Preview the first rows of the merged season-11 frame
df_11.head()

Unnamed: 0,year,tmID,playerID,p_stint,coachID,c_stint
0,11,ATL,balesal01w,0,meadoma99w,0
1,11,ATL,castriz01w,0,meadoma99w,0
2,11,ATL,desouer01w,0,meadoma99w,0
3,11,ATL,hardili01w,0,meadoma99w,0
4,11,ATL,irvinsa01w,0,meadoma99w,0


In [630]:
# Sorted list of every distinct team id seen in either the historical frame
# or the season-11 frame
new_teams = df_11['tmID'].unique().tolist()
team_names = sorted(set(df['tmID'].unique().tolist()) | set(new_teams))
team_names

['ATL',
 'CHA',
 'CHI',
 'CLE',
 'CON',
 'DET',
 'HOU',
 'IND',
 'LAS',
 'MIA',
 'MIN',
 'NYL',
 'ORL',
 'PHO',
 'POR',
 'SAC',
 'SAS',
 'SEA',
 'TUL',
 'UTA',
 'WAS']

In [631]:
# Fit a label encoder on the team ids and replace the ids by their integer codes.
# NOTE: `team_names` is rebound to the encoded array, so the string ids are only
# recoverable through `encoder` after this cell has run.
encoder = LabelEncoder()
team_names = encoder.fit_transform(team_names)
print(team_names)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


In [632]:
# First line: df_11 columns that also exist in df. Second line: all df_11 columns.
# Identical lists mean df_11 has no column that df lacks.
shared_columns = df_11.columns[df_11.columns.isin(df.columns)].tolist()
print(shared_columns)
print(df_11.columns.tolist())

['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint']
['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint']


In [633]:
# Feature columns present in the historical frame but absent from season 11
to_solve = df.columns[~df.columns.isin(df_11.columns)].tolist()
# The target is not a feature, so it is taken out of the list
to_solve.remove(target)

print(to_solve)
print(len(to_solve))

['principal_component_1', 'principal_component_2', 'GP', 'GS', 'minutes', 'points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgRatio', 'ftRatio', 'threeRatio', 'dq', 'won', 'lost']
20


In [634]:
# Join df_11 to df; columns that only exist in df are NaN on the season-11 rows
df_all = df.merge(
    df_11,
    on=['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint'],
    how='outer',
)

# Split the coach award out of the awards table. The awards table keys its rows
# by `playerID`; for the coach award that id is reused as `coachID` so it can be
# joined on the coach instead of the player.
# The guard keeps the cell re-runnable: after the first execution
# `awards_players_df` no longer has `Coach_of_the_Year`, and selecting it again
# would raise a KeyError. On a re-run both frames keep their first-run values.
if 'Coach_of_the_Year' in awards_players_df.columns:
    awards_coach_df = (
        awards_players_df[['playerID', 'year', 'Coach_of_the_Year']]
        .rename(columns={'playerID': 'coachID'})
    )
    awards_players_df = awards_players_df.drop(columns=['Coach_of_the_Year'])

# Attach team-season aggregates, player awards, coach award and player attributes.
# The outer join with teams_year_df can add (year, tmID) rows that have no match
# in df_all; such rows carry a NaN target.
df_all = (
    df_all
    .merge(teams_year_df, on=['year', 'tmID'], how='outer')
    .merge(awards_players_df, on=['playerID', 'year'], how='left')
    .merge(awards_coach_df, on=['coachID', 'year'], how='left')
    .merge(players_df, left_on='playerID', right_on='playerID', how='left')
)

df_all.head()

Unnamed: 0,tmID,year,playoff,principal_component_1,principal_component_2,playerID,GP,GS,minutes,points,...,height,weight,birthYear,pos_C,pos_C_F,pos_F,pos_F_C,pos_F_G,pos_G,pos_G_F
0,ATL,9,0.0,1.863011,1.406214,balesal01w,17.0,9.0,389.0,82.0,...,79.0,218.0,1985.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ATL,9,0.0,1.863011,1.406214,castriz01w,29.0,20.0,671.0,269.0,...,72.0,140.0,1982.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,ATL,9,0.0,1.863011,1.406214,desouer01w,12.0,8.0,277.0,112.0,...,77.0,190.0,1982.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,ATL,9,0.0,1.863011,1.406214,haynikr01w,33.0,3.0,486.0,94.0,...,69.0,147.0,1983.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,ATL,9,0.0,1.863011,1.406214,lacyje01w,33.0,22.0,605.0,189.0,...,75.0,175.0,1983.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [635]:
# Number of rows whose (tmID, year, playerID, coachID) combination repeats an earlier row
df_all.duplicated(subset=['tmID', 'year', 'playerID', 'coachID']).sum()

0

In [636]:
# Number of rows with a missing target after the merge
df_all[target].isna().sum()

233

In [637]:
season_11_rows = df_all[df_all['year'] == 11]

# Shape of the year-11 slice
print(season_11_rows.shape)

# Number of year-11 rows with a missing target
print(season_11_rows[target].isna().sum())

(175, 52)
175


In [638]:
# Shape of the distinct (tmID, year) pairs for year 11
df_all.loc[df_all['year'] == 11, ['tmID', 'year']].drop_duplicates().shape

(12, 2)

In [639]:
# Shape of the rows that are not in year 11 but still have a missing target
missing_outside_11 = df_all['year'].ne(11) & df_all[target].isna()
print(df_all[missing_outside_11].shape)

(58, 52)


In [640]:
df_old_shape = df_all.shape

# Keep every year-11 row plus every row with a known target, i.e. drop the rows
# that are outside year 11 and have a missing target.
# `.copy()` makes the filtered frame an independent object, so later
# `.loc[...] = ...` assignments on it do not raise SettingWithCopyWarning.
keep_rows = (df_all['year'] == 11) | df_all[target].notna()
df_all = df_all[keep_rows].copy()

df_updated_shape = df_all.shape

# Shape before, shape after, and number of rows removed
print(df_old_shape)
print(df_updated_shape)
print(df_old_shape[0] - df_updated_shape[0])

(2378, 52)
(2320, 52)
58


In [641]:
# Columns that contain at least one NaN. The target is excluded from the fill
# so that rows with a missing target keep it as NaN.
columns_with_missing_values = df_all.columns[df_all.isna().any()].tolist()
columns_with_missing_values.remove(target)

# Assign through .loc so the existing frame is updated: NaN -> 0 in those columns
df_all.loc[:, columns_with_missing_values] = df_all.loc[:, columns_with_missing_values].fillna(value=0)

# Re-check: list each column that still has NaN together with its NaN count.
# Only the target column is expected to be listed at this point.
columns_with_missing_values = df_all.columns[df_all.isna().any()].tolist()
for column, n_missing in df_all[columns_with_missing_values].isna().sum().items():
    print(column, n_missing)

playoff 175


In [642]:
# Columns whose dtype is neither boolean nor numeric
non_numeric_columns = df_all.select_dtypes(exclude=['bool', 'number']).columns.tolist()
print(non_numeric_columns)

# tmID is kept; every other non-numeric column is dropped
non_numeric_columns.remove('tmID')
df_all = df_all.drop(non_numeric_columns, axis=1)

# Persist the merged frame as table `merged` in the database at `data`
table_name = 'merged'
column_definitions = schema_builder(df_all)
schema = f"Create Table {table_name} ({column_definitions})"
insert_dataframe(df_all, table_name, data, schema)

['tmID', 'playerID', 'coachID']


In [643]:
# Start from the same outer join of the historical frame with season 11.
# The stint columns of the current season get a `_now` suffix so they do not
# collide with the `p_stint` / `c_stint` columns merged in further down.
df_shift = (
    df.merge(
        df_11,
        on=['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint'],
        how='outer',
    )
    .rename(columns={'p_stint': 'p_stint_now', 'c_stint': 'c_stint_now'})
)

# Every column that has a missing value is dropped, except the target
columns_with_missing_values = df_shift.columns[df_shift.isna().any()].tolist()
columns_with_missing_values.remove(target)

df_shift = (
    df_shift
    .drop(columns=columns_with_missing_values)
    .drop_duplicates()
    # Shift the year back by one (e.g. 11 -> 10) so the merges below pick up
    # the rows of the preceding season
    .assign(year=lambda frame: frame['year'] - 1)
    # Team record for (shifted year, team)
    .merge(teams_df, on=['year', 'tmID'], how='left')
    # Player record for (shifted year, player); the player's tmID from that
    # record is discarded and the left-hand tmID is kept
    .merge(players_teams_df, on=['year', 'playerID'], how='left')
    .rename(columns={'tmID_x': 'tmID'})
    .drop(columns=['tmID_y'])
    # Coach record for (shifted year, coach); same tmID handling as above
    .merge(coaches_df, on=['year', 'coachID'], how='left')
    .rename(columns={'tmID_x': 'tmID'})
    .drop(columns=['tmID_y'])
    # Awards, player attributes and team-season aggregates for the shifted year
    .merge(awards_players_df, on=['playerID', 'year'], how='left')
    .merge(awards_coach_df, on=['coachID', 'year'], how='left')
    .merge(players_df, left_on='playerID', right_on='playerID', how='left')
    .merge(teams_year_df, on=['year', 'tmID'], how='left')
)

# When a merge produced a second playoff column, keep the left-hand one
# (playoff_x) as the target and discard playoff_y
if 'playoff_y' in df_shift.columns:
    df_shift = (
        df_shift
        .drop(columns=['playoff_y'])
        .rename(columns={'playoff_x': 'playoff'})
    )

# Shift the year forward again (e.g. 10 -> 11)
df_shift = df_shift.assign(year=df_shift['year'] + 1)

# Remove year 1
df_shift = df_shift[df_shift['year'] != 1]

df_shift.tail()

Unnamed: 0,tmID,year,playoff,playerID,p_stint_now,coachID,c_stint_now,principal_component_1,principal_component_2,GP,...,pos_C_F,pos_F,pos_F_C,pos_F_G,pos_G,pos_G_F,totalPoints,cumulativePoints,totalMinutes,yearlyWins
2367,WAS,11,,langhcr01w,0,laceytr99w,0,1.631433,0.226761,34.0,...,0,0,1,0,0,0,2585.0,23442.0,6873.0,16.0
2368,WAS,11,,milleke01w,0,laceytr99w,0,1.631433,0.226761,34.0,...,0,0,0,0,1,0,2585.0,23442.0,6873.0,16.0
2369,WAS,11,,phillta02w,0,laceytr99w,0,1.631433,0.226761,,...,0,0,0,0,0,0,2585.0,23442.0,6873.0,16.0
2370,WAS,11,,thomaja01w,0,laceytr99w,0,1.631433,0.226761,,...,0,0,0,0,1,0,2585.0,23442.0,6873.0,16.0
2371,WAS,11,,walkede01w,0,laceytr99w,0,1.631433,0.226761,34.0,...,0,1,0,0,0,0,2585.0,23442.0,6873.0,16.0


In [644]:
# First line: columns present in df_shift but not in df_all.
# Second line: columns present in df_all but not in df_shift.
print(df_shift.columns[~df_shift.columns.isin(df_all.columns)].tolist())
print(df_all.columns[~df_all.columns.isin(df_shift.columns)].tolist())

['playerID', 'p_stint_now', 'coachID', 'c_stint_now']
[]


In [645]:
# Number of df_shift rows with a missing target after the merges
df_shift[target].isna().sum()

177

In [646]:
# Each df_shift column that has missing values, with its NaN count
columns_with_missing_values = df_shift.columns[df_shift.isna().any()].tolist()
missing_counts = df_shift[columns_with_missing_values].isna().sum()
for column, n_missing in missing_counts.items():
    print(column, n_missing)

playoff 177
principal_component_1 97
principal_component_2 97
GP 559
GS 559
minutes 559
points 559
oRebounds 559
dRebounds 559
rebounds 559
assists 559
steals 559
blocks 559
turnovers 559
PF 559
fgRatio 559
ftRatio 559
threeRatio 559
dq 559
p_stint 559
won 749
lost 749
c_stint 749
All_Star_Game_Most_Valuable_Player 2084
Defensive_Player_of_the_Year 2084
Kim_Perrot_Sportsmanship_Award 2084
Most_Improved_Player 2084
Most_Valuable_Player 2084
Rookie_of_the_Year 2084
Sixth_Woman_of_the_Year 2084
WNBA_All_Decade_Team_Honorable_Mention 2084
WNBA_All_Decade_Team 2084
WNBA_Finals_Most_Valuable_Player 2084
Coach_of_the_Year 2001
totalPoints 32
cumulativePoints 32
totalMinutes 32
yearlyWins 32


In [647]:
# Distinct (tmID, year, playerID, coachID) combinations of the rows whose target
# is missing, ordered by year and then by tmID
sample = (
    df_shift.loc[df_shift['playoff'].isna(), ['tmID', 'year', 'playerID', 'coachID']]
    .drop_duplicates()
    .sort_values(by=['year', 'tmID'])
)
sample.head()

Unnamed: 0,tmID,year,playerID,coachID
2195,ATL,11,balesal01w,meadoma99w
2196,ATL,11,castriz01w,meadoma99w
2197,ATL,11,desouer01w,meadoma99w
2198,ATL,11,hardili01w,meadoma99w
2199,ATL,11,irvinsa01w,meadoma99w


In [648]:
# Same missing-value report as for the whole frame, restricted to year 11
season_11_shift = df_shift[df_shift['year'] == 11]
columns_with_missing_values_11 = season_11_shift.columns[season_11_shift.isna().any()].tolist()
for column in columns_with_missing_values_11:
    print(column, season_11_shift[column].isna().sum())
    

playoff 177
principal_component_1 32
principal_component_2 32
GP 64
GS 64
minutes 64
points 64
oRebounds 64
dRebounds 64
rebounds 64
assists 64
steals 64
blocks 64
turnovers 64
PF 64
fgRatio 64
ftRatio 64
threeRatio 64
dq 64
p_stint 64
won 80
lost 80
c_stint 80
All_Star_Game_Most_Valuable_Player 170
Defensive_Player_of_the_Year 170
Kim_Perrot_Sportsmanship_Award 170
Most_Improved_Player 170
Most_Valuable_Player 170
Rookie_of_the_Year 170
Sixth_Woman_of_the_Year 170
WNBA_All_Decade_Team_Honorable_Mention 170
WNBA_All_Decade_Team 170
WNBA_Finals_Most_Valuable_Player 170
Coach_of_the_Year 164
totalPoints 32
cumulativePoints 32
totalMinutes 32
yearlyWins 32


In [649]:
# Distinct (tmID, year) pairs of year 11
season_11_pairs = df_shift.loc[df_shift['year'] == 11, ['tmID', 'year']].drop_duplicates()

# Evaluated but not rendered: only the final expression of a cell is displayed
season_11_pairs.shape

# The pairs themselves, as a list of [tmID, year]
season_11_pairs.values.tolist()

[['ATL', 11],
 ['CHI', 11],
 ['CON', 11],
 ['IND', 11],
 ['LAS', 11],
 ['MIN', 11],
 ['NYL', 11],
 ['PHO', 11],
 ['SAS', 11],
 ['SEA', 11],
 ['TUL', 11],
 ['WAS', 11]]

In [650]:
# Persist df_shift as table `merged_shift` in the database at `data`
table_name = 'merged_shift'
column_definitions = schema_builder(df_shift)
schema = f"Create Table {table_name} ({column_definitions})"
insert_dataframe(df_shift, table_name, data, schema)