In [1]:
# Importing libraries for data manipulation
import sqlite3
import pandas as pd

# Importing libraries for data visualization
import matplotlib.pyplot as plt
import numpy as np

# Importing libraries for data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Importing libraries for classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report

# Importing libraries for classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

from py_helpers.pipeline_helper import *
from py_helpers.db_helper import *

In [2]:
seed = 42
target = 'playoff'
data = './db/WNBA_filtered.db'
tables = ['Awards_Players', 'Coaches', 'Players', 'Players_Teams', 'Teams', 'Teams_Year']

In [3]:
awards_players_df = retrieve_data(data, tables[0])  
coaches_df = retrieve_data(data, tables[1])  
players_df = retrieve_data(data, tables[2])  
players_teams_df = retrieve_data(data, tables[3])  
teams_df = retrieve_data(data, tables[4])  
teams_year_df = retrieve_data(data, tables[5])

In [4]:
# Join the DataFrames
df = teams_df.merge(players_teams_df, on=['year', 'tmID'], how='left') \
                   .merge(coaches_df, on=['year', 'tmID'], how='left') \
                   .merge(awards_players_df, on=['playerID', 'year'], how='left') \
                   .merge(teams_year_df, on=['year', 'tmID'], how='left')
                   #.merge(players_df, left_on='playerID', right_on='playerID', how='left') \

# Drop the columns that are not needed

list(df.columns)

['year',
 'tmID',
 'rank',
 'playoff',
 'o_fgm',
 'o_fga',
 'o_ftm',
 'o_fta',
 'o_3pm',
 'o_3pa',
 'o_oreb',
 'o_dreb',
 'o_reb',
 'o_asts',
 'o_pf',
 'o_stl',
 'o_to',
 'o_blk',
 'o_pts',
 'd_fgm',
 'd_fga',
 'd_ftm',
 'd_fta',
 'd_3pm',
 'd_3pa',
 'd_oreb',
 'd_dreb',
 'd_reb',
 'd_asts',
 'd_pf',
 'd_stl',
 'd_to',
 'd_blk',
 'd_pts',
 'homeW',
 'homeL',
 'awayW',
 'awayL',
 'last_year_rank',
 'playerID',
 'GP',
 'GS',
 'minutes',
 'points',
 'oRebounds',
 'dRebounds',
 'rebounds',
 'assists',
 'steals',
 'blocks',
 'turnovers',
 'PF',
 'fgRatio',
 'ftRatio',
 'threeRatio',
 'dq',
 'p_stint',
 'coachID',
 'won',
 'lost',
 'c_stint',
 'All_Star_Game_Most_Valuable_Player',
 'Defensive_Player_of_the_Year',
 'Kim_Perrot_Sportsmanship_Award',
 'Most_Improved_Player',
 'Most_Valuable_Player',
 'Rookie_of_the_Year',
 'Sixth_Woman_of_the_Year',
 'WNBA_All_Decade_Team_Honorable_Mention',
 'WNBA_All_Decade_Team',
 'WNBA_Finals_Most_Valuable_Player',
 'totalPoints',
 'cumulativePoints',
 'tot

In [5]:
# Data from the eleveth year
coaches_csv = '../season_11/coaches.csv'
players_teams_csv = '../season_11/players_teams.csv'
teams_csv = '../season_11/teams.csv'

coaches_df_11 = pd.read_csv(coaches_csv)
coaches_df_11['c_stint'] = coaches_df_11['stint']
coaches_df_11 = coaches_df_11.drop(columns=['stint'])

players_teams_df_11 = pd.read_csv(players_teams_csv)
players_teams_df_11['p_stint'] = players_teams_df_11['stint']
players_teams_df_11 = players_teams_df_11.drop(['stint'], axis=1)

teams_df_11 = pd.read_csv(teams_csv)

In [6]:
# Having coaches_df and coaches_df_11
# Remove coaches_df_11 columns that are not in coaches_df
# Coaches_df could have more columns than coaches_df_11
coaches_df_11_cols = [x for x in coaches_df_11.columns if x in coaches_df.columns]
coaches_df_11 = coaches_df_11[coaches_df_11_cols]

# Having players_teams_df and players_teams_df_11
# Remove players_teams_df_11 columns that are not in players_teams_df
# players_teams_df could have more columns than players_teams_df_11
players_teams_df_11_cols = [x for x in players_teams_df_11.columns if x in players_teams_df.columns]
players_teams_df_11 = players_teams_df_11[players_teams_df_11_cols]

# Having df_teams and teams_df
# Remove df_teams columns that are not in teams_df
# teams_df could have more columns than df_teams
teams_df_11_cols = [x for x in teams_df_11.columns if x in teams_df.columns]
teams_df_11 = teams_df_11[teams_df_11_cols]

In [7]:
# Merge the DataFrames of the eleventh year
df_11 = teams_df_11.merge(players_teams_df_11, on=['year', 'tmID'], how='left') \
                     .merge(coaches_df_11, on=['year', 'tmID'], how='left')
df_11.columns

Index(['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint'], dtype='object')

In [8]:
df_11.head()

Unnamed: 0,year,tmID,playerID,p_stint,coachID,c_stint
0,11,ATL,balesal01w,0,meadoma99w,0
1,11,ATL,castriz01w,0,meadoma99w,0
2,11,ATL,desouer01w,0,meadoma99w,0
3,11,ATL,hardili01w,0,meadoma99w,0
4,11,ATL,irvinsa01w,0,meadoma99w,0


In [9]:
team_names = df['tmID'].unique().tolist()
new_teams = df_11['tmID'].unique().tolist()
team_names = list(set(team_names + new_teams))
team_names.sort()
team_names

['ATL',
 'CHA',
 'CHI',
 'CLE',
 'CON',
 'DET',
 'HOU',
 'IND',
 'LAS',
 'MIA',
 'MIN',
 'NYL',
 'ORL',
 'PHO',
 'POR',
 'SAC',
 'SAS',
 'SEA',
 'TUL',
 'UTA',
 'WAS']

In [10]:
len(team_names)
encoder = LabelEncoder()
team_names = encoder.fit_transform(team_names)
print(team_names)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


In [11]:
print([x for x in df_11.columns if x in df.columns])
print(list(df_11.columns))

['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint']
['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint']


In [12]:
to_solve = [x for x in df.columns if x not in df_11.columns]
print(list(to_solve))
print(len(to_solve))

['rank', 'playoff', 'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm', 'o_3pa', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm', 'd_3pa', 'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_pts', 'homeW', 'homeL', 'awayW', 'awayL', 'last_year_rank', 'GP', 'GS', 'minutes', 'points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgRatio', 'ftRatio', 'threeRatio', 'dq', 'won', 'lost', 'All_Star_Game_Most_Valuable_Player', 'Defensive_Player_of_the_Year', 'Kim_Perrot_Sportsmanship_Award', 'Most_Improved_Player', 'Most_Valuable_Player', 'Rookie_of_the_Year', 'Sixth_Woman_of_the_Year', 'WNBA_All_Decade_Team_Honorable_Mention', 'WNBA_All_Decade_Team', 'WNBA_Finals_Most_Valuable_Player', 'totalPoints', 'cumulativePoints', 'totalMinutes', 'yearlyWins']
69


In [13]:
# Join df_11 to df and assing missing values to NaN
df_all =  df.merge(df_11, on=['year', 'tmID', 'playerID', 'p_stint', 'coachID', 'c_stint'], how='left') \
            .merge(players_df, left_on='playerID', right_on='playerID', how='left')
df_all.head()

Unnamed: 0,year,tmID,rank,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,height,weight,birthYear,pos_C,pos_C_F,pos_F,pos_F_C,pos_F_G,pos_G,pos_G_F
0,9,ATL,7,0,895,2258,542,725,202,598,...,79.0,218,1985.0,1,0,0,0,0,0,0
1,9,ATL,7,0,895,2258,542,725,202,598,...,72.0,140,1982.0,0,0,0,0,1,0,0
2,9,ATL,7,0,895,2258,542,725,202,598,...,77.0,190,1982.0,0,0,0,1,0,0,0
3,9,ATL,7,0,895,2258,542,725,202,598,...,69.0,147,1983.0,0,0,0,0,0,1,0
4,9,ATL,7,0,895,2258,542,725,202,598,...,75.0,175,1983.0,0,0,1,0,0,0,0


In [16]:
# Checking for missing values after the merge
# Check for missing values in each column
columns_with_missing_values = df_all.columns[df_all.isna().any()].tolist()
for column in columns_with_missing_values:
    print(column, df_all[column].isna().sum())

All_Star_Game_Most_Valuable_Player 2070
Defensive_Player_of_the_Year 2070
Kim_Perrot_Sportsmanship_Award 2070
Most_Improved_Player 2070
Most_Valuable_Player 2070
Rookie_of_the_Year 2070
Sixth_Woman_of_the_Year 2070
WNBA_All_Decade_Team_Honorable_Mention 2070
WNBA_All_Decade_Team 2070
WNBA_Finals_Most_Valuable_Player 2070


In [18]:
# Using loc to modify the DataFrame
# Fill the missing values with 0
df_all.loc[:, columns_with_missing_values] = df_all[columns_with_missing_values].fillna(0)

In [None]:
# Checking for missing values after the merge
# Check for missing values in each column
columns_with_missing_values = df_all.columns[df_all.isna().any()].tolist()
for column in columns_with_missing_values:
    print(column, df_all[column].isna().sum())

# Expecting to see no missing values

All_Star_Game_Most_Valuable_Player 2070
Defensive_Player_of_the_Year 2070
Kim_Perrot_Sportsmanship_Award 2070
Most_Improved_Player 2070
Most_Valuable_Player 2070
Rookie_of_the_Year 2070
Sixth_Woman_of_the_Year 2070
WNBA_All_Decade_Team_Honorable_Mention 2070
WNBA_All_Decade_Team 2070
WNBA_Finals_Most_Valuable_Player 2070
