# Data Mining Project

## Model Evaluation Pipeline

In order to evaluate a model's performance with a certain data we have to define and structure a pipeline that collects information about the the model's performance metrics and a way to test the model with unknown data.


Dicas para avaliação de modelos:
Problema cronológico -> Usar apenas dados passados para prever os futuros
Problema da Granularidade -> usar dados passados para prever o proximo ano progressivamente (ex: treino:6 anos teste:7ºano -> treino:7 anos teste:8ºano)


In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

DATA_PATH = 'data'
DATA_TEAMS = 'teams.csv'
DATA_COACHES = 'coaches.csv'
DATA_PLAYERS = 'players.csv'
DATA_AWARDS = 'awards_players.csv'
DATA_PLAYERS_TEAMS = 'players_teams.csv'
DATA_SERIES_POST = 'series_post.csv'
DATA_TEAMS_POST = 'teams_post.csv'

In [12]:
# Merge Tables

# Read the CSV files into data frames
awards_df = pd.read_csv(f'{DATA_PATH}/{DATA_AWARDS}')
coaches_df = pd.read_csv(f'{DATA_PATH}/{DATA_COACHES}')
players_df = pd.read_csv(f'{DATA_PATH}/{DATA_PLAYERS}')
players_teams_df = pd.read_csv(f'{DATA_PATH}/{DATA_PLAYERS_TEAMS}')
series_post_df = pd.read_csv(f'{DATA_PATH}/{DATA_SERIES_POST}')
teams_post_df = pd.read_csv(f'{DATA_PATH}/{DATA_TEAMS_POST}')
teams_df = pd.read_csv(f'{DATA_PATH}/{DATA_TEAMS}')

# Merge tables one by one based on common columns
players_df = players_df.rename(columns={'bioID': 'playerID'})

# FIND way to merge players with awards (maybe add list of awards per year)
#merged_df = pd.merge(players_df, awards_df, on="playerID", how="left") # ADDS MISSING Values

# series_post is a useless table for training, since we can't use it to predict playoff

merged_df = pd.merge(players_teams_df, players_df, on="playerID", how="left")
merged_df.to_csv('merged_data.csv', index=False)
#merged_df = pd.merge(teams_df, merged_df, on=[], how="left")


# merged_df = pd.merge(players_teams_df, players_df, on="playerID", how="inner")
# merged_df = pd.merge(merged_df, awards_df, on="playerID", how="left")
# merged_df = pd.merge(merged_df, coaches_df, on="coachID", how="left")
# merged_df = pd.merge(merged_df, series_post_df, on="year", how="left")
# merged_df = pd.merge(merged_df, teams_post_df, on="year", how="left")
# merged_df = pd.merge(merged_df, teams_df, on="tmID", how="left")



# Save the result to a new CSV file
#merged_df.to_csv('merged_data.csv', index=False)

print(merged_df.head())

     playerID  year_x  stint tmID lgID_x  GP  GS  minutes  points  oRebounds  \
0  abrossv01w       2      0  MIN   WNBA  26  23      846     343         43   
1  abrossv01w       3      0  MIN   WNBA  27  27      805     314         45   
2  abrossv01w       4      0  MIN   WNBA  30  25      792     318         44   
3  abrossv01w       5      0  MIN   WNBA  22  11      462     146         17   
4  abrossv01w       6      0  MIN   WNBA  31  31      777     304         29   

   ...  lastseason  height  weight      college  collegeOther   birthDate  \
0  ...           0    74.0     169  Connecticut           NaN  1980-07-09   
1  ...           0    74.0     169  Connecticut           NaN  1980-07-09   
2  ...           0    74.0     169  Connecticut           NaN  1980-07-09   
3  ...           0    74.0     169  Connecticut           NaN  1980-07-09   
4  ...           0    74.0     169  Connecticut           NaN  1980-07-09   

    deathDate  award  year_y  lgID_y  
0  0000-00-00    

## Data Cleaning



In [14]:
#Remove useless collumns
merged_df.drop(columns=['lgID','deathDate', 'firstseason', 'lastseason', 'seeded'])

#Remove revealing collumns
merged_df.drop(columns=['won','lost', 'firstseason', 'lastseason', 'seeded'])

#Remove collum with many missing values
merged_df.drop(columns=['collegeOther'])

Unnamed: 0,playerID,award,year,lgID_x,coachID,tmID_x,stint_x,won_x,lost_x,post_wins,...,GP_y,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,thompti01w,All-Star Game Most Valuable Player,1,WNBA,adubari99w,NYL,0,20,12,4,...,32,14,2,13,3,17,4,6475,196077,Compaq Center
1,thompti01w,All-Star Game Most Valuable Player,1,WNBA,adubari99w,NYL,0,20,12,4,...,32,14,2,13,3,17,4,6475,196077,Compaq Center
2,thompti01w,All-Star Game Most Valuable Player,1,WNBA,adubari99w,NYL,0,20,12,4,...,32,14,2,13,3,17,4,6475,196077,Compaq Center
3,thompti01w,All-Star Game Most Valuable Player,1,WNBA,adubari99w,NYL,0,20,12,4,...,32,14,2,13,3,17,4,6475,196077,Compaq Center
4,thompti01w,All-Star Game Most Valuable Player,1,WNBA,adubari99w,NYL,0,20,12,4,...,32,14,2,13,3,17,4,6475,196077,Compaq Center
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73579,pondeca01w,WNBA Finals Most Valuable Player,8,WNBA,zierddo99w,MIN,0,10,24,0,...,34,9,8,11,6,13,9,6875,128680,AT&T Center
73580,pondeca01w,WNBA Finals Most Valuable Player,8,WNBA,zierddo99w,MIN,0,10,24,0,...,34,9,8,11,6,13,9,6875,128680,AT&T Center
73581,pondeca01w,WNBA Finals Most Valuable Player,8,WNBA,zierddo99w,MIN,0,10,24,0,...,34,9,8,11,6,13,9,6875,128680,AT&T Center
73582,pondeca01w,WNBA Finals Most Valuable Player,8,WNBA,zierddo99w,MIN,0,10,24,0,...,34,9,8,11,6,13,9,6875,128680,AT&T Center
