# Data Mining Project

## Model Evaluation Pipeline

To evaluate a model's performance on a given dataset, we have to define and structure a pipeline that collects the model's performance metrics and provides a way to test the model on unseen data.


Tips for model evaluation:

- Chronological problem -> use only past data to predict future data.
- Granularity problem -> use past data to predict the next year progressively (e.g. train: 6 years, test: 7th year -> train: 7 years, test: 8th year).


In [10]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

# Directory that holds the raw CSV files; it is joined with each file name
# below (as f'{DATA_PATH}/{DATA_...}') when the tables are read.
DATA_PATH = 'data'
# File names of the individual raw tables, resolved relative to DATA_PATH.
DATA_TEAMS = 'teams.csv'
DATA_COACHES = 'coaches.csv'
DATA_PLAYERS = 'players.csv'
DATA_AWARDS = 'awards_players.csv'
DATA_PLAYERS_TEAMS = 'players_teams.csv'
DATA_SERIES_POST = 'series_post.csv'
DATA_TEAMS_POST = 'teams_post.csv'

In [28]:
# Merge Tables
#
# Builds `merged_df` by joining the raw tables, one at a time, onto the
# player-per-team rows, then writes the result to 'merged_data.csv'.
# Every raw table is re-read at the top of the cell, so the cell can be
# re-run on its own without depending on state left by a previous run.

# Read the CSV files into data frames
awards_df = pd.read_csv(f'{DATA_PATH}/{DATA_AWARDS}')
coaches_df = pd.read_csv(f'{DATA_PATH}/{DATA_COACHES}')
players_df = pd.read_csv(f'{DATA_PATH}/{DATA_PLAYERS}')
players_teams_df = pd.read_csv(f'{DATA_PATH}/{DATA_PLAYERS_TEAMS}')
series_post_df = pd.read_csv(f'{DATA_PATH}/{DATA_SERIES_POST}')
teams_post_df = pd.read_csv(f'{DATA_PATH}/{DATA_TEAMS_POST}')
teams_df = pd.read_csv(f'{DATA_PATH}/{DATA_TEAMS}')

# Merge tables one by one based on common columns.
# The players table keys its rows by 'bioID'; rename it so it matches the
# 'playerID' key used by the other tables.
players_df = players_df.rename(columns={'bioID': 'playerID'})

# Rename 'GP' on both sides that carry it so the player-level and team-level
# columns do not collide once the tables are merged.
players_teams_df = players_teams_df.rename(columns={'GP': 'GP_player_team'})
merged_df = pd.merge(players_teams_df, players_df, on="playerID", how="left")

# From here on every merge is a right join with `merged_df` as the right
# operand: all rows of `merged_df` are kept and the columns of the newly
# merged table end up on the left of the frame.
# TODO: find a better way to merge players with awards (maybe a list of
# awards per year); a plain merge on "playerID" alone adds missing values.
merged_df = pd.merge(awards_df, merged_df, on=["playerID", "lgID", "year"], how="right")

# NOTE(review): the data-cleaning step later drops 'post_wins'/'post_losses',
# which suggests the 'post_win'/'post_lost' keys below match no column
# (rename ignores missing keys by default) - confirm against coaches.csv.
coaches_df = coaches_df.rename(columns={
    'won': 'won_coaches',
    'lost': 'lost_coaches',
    'post_win': 'post_win_coaches',
    'post_lost': 'post_lost_coaches',
    'stint': 'stint_coaches',
})
merged_df = pd.merge(coaches_df, merged_df, on=["year", "lgID", "tmID"], how="right")

teams_df = teams_df.rename(columns={'GP': 'GP_team'})
merged_df = pd.merge(teams_df, merged_df, on=["year", "lgID", "tmID"], how="right")

# teams_post and series_post are loaded but intentionally not merged:
# series_post is a useless table for training, since we can't use it to
# predict playoff.

# Save the result to a new CSV file
merged_df.to_csv('merged_data.csv', index=False)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that appends a stray "None" line to the output).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2160 entries, 0 to 2159
Columns: 117 entries, year to deathDate
dtypes: float64(2), int64(97), object(18)
memory usage: 1.9+ MB
None


## Data Cleaning



In [29]:
# Data cleaning: drop columns that must not reach the model, then persist the
# cleaned frame to 'merged_data.csv' (overwriting the file).
#
# The drop is deliberately strict (default errors='raise'): a misspelled or
# missing column raises KeyError instead of silently leaving a
# result-revealing column in the data. As a consequence this cell cannot be
# re-run on an already cleaned `merged_df`; re-run the merge cell first.

# Columns considered useless for training
useless_columns = [
    'lgID', 'deathDate',
    'firstseason', 'lastseason',
    'name', 'franchID',
    'divID', 'seeded',
    'tmORB', 'tmDRB',
    'tmTRB', 'opptmORB',
    'opptmDRB', 'opptmTRB',
    'attend', 'arena',
]

# Columns that reveal the result being predicted
result_revealing_columns = [
    'won_coaches', 'lost_coaches', 'post_wins', 'post_losses',  # coaches
    'rank',  # teams
    # 'W' and 'L' (teams_post) would belong here if that table were merged
]

# Columns with too many missing values
sparse_columns = ['collegeOther']

# Single drop: either every listed column is removed or, on a KeyError,
# `merged_df` is left untouched (no half-cleaned intermediate state).
merged_df = merged_df.drop(
    columns=useless_columns + result_revealing_columns + sparse_columns
)

merged_df.to_csv('merged_data.csv', index=False)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that appends a stray "None" line to the output).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2160 entries, 0 to 2159
Data columns (total 95 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                2160 non-null   int64  
 1   tmID                2160 non-null   object 
 2   confID              2160 non-null   object 
 3   playoff             2160 non-null   object 
 4   firstRound          1163 non-null   object 
 5   semis               547 non-null    object 
 6   finals              259 non-null    object 
 7   o_fgm               2160 non-null   int64  
 8   o_fga               2160 non-null   int64  
 9   o_ftm               2160 non-null   int64  
 10  o_fta               2160 non-null   int64  
 11  o_3pm               2160 non-null   int64  
 12  o_3pa               2160 non-null   int64  
 13  o_oreb              2160 non-null   int64  
 14  o_dreb              2160 non-null   int64  
 15  o_reb               2160 non-null   int64  
 16  o_asts