# Dataset Preprocessing

In this notebook we perform all the tasks related to data preprocessing. We start by checking for missing values and outliers and, once they are detected, we delete them from our dataset. Finally, we transform the categorical columns into numeric values, since it is easier for our models to work with integers than with strings.

## Imports

In [None]:
import copy
import numpy as np
import pandas as pd
import seaborn as sn
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.decomposition import PCA 
from sklearn.preprocessing import MinMaxScaler

## Utils

In [None]:
def print_information_about_dataset(df):
    """Print per-column non-null counts and the number of fully populated rows.

    One line is printed per column: its position, its name and how many rows
    hold a non-NaN value in it. A closing sentence reports how many rows have
    no NaN in any column. Nothing is returned.
    """
    for position, column in enumerate(df.columns):
        populated_rows = df[df[column].notna()]
        print(position, column, len(populated_rows))

    # A row is complete when every one of its cells is non-null.
    complete_rows = df[df.notna().all(axis=1)]
    print('From the dataset, after deleting the NaN values, we have found that we have only', 
            len(complete_rows), 'rows remaining.')

In [None]:
def delete_from_dataframe(df, delete_keys):
    """Remove the columns named in ``delete_keys`` from ``df`` in place.

    The frame that is passed in is mutated and then returned. Columns are
    removed one at a time, so a ``KeyError`` for a missing column is raised
    only after the columns listed before it have already been removed.
    """
    for column in delete_keys:
        # pop() deletes the column from the frame; its return value is not needed.
        df.pop(column)
    return df

In [None]:
def replace_string_to_int_dataframe(df, keys, input):
    """Encode categorical columns of ``df`` as integer codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated and also returned.
    keys : sequence
        Names of the columns to encode.
    input : sequence of sequences
        Paired positionally with ``keys``: for each column, the category
        values to replace. The value at position ``i`` becomes the integer
        ``i``; values that are not listed are left untouched. (The parameter
        name shadows the ``input`` builtin but is kept so keyword callers
        keep working.)

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column, categories in zip(keys, input):
        codes = list(range(len(categories)))
        # Assign the result back to the column. Calling
        # `df[column].replace(..., inplace=True)` works on an intermediate
        # Series, which triggers chained-assignment warnings and leaves `df`
        # unchanged when pandas Copy-on-Write is active.
        # infer_objects() turns a fully replaced object column into a numeric one.
        df[column] = df[column].replace(categories, codes).infer_objects()
    return df

In [None]:
def dataframe_to_dictionary(df, key, continue_list=None):
    """Return column ``key`` of ``df`` as a ``{index label: value}`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    key : hashable
        Name of the column to convert.
    continue_list : container, optional
        Index labels to leave out of the result. ``None`` (the default)
        means nothing is left out.

    Returns
    -------
    dict
        One entry per index label of ``df[key]`` that is not in
        ``continue_list``, in the column's order.
    """
    # The default used to reach `label not in None` and raise TypeError.
    skipped = () if continue_list is None else continue_list
    column = df[key]
    return {
        label: value
        for label, value in zip(column.keys(), column.values)
        if label not in skipped
    }

In [None]:
def make_plots(df, plot=0, objective_vars=2):
    """Plot the correlation matrix of ``df`` and/or correlations with the target(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations (``df.corr()``) are plotted.
    plot : int, default 0
        * 0 -- stacked overview: correlation matrix plus one bar panel per
          target variable.
        * 1 -- correlation matrix only, with column names and a colorbar.
        * 2 -- bars of the first target's correlations, titled
          'Home Team Variables Correlation'.
        * 3 -- bars of the away-team correlations (needs ``objective_vars=2``).
        * 4 -- bars of the first target's correlations, titled
          'Result Variable Correlation'.

        Any other value draws nothing before ``plt.show()`` is called.
    objective_vars : int, default 2
        * 2 -- targets are 'home_team_score' and 'away_team_score'.
        * 1 -- the single target is 'result'.

    Raises
    ------
    ValueError
        If the requested ``plot`` needs a correlation series that is not
        computed for the given ``objective_vars``.
    """
    corr_matrix = df.corr()

    def _sorted_correlations(target, excluded):
        # Correlations of the non-excluded columns with `target`, ascending.
        correlations = dataframe_to_dictionary(corr_matrix, target, excluded)
        return dict(sorted(correlations.items(), key=lambda item: item[1]))

    def _require(series, requested):
        # Explicit error instead of the unbound-variable failure this used to be.
        if series is None:
            raise ValueError(
                f'{requested} is not available with objective_vars={objective_vars!r}'
            )
        return series

    home_team = away_team = None
    if objective_vars == 2:
        score_columns = ['home_team_score', 'away_team_score']
        home_team = _sorted_correlations('home_team_score', score_columns)
        away_team = _sorted_correlations('away_team_score', score_columns)
    elif objective_vars == 1:
        # With a single target the "home_team" slot holds the 'result' correlations.
        home_team = _sorted_correlations('result', ['result'])

    if plot == 0:
        # Title the first bar panel after what it really shows.
        first_title = ('Home Team Variables Correlation' if objective_vars == 2
                       else 'Result Variable Correlation')
        panels = [(first_title, _require(home_team, 'plot=0'))]
        if objective_vars == 2:
            panels.append(('Away Team Variables Correlation', away_team))

        fig, ax = plt.subplots(len(panels) + 1, 1)
        labels = list(corr_matrix.keys())
        positions = list(range(len(labels)))

        ax[0].matshow(corr_matrix)
        ax[0].set_title('Correlation Matrix', fontsize=8)
        # Fix one tick per column before labelling; otherwise the labels are
        # attached to whatever ticks the automatic locator picked.
        ax[0].set_xticks(positions)
        ax[0].set_xticklabels(positions, fontsize=5)
        ax[0].set_yticks(positions)
        ax[0].set_yticklabels(labels, fontsize=5)

        for axis, (title, series) in zip(ax[1:], panels):
            axis.set_title(title, fontsize=8)
            axis.barh(list(series.keys()), list(series.values()))
            # barh already labels each bar; only the font size is adjusted.
            axis.tick_params(axis='y', labelsize=5)

        fig.tight_layout()

    elif plot == 1:
        plt.matshow(corr_matrix)
        plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
        plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
        plt.colorbar()

    elif plot in (2, 3, 4):
        title, series = {
            2: ('Home Team Variables Correlation', home_team),
            3: ('Away Team Variables Correlation', away_team),
            4: ('Result Variable Correlation', home_team),
        }[plot]
        series = _require(series, f'plot={plot}')
        plt.title(title)
        plt.barh(list(series.keys()), list(series.values()))
        plt.yticks(list(series.keys()))

    plt.show()

In [None]:
def create_scaler_and_transform_data(scaler, data, transform_data=False):
    """Fit ``scaler`` on ``data`` and optionally transform ``data`` with it.

    Returns the fitted scaler alone when ``transform_data`` is falsy,
    otherwise the tuple ``(scaler, scaler.transform(data))``.
    """
    scaler.fit(data)
    if transform_data:
        transformed = scaler.transform(data)
        return scaler, transformed
    return scaler

## Main

In [None]:
# Load the raw matches table; the file uses ';' as its field separator.
df = pd.read_csv(
    '../../Data/international_matches.csv',
    sep=';',
)
df.head()

In [None]:
# Per-column non-null counts, plus how many rows have no NaN in any column.
print_information_about_dataset(df)

In [None]:
# Keep only the rows in which every column holds a value.
df_without_NaN = df.dropna(axis=0, how='any')
df_without_NaN.head()

In [None]:
# Columns to remove from the working frame.
delete_keys = [
    'date', 'home_team', 'away_team', 'tournament',
    'city', 'country', 'neutral_location', 'home_team_result',
]
# delete_from_dataframe mutates its argument, so hand it an independent copy.
df_without_NaN_UnusedVars = delete_from_dataframe(
    df_without_NaN.copy(deep=True),
    delete_keys,
)
df_without_NaN_UnusedVars.head()

In [None]:
# Each category is replaced by its position in the list below.
continents = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
replace_keys = ['home_team_continent', 'away_team_continent', 'shoot_out']
replace_input = [list(continents), list(continents), ['No', 'Yes']]

# The helper edits the frame it receives, so encode an independent copy.
df_without_NaN_UnusedVars_KeysReplaced = replace_string_to_int_dataframe(
    df_without_NaN_UnusedVars.copy(deep=True),
    replace_keys,
    replace_input,
)

In [None]:
# plot=1 draws only the correlation matrix, with column names and a colorbar.
make_plots(df_without_NaN_UnusedVars_KeysReplaced, 1)

## Estandarización de los datos

In [None]:
# Preview the frame after the categorical columns were replaced by integer codes.
df_without_NaN_UnusedVars_KeysReplaced.head()

In [None]:
# Split into features (X) and the two score columns used as targets (y).
target_columns = ['home_team_score', 'away_team_score']
feature_columns = df_without_NaN_UnusedVars_KeysReplaced.columns.difference(target_columns)

X = df_without_NaN_UnusedVars_KeysReplaced[feature_columns]
y = df_without_NaN_UnusedVars_KeysReplaced[target_columns]

In [None]:
# Fit a MinMaxScaler on the features; transform_data=True also returns the scaled values.
X_scaler, X_scaled = create_scaler_and_transform_data(MinMaxScaler(), X, transform_data=True)

In [None]:
# Wrap the scaled feature values in a DataFrame again, reusing the feature column names.
scaled_feature_names = df_without_NaN_UnusedVars_KeysReplaced.columns.difference(
    ['home_team_score', 'away_team_score']
)
df_without_NaN_UnusedVars_KeysReplaced_Scaled = pd.DataFrame(X_scaled, columns=scaled_feature_names)

# Re-attach the (unscaled) targets as plain lists so they are placed by position,
# not aligned on the index of y.
for score_column in ('home_team_score', 'away_team_score'):
    df_without_NaN_UnusedVars_KeysReplaced_Scaled[score_column] = y[score_column].to_list()

df_without_NaN_UnusedVars_KeysReplaced_Scaled.head()

## Modificación de la Variable Objetivo

In [None]:
# Collapse the two score columns into a single target: home score minus away score.
df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar = copy.deepcopy(df_without_NaN_UnusedVars_KeysReplaced_Scaled)

# Column-wise subtraction replaces the per-row `apply(axis=1)`, which ran a Python
# lambda once per row. The float cast keeps the dtype the row-wise version yields
# here (each row is upcast to float alongside the scaled features), so the saved
# CSV is unchanged.
df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar['result'] = (
    df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar['home_team_score']
    - df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar['away_team_score']
).astype(float)

# The individual scores are no longer needed once 'result' exists.
delete_keys = ['home_team_score','away_team_score']
df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar = delete_from_dataframe(df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar, delete_keys)

In [None]:
# Correlation matrix only (plot=1) for the frame whose single target is 'result'.
make_plots(df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar, 1, objective_vars=1)

## Reducción de la dimensionalidad

In [None]:
# PCA input: a separate frame holding every column except the 'result' target.
df_pca = df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar.drop(columns=['result'])

In [None]:
# Preview the PCA input frame.
df_pca.head()

In [None]:
# Fit PCA with its default settings, then report the share of variance explained
# by each component and the running total.
myPCA = PCA()
myPCA.fit(df_pca)

explained_ratio = myPCA.explained_variance_ratio_
print(explained_ratio)
print(np.cumsum(explained_ratio))

In [None]:
# Scree plot. The y axis is labelled 'Eigenvalues', so plot `explained_variance_`
# (the eigenvalues of the covariance matrix) rather than `singular_values_`,
# which are a different quantity.
eigenvalues = myPCA.explained_variance_
fig, scree_ax = plt.subplots(figsize=(8, 6))
scree_ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, alpha=0.8, marker='.')
y_label = scree_ax.set_ylabel('Eigenvalues')
x_label = scree_ax.set_xlabel('Componentes')
scree_ax.set_title('Scree plot');

In [None]:
# Share of variance explained by each component, together with its running total.
variance_ratio = myPCA.explained_variance_ratio_
component_numbers = range(1, len(variance_ratio) + 1)

fig, variance_ax = plt.subplots(figsize=(8, 3))
variance_ax.plot(component_numbers, variance_ratio, alpha=0.8, marker='.',
                 label="Variancia Explicada", c='tab:olive')
y_label = variance_ax.set_ylabel('Variancia explicada')
x_label = variance_ax.set_xlabel('Componentes')
variance_ax.plot(component_numbers, variance_ratio.cumsum(), marker='.',
                 label="Variancia Explicada Acumulativa", c='tab:purple')
variance_ax.legend()
variance_ax.set_title('Porcentaje de variancia explicada por componente');

In [None]:
# Heatmap of the fitted PCA components, with the input column names on the x axis.
loadings = myPCA.components_
colour_limit = np.abs(loadings).max()  # symmetric colour range around zero

fig, ax = plt.subplots(figsize=(15, 15))  # figsize is in inches
sn.heatmap(
    loadings,
    xticklabels=list(df_pca.columns),
    vmin=-colour_limit,
    vmax=colour_limit,
    annot=True,
    cmap='viridis',
    ax=ax,
);

In [None]:
# Project the data onto the principal components and store the first three as columns.
pc_columns = ['PC1', 'PC2', 'PC3']

# Leave out PC columns added by a previous run of this cell, so that re-running it
# still passes PCA exactly the columns it was fitted on.
pca_feature_columns = [column for column in df_pca.columns if column not in pc_columns]

pca_scores = myPCA.transform(df_pca[pca_feature_columns])
transformed_crabs = pca_scores  # previous variable name, kept as an alias
df_pca[pc_columns] = pca_scores[:, :3]

In [None]:
# First two principal components, coloured by the 'result' target.
result_values = df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar['result']

fig = plt.figure(figsize=(8, 8))
_ = sn.scatterplot(
    data=df_pca,
    x='PC1',
    y='PC2',
    hue=result_values,
    legend=True,
    palette='viridis',
)

In [None]:
# Interactive 3D view of the first three principal components.
marker_style = dict(
    # one colour value per point, taken from the 'result' target
    color=df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar['result'],
    colorscale='Viridis',
    opacity=0.8,
    colorbar=dict(thickness=20),
)
pca_points = go.Scatter3d(
    x=df_pca['PC1'],
    y=df_pca['PC2'],
    z=df_pca['PC3'],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[pca_points])
fig.show()

## Guardar Conjunto De Datos

In [None]:
# Save the frame with the scaled features and the single 'result' target;
# index=False keeps the row index out of the file.
df_without_NaN_UnusedVars_KeysReplaced_OneObjectiveVar.to_csv('../../Data/international_matches_clean.csv', index=False)