# Dataset Preprocessing

In this notebook we perform all the tasks related to data preprocessing. We start by checking for missing values and outliers and then, once they are detected, we delete them from our dataset. Finally, we convert the categorical columns into integer codes, because it is easier for our models to work with integers than with strings.

## Imports

In [None]:
import copy
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

## Utils

In [None]:
def print_information_about_dataset(df):
    """Print how many usable values each column has and how many rows are complete.

    One line is printed per column with its position, its name and the number
    of rows whose value in that column is not NaN. A closing line reports the
    number of rows that have no NaN in any column.
    """
    for position, column in enumerate(df.columns):
        has_value = df[column].notna()
        print(position, column, len(df[has_value]))

    # Rows that survive dropping every row with at least one NaN.
    complete_rows = df.dropna()
    print('From the dataset, after deleting the NaN values, we have found that we have only',
          len(complete_rows), 'rows remaining.')

In [None]:
def delete_from_dataframe(df, delete_keys):
    """Remove the named columns from ``df`` in place and return the same frame.

    Columns are removed one at a time in the order given, so a name that is
    not present raises ``KeyError`` after the earlier names have already been
    removed.
    """
    for column_name in delete_keys:
        del df[column_name]
    return df

In [None]:
def replace_string_to_int_dataframe(df, keys, input):
    """Encode string categories as integer codes, one column at a time.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in it.
        keys: Names of the columns to encode.
        input: One list of category labels per entry in ``keys``. Each label
            is replaced by its position in that list (0, 1, 2, ...).

    Returns:
        The same ``df`` object that was passed in.

    Values that do not appear in a column's label list are left unchanged.
    ``keys`` and ``input`` are paired with ``zip``, so surplus entries in the
    longer one are ignored.
    """
    for column, labels in zip(keys, input):
        codes = list(range(len(labels)))
        # Assign the result back instead of calling replace(..., inplace=True)
        # on df[column]: that form modifies a temporary Series and does not
        # reach df when pandas Copy-on-Write is active.
        # infer_objects() turns a fully encoded object column into a numeric
        # one on pandas versions where replace() no longer downcasts.
        df[column] = df[column].replace(labels, codes).infer_objects()
    return df

In [None]:
def dataframe_to_dictionary(df, key, continue_list=None):
    """Return column ``key`` of ``df`` as an ``{index label: value}`` dictionary.

    Args:
        df: DataFrame to read from.
        key: Name of the column to convert.
        continue_list: Index labels to leave out of the result. ``None``
            (the default) leaves nothing out.

    Returns:
        A dict mapping each remaining index label of ``df[key]`` to its value.
    """
    # The default None used to reach the `in` test and raise TypeError.
    excluded = () if continue_list is None else continue_list
    column = df[key]
    return {
        label: value
        for label, value in zip(column.keys(), column.values)
        if label not in excluded
    }

In [None]:
def make_plots(df, plot=0):
    """Plot the correlation matrix of ``df`` and the correlations with each score.

    Args:
        df: DataFrame whose ``corr()`` matrix contains the columns
            ``'home_team_score'`` and ``'away_team_score'``.
        plot: Which figure to draw.
            0 - one figure with three stacked panels: the correlation matrix,
                then the home-score and away-score correlation bar charts.
            1 - the correlation matrix alone, with column names and a colorbar.
            2 - the home-score correlation bar chart alone.
            3 - the away-score correlation bar chart alone.
            Any other value draws nothing before ``plt.show()`` is called.
    """
    corr_matrix = df.corr()
    score_columns = ['home_team_score', 'away_team_score']

    # Correlation of every other variable with each score; the two score
    # columns themselves are left out.
    home_team = dataframe_to_dictionary(corr_matrix, 'home_team_score', score_columns)
    away_team = dataframe_to_dictionary(corr_matrix, 'away_team_score', score_columns)

    # Ascending order, so bars are drawn from the lowest correlation upwards.
    home_team = dict(sorted(home_team.items(), key=lambda item: item[1]))
    away_team = dict(sorted(away_team.items(), key=lambda item: item[1]))

    if plot == 0:
        tick_positions = list(range(len(corr_matrix.columns)))

        fig, (ax0, ax1, ax2) = plt.subplots(3, 1)
        ax0.matshow(corr_matrix)
        ax0.set_title('Correlation Matrix', fontsize=8)
        # Put one tick on every cell before labelling; otherwise the labels are
        # attached to the automatically chosen ticks and end up misaligned.
        ax0.set_xticks(tick_positions)
        ax0.set_xticklabels(tick_positions, fontsize=5)
        ax0.set_yticks(tick_positions)
        ax0.set_yticklabels(list(corr_matrix.columns), fontsize=5)

        ax1.set_title('Home Team Variables Correlation', fontsize=8)
        ax1.barh(list(home_team.keys()), list(home_team.values()))
        # barh already labels the bars; only the font size needs changing.
        ax1.tick_params(axis='y', labelsize=5)

        ax2.set_title('Away Team Variables Correlation', fontsize=8)
        ax2.barh(list(away_team.keys()), list(away_team.values()))
        ax2.tick_params(axis='y', labelsize=5)

        fig.tight_layout()

    elif plot == 1:
        plt.matshow(corr_matrix)
        plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
        plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
        plt.colorbar()

    elif plot == 2:
        plt.title('Home Team Variables Correlation')
        plt.barh(list(home_team.keys()), list(home_team.values()))
        plt.yticks(list(home_team.keys()))

    elif plot == 3:
        plt.title('Away Team Variables Correlation')
        plt.barh(list(away_team.keys()), list(away_team.values()))
        plt.yticks(list(away_team.keys()))

    plt.show()

## Main

In [None]:
# Load the raw match data; the file uses ';' as its field separator.
raw_matches_path = '../../Data/international_matches.csv'
df = pd.read_csv(raw_matches_path, sep=';')
df.head()

In [None]:
# Show the non-NaN count of every column and how many rows are fully populated.
print_information_about_dataset(df)

In [None]:
# Keep only the rows in which every column has a value.
df_without_NaN = df.dropna(axis=0, how='any')
df_without_NaN.head()

In [None]:
# Columns removed before the next steps. The helper deletes columns in place,
# so it is given an independent copy and df_without_NaN stays untouched.
delete_keys = [
    'date', 'home_team', 'away_team', 'tournament',
    'city', 'country', 'neutral_location', 'home_team_result',
]
df_without_NaN_UnusedVars = delete_from_dataframe(df_without_NaN.copy(deep=True), delete_keys)
df_without_NaN_UnusedVars.head()

In [None]:
# Each label is encoded as its position in the list given for its column.
continents = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
replace_keys = ['home_team_continent', 'away_team_continent', 'shoot_out']
replace_input = [list(continents), list(continents), ['No', 'Yes']]

# Encode on an independent copy so the previous stage's frame is not modified.
df_without_NaN_UnusedVars_KeysReplaced = replace_string_to_int_dataframe(
    df_without_NaN_UnusedVars.copy(deep=True), replace_keys, replace_input
)

In [None]:
# plot=3 draws the bar chart of each variable's correlation with away_team_score.
make_plots(df_without_NaN_UnusedVars_KeysReplaced, 3)

In [92]:
# Write the cleaned, encoded frame to disk; index=False leaves the row index out of the file.
df_without_NaN_UnusedVars_KeysReplaced.to_csv('../../Data/international_matches_clean.csv', index=False)