# Progetto di Web Intelligence

## Import libraries

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import random

## Definizione di varie funzioni utili per i prossimi blocchi

In [2]:
def invert_order(x: str):
    """Return ``x`` with its whitespace-separated words in reverse order.

    The words are re-joined with single spaces, so any run of whitespace in
    the input collapses to one space in the result.
    """
    return ' '.join(reversed(x.split()))

def remove_spaces(x: str):
    """Return ``x`` with every whitespace character removed.

    Args:
        x: The text to clean. Non-string values are converted with ``str()``
            first; the previous ``x.to_string(x)`` call threw its result away
            and raised ``AttributeError`` for plain Python values.

    Returns:
        The text with all whitespace (spaces, tabs, newlines) stripped out.
    """
    if not isinstance(x, str):
        x = str(x)
    return ''.join(x.split())

def remove_dots(x: str):
    """Return ``x`` with every ``.`` character removed."""
    pieces = x.split('.')
    return ''.join(pieces)

def remove_dotsandspaces(x: str):
    """Return ``x`` with both whitespace and ``.`` characters removed."""
    without_spaces = remove_spaces(x)
    return remove_dots(without_spaces)

## Import games datasets

In [3]:
# Paths of the yearly match datasets
dataset_2017 = './data/2017.csv'
dataset_2018 = './data/2018.csv'
dataset_2019 = './data/2019.csv'

# Read the datasets
df2017 = pd.read_csv(dataset_2017)
df2018 = pd.read_csv(dataset_2018)
df2019 = pd.read_csv(dataset_2019)

# Stack the three seasons; ignore_index=True builds a fresh 0..n-1 index
frames = [df2017, df2018, df2019]
df_full = pd.concat(frames, ignore_index=True)

# Drop the columns that are not used as features (scores, bookmaker odds, ...)
df = df_full.drop(['Comment', 'ATP', 'Location', 'Tournament', 'Date', 'Best of', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL'], axis=1)

# Drop the remaining unused columns
df.drop(['Series', 'Round', 'PSW', 'PSL', 'MaxW', 'MaxL', 'AvgW', 'AvgL', 'Wsets', 'Lsets'], axis=1, inplace=True)

# Strip spaces and dots from the player names. Mapping the whole column
# replaces the former row-by-row loop, which built a full row Series for
# every single cell it read.
for column in ('Winner', 'Loser'):
    df[column] = df[column].map(remove_dotsandspaces)

df

Unnamed: 0,Court,Surface,Winner,Loser,WRank,LRank,WPts,LPts
0,Outdoor,Hard,ThompsonJ,YmerE,79.0,160.0,689.0,372.0
1,Outdoor,Hard,MahutN,RobertS,39.0,54.0,1102.0,854.0
2,Outdoor,Hard,FerrerD,TomicB,21.0,26.0,1785.0,1465.0
3,Outdoor,Hard,EdmundK,EscobedoE,45.0,141.0,1001.0,443.0
4,Outdoor,Hard,DimitrovG,JohnsonS,17.0,33.0,2035.0,1320.0
...,...,...,...,...,...,...,...,...
7875,Indoor,Hard,NadalR,TsitsipasS,1.0,6.0,9585.0,4000.0
7876,Indoor,Hard,ZverevA,MedvedevD,7.0,4.0,2945.0,5705.0
7877,Indoor,Hard,TsitsipasS,FedererR,6.0,3.0,4000.0,6190.0
7878,Indoor,Hard,ThiemD,ZverevA,5.0,7.0,5025.0,2945.0


## Import Australian Open 2020 Players Dataset

In [4]:
# `pandas.io.json.json_normalize` is deprecated in favour of the top-level
# `pandas.json_normalize`. The old name stays bound so that any other cell
# still calling `json_normalize` keeps working.
json_normalize = pd.json_normalize

# Australian Open 2020 players (downloaded from https://ausopen.com/event/131981/players?_format=json)
playersAO2020 = './data/players.json'

# Read the dataset
playersAO = pd.read_json(playersAO2020)

## Fix the data issues and remove superfluous data

# Flatten the nested nationality JSON and keep only its code
nationalities = json_normalize(playersAO.nationality)['code']
playersAO['nationality']=nationalities

for i in range(0, playersAO.shape[0]):

    # Keep only the first ranking value of the nested rankings JSON
    ranking = json_normalize(playersAO.rankings[i])['ranking']
    playersAO.at[i, 'rankings']=ranking[0]

    first_name = playersAO.at[i, 'first_name']
    last_name = playersAO.at[i, 'last_name']

    # Rebuild short_name as "<last name> <first initial>"
    short_name = ' '.join([last_name, first_name[0:1]])

    # When there is a second given name, append its initial as well
    given_names = first_name.split()
    if len(given_names) > 1:
        short_name = short_name + given_names[1][0]

    # Strip spaces and dots from the name
    playersAO.at[i, 'short_name'] = remove_dotsandspaces(short_name)


def fix_name(pre_fix: str, post_fix: str):
    """Rename one player in the module-level ``playersAO`` table.

    The first row whose ``short_name`` equals ``pre_fix`` gets ``post_fix``
    as its new ``short_name``. Raises ``IndexError`` when no row matches,
    because the first element of an empty index is requested.
    """
    matching_rows = playersAO[playersAO.short_name == pre_fix].index
    first_match = matching_rows[0]
    playersAO.at[first_match, 'short_name'] = post_fix

# Short names that are spelled differently in the match datasets:
# (name in the players dataset, name used in the match datasets)
name_fixes = [
    ('McDonaldM', 'McdonaldM'),
    ('SmithJ', 'SmithJP'),
    ('StruffJ', 'StruffJL'),
    ('StebeC', 'StebeCM'),
    ('KwonS', 'KwonSW'),
    ('LuY', 'LuYH'),
    ('TsongaJ', 'TsongaJW'),
    ('HerbertP', 'HerbertPH'),
]
for old_name, new_name in name_fixes:
    fix_name(old_name, new_name)

# Drop every column that is not needed any more
unused_player_columns = [
    'player_id', 'uuid', 'nid', 'tour_id', 'birth_place', 'resident_of', 'coach', 'dob',
    'first_name', 'last_name', 'full_name', 'gender', 'hero_image', 'hero_image_240',
    'image', 'player_icon', 'events_contested', 'profile_link',
    'nationality', 'turned_pro', 'rankings', 'player_height', 'player_weight',
]
playersAO.drop(unused_player_columns, axis=1, inplace=True)

playersAO

  nationalities = json_normalize(playersAO.nationality)['code']
  ranking = json_normalize(playersAO.rankings[i])['ranking']


Unnamed: 0,short_name,career_loses,career_prize_money,career_titles,career_wins
0,AndersonK,228,16637503,6,333
1,BautistaAgutR,175,12968125,9,296
2,BerankisR,120,3282803,0,103
3,BasilashviliN,103,5479761,3,98
4,BedeneA,129,3903708,0,117
...,...,...,...,...,...
123,TrungellitiM,15,1010162,0,12
124,TabiloA,3,210307,0,3
125,VilellaMartinezM,1,243129,0,0
126,YmerM,25,693421,0,22


## Aggiornamento dataset giocatori

In [5]:
# Match filters behind each affinity feature, keyed by the output column.
# They do not depend on the player, so they are built once; the dict order is
# the order in which the columns are created.
affinity_filters = {
    'clay_affinity': df['Surface'] == 'Clay',
    'grass_affinity': df['Surface'] == 'Grass',
    'hard_affinity': df['Surface'] == 'Hard',
    'outdoor_affinity': df['Court'] == 'Outdoor',
    'indoor_affinity': df['Court'] == 'Indoor',
}

for i in range(0, playersAO.shape[0]):

    # Per-player masks over the match table, computed once per player
    player = playersAO.at[i, 'short_name']
    won = df['Winner'] == player
    lost = df['Loser'] == player
    played = won | lost

    # Wins and losses over the last 3 years. One extra match is added to each
    # so that no player ends up with a 0% win probability, which would hurt
    # the classifiers.
    wins = 1 + int(won.sum())
    losses = 1 + int(lost.sum())

    # Career wins (again with one extra win) and career matches played
    career_wins = 1 + playersAO.at[i, 'career_wins']
    career_games = career_wins + playersAO.at[i, 'career_loses']

    # Win rates for the last 3 years and for the whole career
    playersAO.at[i, 'winrate3y'] = wins/(wins + losses)
    playersAO.at[i, 'winrate'] = career_wins / career_games

    # Prize money per career match
    playersAO.at[i, 'prizerate'] = (playersAO.at[i, 'career_prize_money'] + 0.01) / career_games

    # Affinity with each surface and with outdoor/indoor courts: share of the
    # matches played under that condition that the player won. A player with
    # no match under a condition gets 0.5, i.e. no predisposition either way.
    for affinity_column, under_condition in affinity_filters.items():
        condition_wins = int((won & under_condition).sum())
        condition_games = int((played & under_condition).sum())
        playersAO.at[i, affinity_column] = condition_wins / condition_games if (condition_games != 0) else 0.5


# Players with a zero affinity (no win in any of their matches under that
# condition) get half of the lowest non-zero affinity of the column instead
for column in ['clay_affinity', 'grass_affinity', 'hard_affinity', 'outdoor_affinity', 'indoor_affinity']:
    playersAO[column] = playersAO[column].replace(0, np.nan)
    playersAO[column] = playersAO[column].replace(np.nan, playersAO[column].min() / 2)

# Drop the columns that are no longer needed
playersAO.drop(['career_wins', 'career_loses', 'career_titles'], axis=1, inplace=True)

# The prize money values do not look reliable, so they are not used
playersAO.drop(['prizerate', 'career_prize_money'], axis=1, inplace=True)

playersAO

Unnamed: 0,short_name,winrate3y,winrate,clay_affinity,grass_affinity,hard_affinity,outdoor_affinity,indoor_affinity
0,AndersonK,0.671533,0.594306,0.615385,0.684211,0.688889,0.678261,0.650000
1,BautistaAgutR,0.655738,0.629237,0.600000,0.750000,0.666667,0.668831,0.592593
2,BerankisR,0.375000,0.464286,0.100000,0.333333,0.400000,0.225806,0.565217
3,BasilashviliN,0.503185,0.490099,0.541667,0.312500,0.516484,0.496063,0.535714
4,BedeneA,0.495495,0.477733,0.581818,0.428571,0.400000,0.489130,0.529412
...,...,...,...,...,...,...,...,...
123,TrungellitiM,0.500000,0.464286,0.533333,0.500000,0.055556,0.461538,0.666667
124,TabiloA,0.500000,0.571429,0.500000,0.500000,0.500000,0.500000,0.500000
125,VilellaMartinezM,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000
126,YmerM,0.315789,0.479167,0.375000,0.500000,0.222222,0.307692,0.250000


## Trasformiamo le features categoriali

In [6]:
from sklearn import preprocessing

# One-hot encode the categorical columns: one new column per category, or a
# single column when the feature is binary (drop='if_binary')
enc = preprocessing.OneHotEncoder(drop='if_binary')
X = df[['Court', 'Surface']]
enc.fit(X)

# Encode the categorical columns and swap them for the generated ones
transformed = enc.transform(X).toarray()
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(), which may name the columns differently from the
# x0_/x1_ prefixes used further down - confirm before upgrading.
new_data = pd.DataFrame(transformed, columns=enc.get_feature_names())
df = pd.concat([df, new_data], axis=1).drop(['Court', 'Surface'], axis=1)

# Keep an independent snapshot of this dataset for later. A plain
# `df_base = df` would only alias the same object, so the in-place renaming,
# row dropping and player shuffling applied to `df` afterwards would also
# change `df_base`.
df_base = df.copy()

df

Unnamed: 0,Winner,Loser,WRank,LRank,WPts,LPts,x0_Outdoor,x1_Clay,x1_Grass,x1_Hard
0,ThompsonJ,YmerE,79.0,160.0,689.0,372.0,1.0,0.0,0.0,1.0
1,MahutN,RobertS,39.0,54.0,1102.0,854.0,1.0,0.0,0.0,1.0
2,FerrerD,TomicB,21.0,26.0,1785.0,1465.0,1.0,0.0,0.0,1.0
3,EdmundK,EscobedoE,45.0,141.0,1001.0,443.0,1.0,0.0,0.0,1.0
4,DimitrovG,JohnsonS,17.0,33.0,2035.0,1320.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
7875,NadalR,TsitsipasS,1.0,6.0,9585.0,4000.0,0.0,0.0,0.0,1.0
7876,ZverevA,MedvedevD,7.0,4.0,2945.0,5705.0,0.0,0.0,0.0,1.0
7877,TsitsipasS,FedererR,6.0,3.0,4000.0,6190.0,0.0,0.0,0.0,1.0
7878,ThiemD,ZverevA,5.0,7.0,5025.0,2945.0,0.0,0.0,0.0,1.0


## Preparazione del dataset

In [7]:
def anonymize_players(df):
    """Give the winner/loser columns neutral names and drop incomplete rows.

    Works in place on ``df`` and returns nothing:

    * ``Winner``/``Loser`` become ``P1``/``P2``, and the ``W``/``L`` prefixes
      of the rank and points columns become ``P1``/``P2``;
    * rows containing any missing value are removed;
    * the index is rebuilt as 0..n-1, so the shape and the index agree again.
    """
    neutral_names = {
        'Winner': 'P1',
        'Loser': 'P2',
        'WRank': 'P1Rank',
        'LRank': 'P2Rank',
        'WPts': 'P1Pts',
        'LPts': 'P2Pts',
    }
    df.rename(columns=neutral_names, inplace=True)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

# Rename the winner/loser columns to P1/P2 and drop incomplete rows, in place
anonymize_players(df)

def rnd_player_order(df):
    """Randomly swap the two players of each match and label the winner.

    Expects the winner in the ``P1*`` columns and the loser in the ``P2*``
    columns of every row. For each row a fair coin decides whether the two
    sides are exchanged. The ``Win`` column is then set to ``1.0`` when the
    winner is still player 1 and to ``2.0`` when the winner ended up as
    player 2. ``df`` is modified in place; nothing is returned.

    Calling this on an already shuffled frame overwrites ``Win`` as if P1
    were still the winner, so it must be applied only once per frame.

    The swap is done positionally on whole columns, so it no longer depends
    on the index being exactly 0..n-1 (the previous mix of ``iloc[i]`` reads
    and ``at[i]`` writes silently appended rows otherwise).
    """
    # One draw per row, in row order, so a seeded ``random`` reproduces the
    # same swaps a row-by-row loop would make.
    swap = np.array(
        [random.choice([True, False]) for _ in range(df.shape[0])],
        dtype=bool,
    )

    for first, second in (('P1', 'P2'), ('P1Rank', 'P2Rank'), ('P1Pts', 'P2Pts')):
        # Copies, so that overwriting the first column cannot affect the
        # values used to fill the second one.
        first_values = df[first].to_numpy(copy=True)
        second_values = df[second].to_numpy(copy=True)
        df[first] = np.where(swap, second_values, first_values)
        df[second] = np.where(swap, first_values, second_values)

    # Swapped rows now have the winner as player 2
    df['Win'] = np.where(swap, 2.0, 1.0)

# Randomly swap the two players of each match in place and record the winner in 'Win'
rnd_player_order(df)

df

Unnamed: 0,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,x0_Outdoor,x1_Clay,x1_Grass,x1_Hard,Win
0,YmerE,ThompsonJ,160.0,79.0,372.0,689.0,1.0,0.0,0.0,1.0,2.0
1,RobertS,MahutN,54.0,39.0,854.0,1102.0,1.0,0.0,0.0,1.0,2.0
2,TomicB,FerrerD,26.0,21.0,1465.0,1785.0,1.0,0.0,0.0,1.0,2.0
3,EdmundK,EscobedoE,45.0,141.0,1001.0,443.0,1.0,0.0,0.0,1.0,1.0
4,JohnsonS,DimitrovG,33.0,17.0,1320.0,2035.0,1.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
7851,TsitsipasS,NadalR,6.0,1.0,4000.0,9585.0,0.0,0.0,0.0,1.0,2.0
7852,MedvedevD,ZverevA,4.0,7.0,5705.0,2945.0,0.0,0.0,0.0,1.0,2.0
7853,FedererR,TsitsipasS,3.0,6.0,6190.0,4000.0,0.0,0.0,0.0,1.0,2.0
7854,ZverevA,ThiemD,7.0,5.0,2945.0,5025.0,0.0,0.0,0.0,1.0,2.0


## Unisco i due dataset

In [8]:
# Attach the player statistics twice: once for P1 and once for P2. The first
# merge adds the statistics columns unsuffixed; the second one collides with
# them, so pandas appends "_1" (P1) and "_2" (P2) to the overlapping names.
merge_options = dict(how='left', right_on=['short_name'], suffixes=("_1", "_2"), validate='m:1')
df = df.merge(playersAO, left_on=['P1'], **merge_options)
df = df.merge(playersAO, left_on=['P2'], **merge_options)

# The join keys duplicate P1 and P2
df = df.drop(['short_name_1', 'short_name_2'], axis=1)

In [9]:
# Drop the rows with missing data
df.dropna(inplace=True)

# Rebuild the index so that it matches the new shape
df.reset_index(drop=True, inplace=True)

# Keep, for each match, only the affinity with the surface it was played on
# and with its court type. np.select picks the first matching condition per
# row (the same precedence as an if/elif chain) and leaves NaN where none
# matches; this replaces the former cell-by-cell Python loop.
on_clay = df['x1_Clay'] == 1.0
on_grass = df['x1_Grass'] == 1.0
on_hard = df['x1_Hard'] == 1.0
is_outdoor = df['x0_Outdoor'] == 1.0
is_indoor = df['x0_Outdoor'] == 0.0

for side in ('1', '2'):
    df['terrain_affinity_' + side] = np.select(
        [on_clay, on_grass, on_hard],
        [df['clay_affinity_' + side], df['grass_affinity_' + side], df['hard_affinity_' + side]],
        default=np.nan,
    )

for side in ('1', '2'):
    df['iodoor_affinity_' + side] = np.select(
        [is_outdoor, is_indoor],
        [df['outdoor_affinity_' + side], df['indoor_affinity_' + side]],
        default=np.nan,
    )


# Drop the columns that are no longer needed
df = df.drop(['clay_affinity_1', 'clay_affinity_2', 'grass_affinity_1', 'grass_affinity_2', 'hard_affinity_1', 'hard_affinity_2', 'x1_Clay', 'x1_Grass', 'x1_Hard', 'indoor_affinity_1', 'indoor_affinity_2', 'outdoor_affinity_1', 'outdoor_affinity_2', 'x0_Outdoor'], axis = 1)

df

Unnamed: 0,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,Win,winrate3y_1,winrate_1,winrate3y_2,winrate_2,terrain_affinity_1,terrain_affinity_2,iodoor_affinity_1,iodoor_affinity_2
0,JohnsonS,DimitrovG,33.0,17.0,1320.0,2035.0,2.0,0.514493,0.503012,0.617834,0.602317,0.450000,0.669725,0.539062,0.586777
1,SchwartzmanD,QuerreyS,52.0,31.0,864.0,1355.0,1.0,0.583784,0.534965,0.574468,0.554235,0.578947,0.547368,0.582278,0.583333
2,SchwartzmanD,RaonicM,52.0,3.0,864.0,5450.0,2.0,0.583784,0.534965,0.646154,0.679612,0.578947,0.610390,0.582278,0.672269
3,EdmundK,WawrinkaS,45.0,4.0,1001.0,5315.0,2.0,0.532895,0.515419,0.611111,0.635152,0.547368,0.637500,0.514925,0.610000
4,DimitrovG,ThiemD,17.0,8.0,2035.0,3415.0,1.0,0.617834,0.602317,0.688679,0.652174,0.669725,0.638889,0.586777,0.699422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3561,TsitsipasS,NadalR,6.0,1.0,4000.0,9585.0,2.0,0.613497,0.637838,0.863874,0.832074,0.603774,0.828283,0.621622,0.636364
3562,MedvedevD,ZverevA,4.0,7.0,5705.0,2945.0,2.0,0.642105,0.636364,0.702326,0.659026,0.693431,0.680000,0.642857,0.588235
3563,FedererR,TsitsipasS,3.0,6.0,6190.0,4000.0,2.0,0.852459,0.821004,0.613497,0.637838,0.853846,0.603774,0.833333,0.621622
3564,ZverevA,ThiemD,7.0,5.0,2945.0,5025.0,2.0,0.702326,0.659026,0.688679,0.652174,0.680000,0.638889,0.588235,0.648649


## Unisco alcune colonne per generare features migliori

In [10]:
# Each delta feature is "player 1 value minus player 2 value"; the dict order
# is the order in which the new columns are appended
delta_sources = {
    'Rank_delta': ('P1Rank', 'P2Rank'),
    'Pts_delta': ('P1Pts', 'P2Pts'),
    'winrate3y_delta': ('winrate3y_1', 'winrate3y_2'),
    'winrate_delta': ('winrate_1', 'winrate_2'),
    'terrain_affinity_delta': ('terrain_affinity_1', 'terrain_affinity_2'),
    'iodoor_affinity_delta': ('iodoor_affinity_1', 'iodoor_affinity_2'),
}
for delta_column, (p1_column, p2_column) in delta_sources.items():
    df[delta_column] = df[p1_column] - df[p2_column]

# The per-player columns are now redundant
consumed_columns = [name for pair in delta_sources.values() for name in pair]
df = df.drop(consumed_columns, axis = 1)

df

Unnamed: 0,P1,P2,Win,Rank_delta,Pts_delta,winrate3y_delta,winrate_delta,terrain_affinity_delta,iodoor_affinity_delta
0,JohnsonS,DimitrovG,2.0,16.0,-715.0,-0.103342,-0.099305,-0.219725,-0.047714
1,SchwartzmanD,QuerreyS,1.0,21.0,-491.0,0.009316,-0.019270,0.031579,-0.001055
2,SchwartzmanD,RaonicM,2.0,49.0,-4586.0,-0.062370,-0.144647,-0.031442,-0.089990
3,EdmundK,WawrinkaS,2.0,41.0,-4314.0,-0.078216,-0.119733,-0.090132,-0.095075
4,DimitrovG,ThiemD,1.0,9.0,-1380.0,-0.070845,-0.049857,0.030836,-0.112645
...,...,...,...,...,...,...,...,...,...
3561,TsitsipasS,NadalR,2.0,5.0,-5585.0,-0.250377,-0.194236,-0.224509,-0.014742
3562,MedvedevD,ZverevA,2.0,-3.0,2760.0,-0.060220,-0.022662,0.013431,0.054622
3563,FedererR,TsitsipasS,2.0,-3.0,2190.0,0.238962,0.183166,0.250073,0.211712
3564,ZverevA,ThiemD,2.0,2.0,-2080.0,0.013646,0.006852,0.041111,-0.060413


## Importiamo tutte le librerie di base necessarie per i classificatori

In [11]:
%matplotlib inline

from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

## Prepariamo i dati per i classificatori

In [12]:
from sklearn.model_selection import train_test_split

# Dataset without the player names
df2 = df.drop(columns=['P1', 'P2'])
dataset = df2.astype(float).values
print("dataset shape", dataset.shape)

# Dataset without the winner column either: these are the features
df3 = df2.drop(columns=['Win'])
dataset2 = df3.astype(float).values
print("dataset2 shape", dataset2.shape)

# Despite its name, df_noWin holds only the winner column (the labels)
df_noWin = df2['Win']

# Features
X = dataset2
print("X shape", X.shape)

# Winner labels
y = df_noWin
print("y shape", y.shape)
print (y)

# Split the data into a train set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

dataset shape (3566, 7)
dataset2 shape (3566, 6)
X shape (3566, 6)
y shape (3566,)
0       2.0
1       1.0
2       2.0
3       2.0
4       1.0
       ... 
3561    2.0
3562    2.0
3563    2.0
3564    2.0
3565    2.0
Name: Win, Length: 3566, dtype: float64


## Uso un classificatore lineare: Perceptron

In [13]:
from sklearn.linear_model import Perceptron

# Build and train the model (fit returns the estimator itself)
p = Perceptron().fit(X_train, y_train)

# Predict the winners of the test set and measure the accuracy
y_pred = p.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Score:", p.score(X_test,y_test))
print("Accuracy:", acc)

Score: 0.6100254885301615
Accuracy: 0.6100254885301615


## Usiamo il classificatore k-Nearest-Neighbor

In [14]:
from sklearn import neighbors

def _best_knn_accuracy(X_train, y_train, X_test, y_test):
    """Return ``(k, accuracy)`` for the best k-NN classifier with k in 1..99.

    Ties keep the smallest ``k``; ``(0, 0)`` is returned when no ``k`` reaches
    an accuracy above zero.
    """
    best_acc = (0, 0)

    for k in range(1, 100):
        kNN = neighbors.KNeighborsClassifier(n_neighbors=k)

        # Train the model and score it on the test set
        kNN.fit(X_train, y_train)
        acc = accuracy_score(y_true=y_test, y_pred=kNN.predict(X_test))

        # Keep the best configuration seen so far
        if acc > best_acc[1]:
            best_acc = (k, acc)

    return best_acc


def knn(X_train, y_train, X_test, y_test):
    """Print the best k-NN accuracy without scaling and with three scalers.

    For the raw features and for MinMax, Standard and MaxAbs scaling in turn,
    every ``k`` from 1 to 99 is tried and the best ``k`` with its test
    accuracy is printed. Scalers are fitted on the training features only,
    and each feature set is transformed once per scaler rather than once per
    ``k``.

    NOTE(review): ``k`` is picked by scoring on the test set itself, so the
    printed accuracy is an optimistic estimate.
    """
    scalers = [
        ('Non uso nessuno scaler', None),
        ('Uso il MinMax Scaler', MinMaxScaler()),
        ('Uso il Standard Scaler', StandardScaler()),
        ('Uso il MaxAbs Scaler', MaxAbsScaler()),
    ]

    for banner, scaler in scalers:
        print(banner)

        if scaler is None:
            train_features, test_features = X_train, X_test
        else:
            # Fit the scaler on the training set only
            scaler.fit(X_train)
            train_features = scaler.transform(X_train)
            test_features = scaler.transform(X_test)

        best_acc = _best_knn_accuracy(train_features, y_train, test_features, y_test)
        print("La miglior accuracy la ottengo con k:", best_acc[0], "| Accuracy:", best_acc[1])


knn(X_train, y_train, X_test, y_test)

Non uso nessuno scaler
La miglior accuracy la ottengo con k: 92 | Accuracy: 0.6193712829226848
Uso il MinMax Scaler
La miglior accuracy la ottengo con k: 79 | Accuracy: 0.6873406966864911
Uso il Standard Scaler
La miglior accuracy la ottengo con k: 61 | Accuracy: 0.6881903143585386
Uso il MaxAbs Scaler
La miglior accuracy la ottengo con k: 81 | Accuracy: 0.6924384027187765


## Usiamo un albero di decisione

In [15]:
from sklearn.tree import DecisionTreeClassifier

def DecisionTree(X_train, y_train, X_test, y_test):
    """Search the leaf budget (2..99) of a decision tree by test accuracy.

    Prints and returns ``(max_leaf_nodes, accuracy)`` of the best tree; ties
    keep the smallest leaf budget.
    """
    best_leaves, best_score = 0, 0

    for leaf_limit in range(2, 100):
        # Train a tree with this leaf budget and score it on the test set
        tree = DecisionTreeClassifier(max_leaf_nodes=leaf_limit)
        tree.fit(X_train, y_train)
        score = accuracy_score(y_true=y_test, y_pred=tree.predict(X_test))

        # Remember the best configuration seen so far
        if score > best_score:
            best_leaves, best_score = leaf_limit, score

    best_acc = (best_leaves, best_score)
    print("La miglior accuracy la ottengo con numero foglie:", best_acc[0], "| Accuracy:", best_acc[1])
    return best_acc


best_acc = DecisionTree(X_train, y_train, X_test, y_test)

La miglior accuracy la ottengo con numero foglie: 7 | Accuracy: 0.6915887850467289


In [16]:
def check_DecisionTree(best_acc):
    """Refit the selected tree and print its feature importances.

    ``best_acc[0]`` is used as ``max_leaf_nodes``. The tree is trained on the
    module-level ``X_train`` and ``y_train``.
    """
    leaf_limit = best_acc[0]
    tree = DecisionTreeClassifier(max_leaf_nodes=leaf_limit).fit(X_train, y_train)

    # One importance per feature, in the column order printed below
    print(tree.feature_importances_)


check_DecisionTree(best_acc)
print(df3.columns.values)

[0.         0.         0.         0.         0.89528366 0.10471634]
['Rank_delta' 'Pts_delta' 'winrate3y_delta' 'winrate_delta'
 'terrain_affinity_delta' 'iodoor_affinity_delta']


## Proviamo ad applicare il bagging a questo albero

In [17]:
from sklearn.ensemble import BaggingClassifier


def Bagging(X_train, y_train, X_test, y_test):
    """Grid-search bagged decision trees by test accuracy.

    Tries every leaf budget in 2..19 combined with every number of estimators
    in 1..19, prints the best combination and returns it as
    ``(max_leaf_nodes, n_estimators, accuracy)``, like the other search
    functions of this notebook do (it used to return nothing).
    """
    best_acc = (0, 0, 0)

    for leaf_limit in range(2, 20):
        for n_trees in range(1, 20):
            dtc = DecisionTreeClassifier(max_leaf_nodes=leaf_limit)
            bagged_dtc = BaggingClassifier(dtc, n_estimators=n_trees)

            # Train the ensemble and score it on the test set
            bagged_dtc.fit(X_train, y_train)
            y_pred = bagged_dtc.predict(X_test)
            acc = accuracy_score(y_true=y_test, y_pred=y_pred)

            # Keep the best configuration seen so far
            if acc > best_acc[2]:
                best_acc = (leaf_limit, n_trees, acc)

    print("La miglior accuracy la ottengo con numero foglie:", best_acc[0],", numero di estimatori:", best_acc[1], "| Accuracy:", best_acc[2])
    return best_acc


# Stored under its own name so that the decision tree's best_acc is left
# untouched and the cell does not echo the returned tuple
bagging_best_acc = Bagging(X_train, y_train, X_test, y_test)

La miglior accuracy la ottengo con numero foglie: 15 , numero di estimatori: 10 | Accuracy: 0.7000849617672048


## Proviamo ad applicare il boosting a questo albero

In [18]:
from sklearn.ensemble import AdaBoostClassifier

def Boosting(X_train, y_train, X_test, y_test):
    """Grid-search AdaBoost over decision trees by test accuracy.

    Tries every leaf budget in 2..19 combined with every number of estimators
    in 1..19. Prints and returns the best combination as
    ``(max_leaf_nodes, n_estimators, accuracy)``.
    """
    best_leaves, best_rounds, best_score = 0, 0, 0

    for leaf_limit in range(2, 20):
        for n_rounds in range(1, 20):
            base_tree = DecisionTreeClassifier(max_leaf_nodes=leaf_limit)
            booster = AdaBoostClassifier(base_tree, n_estimators=n_rounds)

            # Train the ensemble and score it on the test set
            booster.fit(X_train, y_train)
            score = accuracy_score(y_true=y_test, y_pred=booster.predict(X_test))

            # Remember the best configuration seen so far
            if score > best_score:
                best_leaves, best_rounds, best_score = leaf_limit, n_rounds, score

    best_acc = (best_leaves, best_rounds, best_score)
    print("La miglior accuracy la ottengo con numero foglie:", best_acc[0],", numero di estimatori:", best_acc[1], "| Accuracy:", best_acc[2])
    return best_acc


best_acc = Boosting(X_train, y_train, X_test, y_test)

La miglior accuracy la ottengo con numero foglie: 4 , numero di estimatori: 4 | Accuracy: 0.6924384027187765


In [19]:
def check_Boosting(best_acc, X_train, y_train, X_test, y_test):
    """Refit the selected AdaBoost model and print its feature importances.

    Args:
        best_acc: ``(max_leaf_nodes, n_estimators, ...)`` as returned by
            ``Boosting``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Accepted for signature compatibility; not used, since
            only the fitted importances are printed (the former test-set
            prediction was computed and then discarded).
    """
    dtc = DecisionTreeClassifier(max_leaf_nodes=best_acc[0])
    boosted_dtc = AdaBoostClassifier(dtc, n_estimators=best_acc[1])

    # Train the model
    boosted_dtc.fit(X_train, y_train)

    # One importance per feature, in the column order printed below
    print(boosted_dtc.feature_importances_)


check_Boosting(best_acc, X_train, y_train, X_test, y_test)
print(df3.columns.values)

[0.         0.         0.10048461 0.05452318 0.69999386 0.14499835]
['Rank_delta' 'Pts_delta' 'winrate3y_delta' 'winrate_delta'
 'terrain_affinity_delta' 'iodoor_affinity_delta']


## Proviamo ora con la random forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

def RandomForest(X_train, y_train, X_test, y_test):
    """Search the number of trees (1..99) of a random forest by test accuracy.

    Prints and returns ``(n_estimators, accuracy)`` of the best forest; ties
    keep the smallest forest.
    """
    best_trees, best_score = 0, 0

    for n_trees in range(1, 100):
        # Train a forest of this size and score it on the test set
        forest = RandomForestClassifier(n_estimators=n_trees)
        forest.fit(X_train, y_train)
        score = accuracy_score(y_true=y_test, y_pred=forest.predict(X_test))

        # Remember the best configuration seen so far
        if score > best_score:
            best_trees, best_score = n_trees, score

    best_acc = (best_trees, best_score)
    print("La miglior accuracy la ottengo con numero di estimatori:", best_acc[0], "| Accuracy:", best_acc[1])
    return best_acc


best_acc = RandomForest(X_train, y_train, X_test, y_test)

La miglior accuracy la ottengo con numero di estimatori: 75 | Accuracy: 0.6737468139337298


In [21]:
def check_RandomForest(best_acc):
    """Refit the selected random forest and print its feature importances.

    ``best_acc[0]`` is used as ``n_estimators``. The forest is trained on the
    module-level ``X_train`` and ``y_train``.
    """
    n_trees = best_acc[0]
    forest = RandomForestClassifier(n_estimators=n_trees).fit(X_train, y_train)

    # One importance per feature, in the column order printed below
    print(forest.feature_importances_)


check_RandomForest(best_acc)
print(df3.columns.values)

[0.13102537 0.14207641 0.16062084 0.14720441 0.22562426 0.19344872]
['Rank_delta' 'Pts_delta' 'winrate3y_delta' 'winrate_delta'
 'terrain_affinity_delta' 'iodoor_affinity_delta']


## L'accuratezza che riusciamo a raggiungere al momento è al massimo del 70%: meglio di una scelta casuale, ma comunque non eccezionale

## Provo ad usare solo il dataset di base aggiungendo le features considerando la data della partita

In [None]:

# Re-attach the match dates, so that the features can be computed from past
# matches only instead of from the whole dataframe.
# NOTE(review): assigning a Series aligns it on the index, so every row only
# gets its own date if df_base still carries df_full's row labels - verify
# that df_base has not been filtered and re-indexed before this point.
df_base['Date'] = pd.to_datetime(df_full['Date'])

# # Sort the dataset by date
# # (as written this returns a sorted copy and would not change df_base)
# df_base.sort_values('Date')

def update_data(df_base):
    """Build per-match delta features from each player's history before the match.

    ``anonymize_players`` and ``rnd_player_order`` are first called on the frame
    (for their side effects; their return values are not used). For every row,
    the matches of both players dated strictly before the row's ``Date`` are
    used to fill:

    * ``winrate_delta``: difference of the past win rates;
    * ``terrain_affinity_delta``: difference of the past win rates on the
      surface flagged for the match (``x1_Clay`` / ``x1_Grass`` / ``x1_Hard``);
    * ``iodoor_affinity_delta``: difference of the past win rates on the same
      court type (``x0_Outdoor`` equal to 1.0 or not), with add-one smoothing.

    ``Rank_delta`` and ``Pts_delta`` are computed directly from the rank and
    points columns. A player with no past matches gets a neutral rate of 0.5.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Match table. It is modified in place (feature columns are added). Rows
        are addressed by the labels ``0..n-1``, so a default integer index is
        assumed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rank, points, surface, court and date columns.
    """
    # Anonymize the dataset and randomize the player order
    anonymize_players(df_base)
    rnd_player_order(df_base)

    # Rank and points deltas
    df_base['Rank_delta'] = df_base['P1Rank'] - df_base['P2Rank']
    df_base['Pts_delta'] = df_base['P1Pts'] - df_base['P2Pts']

    # These columns are not modified below, so the comparisons against 'Win'
    # are evaluated once instead of once per lookup.
    p1_names = df_base['P1']
    p2_names = df_base['P2']
    match_dates = df_base['Date']
    p1_won = df_base['Win'] == 1.0
    p2_won = df_base['Win'] == 2.0

    def _past_record(player, date, flag_column=None):
        """Return (wins, games) of `player` strictly before `date`.

        When `flag_column` is given, only rows where that column equals 1.0
        are counted. Each call scans the whole frame.
        """
        as_p1 = p1_names == player
        as_p2 = p2_names == player
        played = (as_p1 | as_p2) & (match_dates < date)
        if flag_column is not None:
            played = played & (df_base[flag_column] == 1.0)
        won = played & ((as_p1 & p1_won) | (as_p2 & p2_won))
        return int(won.sum()), int(played.sum())

    def _rate(wins, games):
        """Win rate, or the neutral value 0.5 when there is no history."""
        return (wins / games) if games != 0 else 0.5

    def _smoothed_rate(wins, games):
        """Win rate with add-one smoothing; the denominator is never zero."""
        return (wins + 1) / (games + 1)

    for i in range(df_base.shape[0]):
        # Gather the match information
        match_date = df_base.at[i, 'Date']
        player_1 = df_base.at[i, 'P1']
        player_2 = df_base.at[i, 'P2']

        # Overall win-rate delta
        wins_1, games_1 = _past_record(player_1, match_date)
        wins_2, games_2 = _past_record(player_2, match_date)
        df_base.at[i, 'winrate_delta'] = _rate(wins_1, games_1) - _rate(wins_2, games_2)

        # Surface affinity delta, computed for the surface flagged on this row
        for terreno in ['x1_Clay', 'x1_Grass', 'x1_Hard']:
            if df_base.at[i, terreno] == 1.0:
                terr_wins_1, terr_games_1 = _past_record(player_1, match_date, terreno)
                terr_wins_2, terr_games_2 = _past_record(player_2, match_date, terreno)
                affinity_1 = _rate(terr_wins_1, terr_games_1)
                affinity_2 = _rate(terr_wins_2, terr_games_2)
                df_base.at[i, 'terrain_affinity_delta'] = affinity_1 - affinity_2

        # Outdoor / indoor affinity delta. Raw outdoor counts are kept
        # unsmoothed so that the indoor counts can be derived as
        # total - outdoor; the smoothing is then applied once, identically,
        # for both court types.
        out_wins_1, out_games_1 = _past_record(player_1, match_date, 'x0_Outdoor')
        out_wins_2, out_games_2 = _past_record(player_2, match_date, 'x0_Outdoor')
        if df_base.at[i, 'x0_Outdoor'] == 1.0:
            court_wins_1, court_games_1 = out_wins_1, out_games_1
            court_wins_2, court_games_2 = out_wins_2, out_games_2
        else:
            court_wins_1, court_games_1 = wins_1 - out_wins_1, games_1 - out_games_1
            court_wins_2, court_games_2 = wins_2 - out_wins_2, games_2 - out_games_2
        df_base.at[i, 'iodoor_affinity_delta'] = (
            _smoothed_rate(court_wins_1, court_games_1)
            - _smoothed_rate(court_wins_2, court_games_2)
        )

    # Drop the columns that are no longer needed
    return df_base.drop(['P1Rank', 'P2Rank', 'P1Pts', 'P2Pts', 'x1_Clay', 'x1_Grass', 'x1_Hard', 'x0_Outdoor', 'Date'], axis = 1)

# Update the data: df_base is replaced by the frame returned by update_data
df_base = update_data(df_base)

df_base

## Proviamo a vedere se i classificatori danno risultati migliori

In [None]:
# Create a dataset without the player names
df_base_noname = df_base.drop(['P1', 'P2'], axis=1)

# Create the feature table by removing the winner column
features = df_base_noname.drop(['Win'], axis=1)
dataset_v2 = features.astype(float).values
print("dataset2 shape", dataset_v2.shape)

# Keep only the winner column as the label
df_Win = df_base_noname['Win']

# Get features
X = dataset_v2
print("X shape", X.shape)

# Get win label
y = df_Win
print("y shape", y.shape)
print (y)

# Split the dataset into test set and train set (33% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## kNN

In [None]:
# Run the knn helper (defined earlier in the notebook) on the new split
knn(X_train, y_train, X_test, y_test)

## Applico il MinMax Scaler

In [None]:
print('Uso il MinMax Scaler')
# Fit the scaler on the training features only, then apply the same scaling to both sets
scaler = MinMaxScaler()
scaler.fit(X_train)

# X_train and X_test are overwritten with their scaled versions
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Provo la random forest

In [None]:
# Search the best number of estimators on the current split, then print the feature
# importances and the feature column names.
# NOTE: check_RandomForest trains on the notebook-level X_train / y_train.
best_acc = RandomForest(X_train, y_train, X_test, y_test)
check_RandomForest(best_acc)
print(features.columns.values)

## Provo il boosting

In [None]:
# Run the Boosting helper and its check_Boosting companion (both defined earlier in
# the notebook) on the current split, then print the feature column names
best_acc = Boosting(X_train, y_train, X_test, y_test)
check_Boosting(best_acc, X_train, y_train, X_test, y_test)    
print(features.columns.values)

## Provo un normale decision tree

In [None]:
# Run the DecisionTree helper and its check_DecisionTree companion (both defined
# earlier in the notebook) on the current split, then print the feature column names
best_acc = DecisionTree(X_train, y_train, X_test, y_test)
check_DecisionTree(best_acc)
print(features.columns.values)

## I classificatori non sembrano dare risultati migliori: la precisione si attesta sempre attorno al 70% nel migliore dei casi.

## Importo Dataset 2020

In [None]:
dataset_2020 = './data/2020.xlsx'

df2020 = pd.read_excel(dataset_2020)

# Keep only the first-round matches of the Australian Open: every other row is dropped
df2020.drop(df2020[(df2020.Tournament != 'Australian Open') | (df2020.Round != '1st Round') ].index, inplace=True)

# Drop the columns that are not needed
df2020.drop(['Comment', 'ATP', 'Location', 'Tournament', 'Best of', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'B365W', 'B365L'], axis=1, inplace=True)

# Drop further columns
df2020.drop(['Series', 'Round', 'PSW', 'PSL', 'MaxW', 'MaxL', 'AvgW', 'AvgL', 'Wsets', 'Lsets'], axis=1, inplace=True)

# Renumber the rows from 0 so that the label-based loop below can address them
df2020.reset_index(drop=True, inplace=True)

# Remove spaces and dots from the player names
for i in range(0, df2020.shape[0]):
    df2020.at[i, 'Winner'] = remove_dotsandspaces(df2020.loc[i]['Winner'])
    df2020.at[i, 'Loser'] = remove_dotsandspaces(df2020.loc[i]['Loser'])

# Convert the categorical features
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder(drop='if_binary')
# Fit on an original dataframe to get the same surface and court types as the training set.
# NOTE(review): only df2019 is used here — confirm it contains every category seen in training.
# NOTE: this rebinds the notebook-level name X.
X = df2019[['Court', 'Surface']]
enc.fit(X)
tranformed = enc.transform(df2020[['Court', 'Surface']]).toarray()
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), whose names may differ from the x0_/x1_ prefixes used by
# later cells — confirm before upgrading.
new_data = pd.DataFrame(tranformed, columns=enc.get_feature_names())
df2020 = pd.concat([df2020, new_data], axis=1).drop(['Court', 'Surface'], axis=1)

# Anonymize the dataset
anonymize_players(df2020)
rnd_player_order(df2020)

## Unisco i due dataset

In [None]:
# Attach the per-player statistics from playersAO twice: first matching P1, then P2.
# validate='m:1' makes pandas check that short_name is unique in playersAO.
# In the second merge the columns already present from the first one overlap with
# playersAO, so they get the suffix "_1" (P1) and the new ones "_2" (P2); the two
# short_name key columns are dropped afterwards.
# NOTE(review): assumes playersAO shares no column name with df2020 before the first merge.
df2020 = df2020.merge(playersAO, how='left', left_on=['P1'], right_on=['short_name'], suffixes=("_1", "_2"), validate='m:1')
df2020 = df2020.merge(playersAO, how='left', left_on=['P2'], right_on=['short_name'], suffixes=("_1", "_2"), validate='m:1').drop(['short_name_1', 'short_name_2'], axis=1)

## Manteniamo solo le affinity corrette

In [None]:
# Refresh the index (and hence the row labels used by the loop below)
df2020.reset_index(drop=True, inplace=True)

# Keep only the affinity for the surface and the court type of the current match
for i in range(0, df2020.shape[0]):
    # Surface: copy the affinity matching the one-hot flag set on this row
    if (df2020.at[i, 'x1_Clay'] == 1.0):
        df2020.at[i, 'terrain_affinity_1'] = df2020.at[i, 'clay_affinity_1']
        df2020.at[i, 'terrain_affinity_2'] = df2020.at[i, 'clay_affinity_2']
    elif (df2020.at[i, 'x1_Grass'] == 1.0):
        df2020.at[i, 'terrain_affinity_1'] = df2020.at[i, 'grass_affinity_1']
        df2020.at[i, 'terrain_affinity_2'] = df2020.at[i, 'grass_affinity_2']
    elif (df2020.at[i, 'x1_Hard'] == 1.0):
        df2020.at[i, 'terrain_affinity_1'] = df2020.at[i, 'hard_affinity_1']
        df2020.at[i, 'terrain_affinity_2'] = df2020.at[i, 'hard_affinity_2']

    # Court type: x0_Outdoor == 1.0 selects the outdoor affinity, 0.0 the indoor one
    if (df2020.at[i, 'x0_Outdoor'] == 1.0):
        df2020.at[i, 'iodoor_affinity_1'] = df2020.at[i, 'outdoor_affinity_1']
        df2020.at[i, 'iodoor_affinity_2'] = df2020.at[i, 'outdoor_affinity_2']
    elif (df2020.at[i, 'x0_Outdoor'] == 0.0):
        df2020.at[i, 'iodoor_affinity_1'] = df2020.at[i, 'indoor_affinity_1']
        df2020.at[i, 'iodoor_affinity_2'] = df2020.at[i, 'indoor_affinity_2']


# Drop the columns that are no longer needed
df2020 = df2020.drop(['clay_affinity_1', 'clay_affinity_2', 'grass_affinity_1', 'grass_affinity_2', 'hard_affinity_1', 'hard_affinity_2', 'indoor_affinity_1', 'indoor_affinity_2', 'outdoor_affinity_1', 'outdoor_affinity_2'], axis = 1)

## Calcoliamo i delta

In [None]:
from IPython.display import display, HTML
# Show the full table as HTML
display(HTML(df2020.to_html()))

# Difference between the first player's data and the second player's
df2020['Rank_delta'] = df2020['P1Rank'] - df2020['P2Rank']
df2020['Pts_delta'] = df2020['P1Pts'] - df2020['P2Pts']
df2020['winrate3y_delta'] = df2020['winrate3y_1'] - df2020['winrate3y_2']
# NOTE: this column is named 'win_rate_delta' here, while the training columns printed
# earlier use 'winrate_delta'; later cells rely on this spelling.
df2020['win_rate_delta'] = df2020['winrate_1'] - df2020['winrate_2']
df2020['terrain_affinity_delta'] = df2020['terrain_affinity_1'] - df2020['terrain_affinity_2']
df2020['iodoor_affinity_delta'] = df2020['iodoor_affinity_1'] - df2020['iodoor_affinity_2']

# Drop the columns that are not needed for now
df2020 = df2020.drop(['x0_Outdoor',	'x1_Clay',	'x1_Grass',	'x1_Hard', 'Date'], axis = 1)

## Riordiniamo le partite per semplificare le operazioni successive

In [None]:
# Reorder the rows so that they are consistent with the matches actually played in the tournament
c = df2020.copy(deep=True)

# positions[i] is the row of the copy that is moved to position i
positions = [37,55,46,36,51,63,38,27,50,61,35,32,53,60,42,28,49,59,52,44,58,56,43,31,54,48,47,45,40,41,62,57,0,30,1,5,4,6,34,21,7,22,26,9,15,17,33,8,10,11,29,25,18,13,39,20,19,14,24,23,2,3,16,12]

# The loop is hard-coded to 64 rows; the index labels of df2020 are left unchanged
for i in range(0,64):
    df2020.iloc[i] = c.iloc[positions[i]]

df2020

## Usiamo il classificatore migliore trovato fino ad ora

## Iniziamo a predire i risultati

In [None]:
print('Uso il MinMax Scaler e identifico ')

# Fit the MinMax scaler on dataset2 (defined in an earlier cell;
# presumably the training feature matrix — confirm)
scaler = MinMaxScaler()
scaler.fit(dataset2)

# Build the kNN model on the scaled dataset2 with df_noWin as labels
# (n_neighbors=67 is hard-coded; presumably the best k found earlier — confirm)
kNN = neighbors.KNeighborsClassifier(n_neighbors=67)
kNN.fit(scaler.transform(dataset2),df_noWin)


# Remove every column that is not needed to predict the winner
to_analyze = df2020.drop(['P1', 'P2', 'Win', 'winrate3y_1', 'winrate_1', 'winrate3y_2', 'winrate_2', 'terrain_affinity_1', 'terrain_affinity_2', 'iodoor_affinity_1', 'iodoor_affinity_2', 'P1Rank', 'P2Rank', 'P1Pts', 'P2Pts'], axis=1)
df_pred = to_analyze.astype(float).values

# Run the model on the scaled 2020 features
y_pred = kNN.predict( scaler.transform(df_pred) )

print("I vincitori sono:", y_pred )

In [None]:
# Once the winners of a round are predicted, copy each winner's own data (not the
# deltas) and pair the winners of consecutive matches to simulate the next round,
# until the final is reached.
df2020_tmp = df2020.drop(['Win'], axis = 1)
j = df2020_tmp.shape[0]
y_pred_tmp = y_pred

# Per-player statistics, in the order in which their deltas are fed to the classifier
_PLAYER_STATS = ['Rank', 'Pts', 'winrate3y', 'winrate', 'terrain_affinity', 'iodoor_affinity']

# Column layout of every simulated round
_ROUND_COLUMNS = ['P1','P2', 'P1Rank','P2Rank','P1Pts','P2Pts','winrate3y_1','winrate_1','winrate3y_2','winrate_2','terrain_affinity_1','terrain_affinity_2','iodoor_affinity_1','iodoor_affinity_2','Rank_delta','Pts_delta','winrate3y_delta','win_rate_delta','terrain_affinity_delta','iodoor_affinity_delta']

# Columns removed before prediction, leaving only the six delta features
_NON_FEATURE_COLUMNS = ['P1', 'P2', 'winrate3y_1', 'winrate_1', 'winrate3y_2', 'winrate_2', 'terrain_affinity_1', 'terrain_affinity_2', 'iodoor_affinity_1', 'iodoor_affinity_2', 'P1Rank', 'P2Rank', 'P1Pts', 'P2Pts']


def _winner_profile(frame, row, label):
    """Return the name and statistics of the predicted winner of match `row`.

    `label` is the predicted class: 1.0 selects the P1 columns, 2.0 the P2
    columns. Any other value raises ValueError, because silently skipping a
    match would misalign the rest of the bracket.
    """
    if label == 1.0:
        tag = '1'
    elif label == 2.0:
        tag = '2'
    else:
        raise ValueError("Unexpected predicted label: {!r}".format(label))
    return {
        'name': frame.at[row, 'P' + tag],
        'Rank': frame.at[row, 'P' + tag + 'Rank'],
        'Pts': frame.at[row, 'P' + tag + 'Pts'],
        'winrate3y': frame.at[row, 'winrate3y_' + tag],
        'winrate': frame.at[row, 'winrate_' + tag],
        'terrain_affinity': frame.at[row, 'terrain_affinity_' + tag],
        'iodoor_affinity': frame.at[row, 'iodoor_affinity_' + tag],
    }


# The training data does not change between rounds, so the scaler and the model
# are fitted once, before the loop
scaler = MinMaxScaler()
scaler.fit(dataset2)
kNN = neighbors.KNeighborsClassifier(n_neighbors=67)
kNN.fit(scaler.transform(dataset2),df_noWin)

while j > 1:
    y_p = y_pred_tmp.tolist()
    rows_list = []
    for i in range(0, df2020_tmp.shape[0], 2):
        # The winners of matches i and i+1 meet in the next round. All values are
        # read from the current round's frame.
        first = _winner_profile(df2020_tmp, i, y_p[i])
        second = _winner_profile(df2020_tmp, i + 1, y_p[i + 1])
        rows_list.append(
            [first['name'], second['name'],
             first['Rank'], second['Rank'],
             first['Pts'], second['Pts'],
             first['winrate3y'], first['winrate'],
             second['winrate3y'], second['winrate'],
             first['terrain_affinity'], second['terrain_affinity'],
             first['iodoor_affinity'], second['iodoor_affinity']]
            + [float(first[stat]) - float(second[stat]) for stat in _PLAYER_STATS]
        )

    # Built from the list of rows so that numeric columns keep a numeric dtype
    df2 = pd.DataFrame(rows_list, columns=_ROUND_COLUMNS)
    display(HTML(df2.to_html()))
    df2020_tmp = df2

    # Predict the winners of the new round from the delta features only
    to_analyze = df2020_tmp.drop(_NON_FEATURE_COLUMNS, axis=1)
    df_pred = to_analyze.astype(float).values
    y_pred_tmp = kNN.predict( scaler.transform(df_pred))

    # Print the winners of the round
    print("I vincitori sono:", y_pred_tmp )
    j = j/2

In [None]:
# Print the tournament winner: the first prediction of the last simulated round
# selects P1 (label 1.0) or P2 (label 2.0) of its only match
final_label = y_pred_tmp[0]
if final_label == 1.0:
    vincitore = df2020_tmp.at[0, 'P1']
elif final_label == 2.0:
    vincitore = df2020_tmp.at[0, 'P2']
else:
    vincitore = ''
print("Il vincitore degli Australian Open è:", vincitore)

In [None]:
# Print the accuracy of the first-round predictions by comparing them with the
# real results stored in the 'Win' column of df2020
acc = accuracy_score(y_true=df2020['Win'], y_pred=y_pred)
print('Accuracy: ', acc)