## Reading the dataset

In [61]:
import numpy as np
import pandas as pd

#Reading the dataset
dataset = pd.read_csv("../../Data/ProcessedData/CleanData.csv")

## Cleaning position field
Position is a category field consists of {GK, DF, MF, FW}

Since Goalkeeper (GK) have different stats compared to other positions, we remove all goalkeeper from the dataset

The other position are encode into numerical type to be able to go through a Linear Regression Model

In [62]:
dataset = dataset[dataset['position'] != 'GK']
dataset = dataset.replace(to_replace=['DF', 'MF', 'FW'],value=[1, 2, 3])
dataset['position'] = dataset['position'].astype(np.int64)

## Spliting dataset into train set and test set

In [63]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.1)
print("Train set:\n", train)
print("Test set:\n", test)

Train set:
       Unnamed: 0              name                  team  age          league  \
549          549       Fran García       Rayo Vallecano,   23         La Liga   
110          110        Adam Ounas                Lille,   26         Ligue 1   
259          259  Federico Dimarco                Inter,   25         Serie A   
1222        1222    Atakan Karazor            Stuttgart,   26      Bundesliga   
1393        1393      Tete Morente                Elche,   26         La Liga   
...          ...               ...                   ...  ...             ...   
1336        1336   Brennan Johnson    Nottingham Forest,   21  Premier League   
330          330  Jesper Lindstrøm  Eintracht Frankfurt,   22      Bundesliga   
952          952   Rayan Aït-Nouri               Wolves,   21  Premier League   
1278        1278  Stuart Armstrong          Southampton,   30  Premier League   
48            48        Leroy Sané               Bayern,   26      Bundesliga   

      position 

## Putting dataset through a Linear Regression model
Apps field is removed due to high correlation with minutesPlayed

Team and league field is removed since they have no relationship with a player's rating

In [64]:
from sklearn.neighbors import KNeighborsRegressor

input_columns = ['position', 'minutesPlayed','goal','assistTotal','yellowCard','redCards',
    'passRate','aerialWonPerGame','manOfMatch','tacklePerGame','interceptionPerGame','foulsPerGame',
    'offsideWonPerGame','clearancePerGame','wasDribbledPerGame','outFielderBlockPerGame','goalOwn',
    'shotsPerGame','keyPassPerGame','dribbleWonPerGame','foulGivenPerGame','offsideGivenPerGame',
    'dispossessedPerGame','turnOverPerGame','totalPassesPerGame','passSuccess','accurateCrossesPerGame',
    'accurateLongPassPerGame','accurateThroughBallPerGame']
output_column = ['rating']

X_train = train[input_columns]
Y_train = train[output_column]
X_test = test[input_columns]
Y_test = test[output_column]

model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, Y_train)
print('Test R^2     : %.3f'%model.score(X_test, Y_test))
print('Training R^2 : %.3f'%model.score(X_train, Y_train))

Test R^2     : 0.458
Training R^2 : 0.539


## Try to apply the model on the whole dataset to predict Ballon D'or winner

In [65]:
X_dataset = dataset[input_columns]
Y_preds = model.predict(X_dataset)
dataset['preds'] = Y_preds

ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(ballon_dor_winner[['name', 'rating', 'preds']])

                      name  rating  preds
0             Lionel Messi    8.65  7.474
36                   Rodri    7.34  7.354
16         Vinícius Júnior    7.44  7.238
94   Branco van den Boomen    7.12  7.218
44            Mikel Merino    7.31  7.210
449         Quentin Merlin    6.73  7.156
120          Wilfried Zaha    7.06  7.154
84            João Cancelo    7.15  7.144
131        Felipe Anderson    7.03  7.134
11           Gabriel Jesus    7.59  7.124


## Predicting Kopa Trophy (U21 Ballon D'or) winner

In [66]:
dataset = dataset[dataset['age'] <= 21]
young_ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(young_ballon_dor_winner[['name', 'rating', 'preds']])

                   name  rating  preds
449      Quentin Merlin    6.73  7.156
34   Gabriel Martinelli    7.35  7.008
33          Bukayo Saka    7.35  7.002
27              Rodrygo    7.40  6.966
839   Kiliann Sildillia    6.54  6.954
742     Khéphren Thuram    6.58  6.934
826    Naouirou Ahamada    6.54  6.932
15      Jude Bellingham    7.45  6.930
360     Folarin Balogun    6.80  6.916
244      Moisés Caicedo    6.91  6.916
