## Reading the dataset

In [1]:
import numpy as np
import pandas as pd

#Reading the dataset
dataset = pd.read_csv("../../Data/ProcessedData/CleanData.csv")

## Cleaning position field
Position is a category field consists of {GK, DF, MF, FW}

Since Goalkeeper (GK) have different stats compared to other positions, we remove all goalkeeper from the dataset

The other position are encode into numerical type to be able to go through a Linear Regression Model

In [326]:
dataset = dataset[dataset['position'] != 'GK']
dataset = dataset.replace(to_replace=['DF', 'MF', 'FW'],value=[1, 2, 3])
dataset['position'] = dataset['position'].astype(np.int64)

## Spliting dataset into train set and test set

In [327]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.1)
print("Train set:\n", train)
print("Test set:\n", test)

Train set:
       Unnamed: 0                      name                team  age  \
1257        1257  Agustín Álvarez Martínez           Sassuolo,   21   
1307        1307             Jim Allevinah      Clermont Foot,   27   
1327        1327                Jens Stage      Werder Bremen,   26   
732          732              Rubén García            Osasuna,   29   
906          906         Giacomo Raspadori             Napoli,   22   
...          ...                       ...                 ...  ...   
530          530          Faitout Maouassa        Montpellier,   24   
1162        1162                Steve Cook  Nottingham Forest,   31   
67            67         Maximilian Arnold          Wolfsburg,   28   
828          828               Amine Harit          Marseille,   25   
1275        1275          Mikkel Damsgaard          Brentford,   22   

              league  position  apps  minutesPlayed  goal  assistTotal  ...  \
1257         Serie A         3     0            163     

## Putting dataset through a Linear Regression model
Apps field is removed due to high correlation with minutesPlayed

Team and league field is removed since they have no relationship with a player's rating

In [328]:
from sklearn.ensemble import RandomForestRegressor

input_columns = ['position', 'minutesPlayed','goal','assistTotal','yellowCard','redCards',
    'passRate','aerialWonPerGame','manOfMatch','tacklePerGame','interceptionPerGame','foulsPerGame',
    'offsideWonPerGame','clearancePerGame','wasDribbledPerGame','outFielderBlockPerGame','goalOwn',
    'shotsPerGame','keyPassPerGame','dribbleWonPerGame','foulGivenPerGame','offsideGivenPerGame',
    'dispossessedPerGame','turnOverPerGame','totalPassesPerGame','passSuccess','accurateCrossesPerGame',
    'accurateLongPassPerGame','accurateThroughBallPerGame']
output_column = ['rating']

X_train = train[input_columns]
Y_train = train[output_column]
X_test = test[input_columns]
Y_test = test[output_column]

model = RandomForestRegressor(n_estimators = 50, random_state = 0)
model.fit(X_train, Y_train)
print('Test R^2     : %.3f'%model.score(X_test, Y_test))
print('Training R^2 : %.3f'%model.score(X_train, Y_train))

  model.fit(X_train, Y_train)


Test R^2     : 0.808
Training R^2 : 0.964


## Try to apply the model on the whole dataset to predict Ballon D'or winner

In [329]:
X_dataset = dataset[input_columns]
Y_preds = model.predict(X_dataset)
dataset['preds'] = Y_preds

ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(ballon_dor_winner[['name', 'rating', 'preds']])

                     name  rating   preds
0            Lionel Messi    8.65  8.0278
1                  Neymar    7.96  7.7980
3           Kylian Mbappé    7.87  7.6842
5          Martin Terrier    7.76  7.6742
6      Robert Lewandowski    7.73  7.6634
2          Erling Haaland    7.90  7.6542
4         Kevin De Bruyne    7.76  7.6452
7          Joshua Kimmich    7.67  7.4668
11          Gabriel Jesus    7.59  7.4520
12  Khvicha Kvaratskhelia    7.55  7.4474


## Predicting Kopa Trophy (U21 Ballon D'or) winner

In [330]:
dataset = dataset[dataset['age'] <= 21]
young_ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(young_ballon_dor_winner[['name', 'rating', 'preds']])

                      name  rating   preds
12   Khvicha Kvaratskhelia    7.55  7.4474
9            Jamal Musiala    7.64  7.4146
25          Dango Ouattara    7.40  7.3058
15         Jude Bellingham    7.45  7.2996
33             Bukayo Saka    7.35  7.2734
34      Gabriel Martinelli    7.35  7.2714
27                 Rodrygo    7.40  7.1130
57                   Pedri    7.21  7.0944
121       Jeremie Frimpong    7.06  7.0362
202          Nico Williams    6.95  6.9500
