## Reading the dataset

In [128]:
import numpy as np
import pandas as pd

#Reading the dataset
dataset = pd.read_csv("../Data/ProcessedData/CleanData.csv")

## Cleaning position field
Position is a category field consists of {GK, DF, MF, FW}

Since Goalkeeper (GK) have different stats compared to other positions, we remove all goalkeeper from the dataset

The other position are encode into numerical type to be able to go through a Linear Regression Model

In [129]:
dataset = dataset[dataset['position'] != 'GK']
dataset = dataset.replace(to_replace=['DF', 'MF', 'FW'],value=[1, 2, 3])
dataset['position'] = dataset['position'].astype(np.int64)

## Spliting dataset into train set and test set

In [130]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.1)
print("Train set:\n", train)
print("Test set:\n", test)

Train set:
       Unnamed: 0                      name                  team  age  \
1036        1036               Lucas Olaza      Real Valladolid,   28   
1096        1096            Cyrille Bayala           AC Ajaccio,   26   
592          592            Anthony Gordon              Everton,   21   
935          935            Giovanni Reyna    Borussia Dortmund,   20   
288          288         Benoît Badiashile               Monaco,   21   
...          ...                       ...                   ...  ...   
673          673  Jean-Charles Castelletto               Nantes,   27   
304          304          Christopher Lenz  Eintracht Frankfurt,   28   
565          565               Kai Havertz              Chelsea,   23   
1141        1141            Thomas Delaine           Strasbourg,   30   
165          165              Eberechi Eze       Crystal Palace,   24   

              league  position  apps  minutesPlayed  goal  assistTotal  ...  \
1036         La Liga         1  

## Putting dataset through a Linear Regression model
Apps field is removed due to high correlation with minutesPlayed

Team and league field is removed since they have no relationship with a player's rating

In [131]:
from sklearn.linear_model import LinearRegression

input_columns = ['position', 'minutesPlayed','goal','assistTotal','yellowCard','redCards',
    'passRate','aerialWonPerGame','manOfMatch','tacklePerGame','interceptionPerGame','foulsPerGame',
    'offsideWonPerGame','clearancePerGame','wasDribbledPerGame','outFielderBlockPerGame','goalOwn',
    'shotsPerGame','keyPassPerGame','dribbleWonPerGame','foulGivenPerGame','offsideGivenPerGame',
    'dispossessedPerGame','turnOverPerGame','totalPassesPerGame','passSuccess','accurateCrossesPerGame',
    'accurateLongPassPerGame','accurateThroughBallPerGame']
output_column = ['rating']

X_train = train[input_columns]
Y_train = train[output_column]
X_test = test[input_columns]
Y_test = test[output_column]

model = LinearRegression()
model.fit(X_train, Y_train)
print('Test R^2     : %.3f'%model.score(X_test, Y_test))
print('Training R^2 : %.3f'%model.score(X_train, Y_train))

Test R^2     : 0.908
Training R^2 : 0.896


## Try to apply the model on the whole dataset to predict Ballon D'or winner

In [132]:
X_dataset = dataset[input_columns]
Y_preds = model.predict(X_dataset)
dataset['preds'] = Y_preds

ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(ballon_dor_winner[['name', 'rating', 'preds']])

                  name  rating     preds
0         Lionel Messi    8.65  8.605449
1               Neymar    7.96  8.269725
2       Erling Haaland    7.90  7.866667
5       Martin Terrier    7.76  7.864512
3        Kylian Mbappé    7.87  7.783637
4      Kevin De Bruyne    7.76  7.724590
9        Jamal Musiala    7.64  7.695884
6   Robert Lewandowski    7.73  7.672217
19     Niclas Füllkrug    7.42  7.592998
18  Christopher Nkunku    7.43  7.544513


## Predicting Kopa Trophy (U21 Ballon D'or) winner

In [133]:
dataset = dataset[dataset['age'] <= 21]
young_ballon_dor_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(young_ballon_dor_winner[['name', 'rating', 'preds']])

                      name  rating     preds
9            Jamal Musiala    7.64  7.695884
15         Jude Bellingham    7.45  7.516515
12   Khvicha Kvaratskhelia    7.55  7.500007
25          Dango Ouattara    7.40  7.311857
27                 Rodrygo    7.40  7.302358
33             Bukayo Saka    7.35  7.285692
34      Gabriel Martinelli    7.35  7.177151
121       Jeremie Frimpong    7.06  7.134649
57                   Pedri    7.21  6.974233
202          Nico Williams    6.95  6.929219
