## Reading the dataset, cleaning it, then splitting it into train and test sets

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Reading the dataset
dataset = pd.read_csv("/kaggle/input/whoscore-league-data/WhoScoreSummaryPlayerInfo_merged.csv")

# Cleaning the position column: strip surrounding whitespace, keep only the
# first space-separated token, then encode that token as an integer.
# 'M' and 'MF' intentionally share the same code.
POSITION_CODES = {'GK': 1, 'DF': 2, 'M': 3, 'MF': 3, 'FW': 4}
position_token = dataset['position'].str.strip().str.split(pat=' ').str[0]

# Map the position column only. A DataFrame-wide replace() would also rewrite
# any cell in another column that happens to equal one of these tokens.
position_code = position_token.map(POSITION_CODES)

# Fail loudly, naming the offending tokens, instead of letting astype() raise
# an opaque conversion error on unmapped or missing values.
unmapped = position_token[position_code.isna()]
if not unmapped.empty:
    raise ValueError(
        "Unrecognised position value(s): %s" % unmapped.unique().tolist()
    )
dataset['position'] = position_code.astype(np.int64)

# Fixed seed so the split, and every score computed from it, is reproducible
# across kernel restarts.
RANDOM_STATE = 42
train, test = train_test_split(dataset, test_size=0.1, random_state=RANDOM_STATE)
print("Train set:\n", train)
print("Test set:\n", test)

Train set:
                     name           team  age  position  apps  minutesPlayed  \
947        Jeremy Toljan      Sassuolo,   28         2    14           1319   
615        Janik Haberer  Union Berlin,   28         3    13            859   
179         Darwin Núñez     Liverpool,   23         3     7            606   
824            Óscar Gil      Espanyol,   24         2    10            870   
977    Takehiro Tomiyasu       Arsenal,   24         2     4            406   
...                  ...            ...  ...       ...   ...            ...   
564         Koray Günter        Verona,   28         2    11           1003   
1034        James Milner     Liverpool,   36         2     4            464   
488   Rolando Mandragora    Fiorentina,   25         3    10            870   
759              José Sá        Wolves,   29         1    15           1350   
1347      Anthony Lozano         Cadiz,   29         3     8            757   

      goal  assistTotal  yellowCard  re

## Fitting a Linear Regression model on the dataset

In [31]:
from sklearn.linear_model import LinearRegression

# Feature columns fed to the regression; 'rating' is the target.
input_columns = [
    'position', 'apps', 'minutesPlayed', 'goal', 'assistTotal',
    'yellowCard', 'redCards', 'passRate', 'aerialWonPerGame', 'manOfMatch',
    'tacklePerGame', 'interceptionPerGame', 'foulsPerGame',
    'offsideWonPerGame', 'clearancePerGame', 'wasDribbledPerGame',
    'outFielderBlockPerGame', 'goalOwn', 'shotsPerGame', 'keyPassPerGame',
    'dribbleWonPerGame', 'foulGivenPerGame', 'offsideGivenPerGame',
    'dispossessedPerGame', 'turnOverPerGame', 'totalPassesPerGame',
    'passSuccess', 'accurateCrossesPerGame', 'accurateLongPassPerGame',
    'accurateThroughBallPerGame',
]
# Kept as a one-element list, so the target is selected as a
# single-column DataFrame rather than a Series.
output_column = ['rating']

# Slice features and target out of each split.
X_train, Y_train = train[input_columns], train[output_column]
X_test, Y_test = test[input_columns], test[output_column]

# Fit on the training split only.
model = LinearRegression()
model.fit(X_train, Y_train)

# Report the score on the held-out split first, then on the training split.
for fmt, features, target in (
    ('Test R^2     : %.3f', X_test, Y_test),
    ('Training R^2 : %.3f', X_train, Y_train),
):
    print(fmt % model.score(features, target))

Test R^2     : 0.849
Training R^2 : 0.835


## Applying the model to the whole dataset

In [33]:
# Predict a rating for every player in the full dataset. This includes the
# rows the model was trained on, so it is not an out-of-sample evaluation.
X_dataset = dataset[input_columns]
Y_preds = model.predict(X_dataset)

# Store the predictions next to the actual ratings and save the result.
dataset['preds'] = Y_preds
dataset.to_csv('preds.csv', index=False)

# Ten players with the highest predicted rating, best first.
ballon_dor_winner = dataset.sort_values(by='preds', ascending=False).head(10)
print(ballon_dor_winner.loc[:, ['name', 'rating', 'preds']])

                  name  rating     preds
0         Lionel Messi    8.65  8.642908
1               Neymar    7.96  8.122425
5       Martin Terrier    7.76  7.957384
2       Erling Haaland    7.90  7.795682
4      Kevin De Bruyne    7.76  7.754394
6   Robert Lewandowski    7.73  7.736290
18  Christopher Nkunku    7.43  7.654617
9        Jamal Musiala    7.64  7.649428
24      Vincenzo Grifo    7.41  7.608328
3        Kylian Mbappé    7.87  7.607145
