## Reading the dataset

In [249]:
import numpy as np
import pandas as pd

#Reading the dataset
dataset = pd.read_csv("./WhoScoreSavingPlayerInfo.csv")
dataset['saveTotal'] = dataset['saveTotal'] / dataset['apps']
dataset['saveObox'] = dataset['saveObox'] / dataset['apps']
dataset['savePenaltyArea'] = dataset['savePenaltyArea'] / dataset['apps']
dataset['saveSixYardBox'] = dataset['saveSixYardBox'] / dataset['apps']

## Spliting dataset into train set and test set

In [250]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.1)
print("Train set:\n", train)
print("Test set:\n", test)

Train set:
                        name              team  age position  apps  \
56                Ivo Grbic  Atletico Madrid,   26     , GK     2   
128          Florian Müller    VfB Stuttgart,   25     , GK    15   
54                Rui Silva       Real Betis,   28     , GK    14   
16            Koen Casteels        Wolfsburg,   30     , GK    15   
66           Janis Blaswich       RB Leipzig,   31     , GK     8   
..                      ...               ...  ...      ...   ...   
99   Vanja Milinkovic-Savic           Torino,   25     , GK    18   
19                Pau López        Marseille,   28     , GK    16   
65           Manuel Riemann           Bochum,   34     , GK    15   
18        Guglielmo Vicario           Empoli,   26     , GK    18   
45           Aaron Ramsdale          Arsenal,   24     , GK    18   

     minutesPlayed  saveTotal  saveSixYardBox  savePenaltyArea  saveObox  \
56             180   1.250000        0.250000         0.500000  0.500000   
128    

## Putting dataset through a Linear Regression model
Apps field is removed due to high correlation with minutesPlayed

Name and team field is removed since they have no relationship with a player's rating

In [251]:
from sklearn.linear_model import LinearRegression

input_columns = ['minutesPlayed','saveTotal','saveSixYardBox','savePenaltyArea','saveObox']
output_column = ['rating']

X_train = train[input_columns]
Y_train = train[output_column]
X_test = test[input_columns]
Y_test = test[output_column]

model = LinearRegression()
model.fit(X_train, Y_train)
print('Test R^2     : %.3f'%model.score(X_test, Y_test))
print('Training R^2 : %.3f'%model.score(X_train, Y_train))

Test R^2     : 0.448
Training R^2 : 0.313


## Try to apply the model on the whole dataset to predict Yashin Trophy winner

In [252]:
X_dataset = dataset[input_columns]
Y_preds = model.predict(X_dataset)
dataset['preds'] = Y_preds

yashin_winner = dataset.sort_values(['preds'],ascending=False)[:10]
print(yashin_winner[['name', 'rating', 'preds']])

                name  rating     preds
5    Wayne Hennessey    7.16  7.855881
0      Asmir Begovic    7.82  7.523490
25    Jan Olschowsky    6.89  7.015152
3    Guillermo Ochoa    7.16  6.935902
123   Fraser Forster    6.35  6.858708
21     Claudio Bravo    6.93  6.792704
27       Maxime Dupé    6.88  6.787808
67         Matz Sels    6.67  6.785179
35   Alexander Nübel    6.86  6.784955
46         Mory Diaw    6.80  6.780958
