In [1]:
# to handle datasets
import pandas as pd
from pandas import DataFrame
pd.pandas.set_option('display.max_columns',None)
import numpy as np

# divide test and train
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to build models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to evaluate models
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# to persist the model and scaler
import joblib

### Load Original Data

In [2]:
# been a while now but load the original dataset
ffmlDf = pd.read_csv('ffmlDf_20-21')
print(ffmlDf.shape)
ffmlDf.head()

(11679, 26)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,76656,7.0,2020-09-12,"('Mesut', 'Özil')",Fulham
1,1,8170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,5313,4.5,2020-09-12,"('Mark', 'Gillespie')",West Ham United
2,2,8190,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,2064,5.0,2020-09-12,"('Jacob', 'Murphy')",West Ham United
3,3,8210,8,84,1,0,1,0,0,0,0,0,0,0,2,32,32.0,23.3,50.0,10.5,0,538610,6.5,2020-09-12,"('Callum', 'Wilson')",West Ham United
4,4,8230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,65008,6.0,2020-09-12,"('Ryan', 'Fraser')",West Ham United


### Split into train and test set

In [3]:
# split into train and test
# dont forget the split used in earlier notebooks
# dont forget to set the seed
X_train, X_test, y_train, y_test = train_test_split(
    ffmlDf,
    ffmlDf['points'],
    test_size = 0.2, # 80:20 split
    random_state = 0 # setting the seed
)

X_train.shape, X_test.shape

((9343, 26), (2336, 26))

In [4]:
X_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
9827,9827,11635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,4.5,2021-01-12,"('Owen', 'Otasowie')",Everton
10422,10422,3512,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,4.0,2021-01-17,"('Stephen', 'Henderson')",Manchester City
3986,3986,4156,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16.4,0.8,0.0,1.7,0,10082,4.5,2020-11-07,"('Tim', 'Ream')",West Ham United
2531,2531,4686,3,90,0,0,0,1,0,0,0,0,0,3,0,21,12.2,0.0,0.0,1.2,0,828679,5.5,2020-10-18,"('Kasper', 'Schmeichel')",Aston Villa
1315,1315,9734,3,76,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,55175,7.0,2020-09-27,"('Giovani', 'Lo Celso')",Newcastle United


### Load the selected Features

In [5]:
features = pd.read_csv('selected_features.csv')
# convert to list
features = features['0'].to_list()

print('Number of feature selected:',len(features))

Number of feature selected: 16


### Engineer Missing Values

In [6]:
[
    var for var in features
    if ffmlDf[var].isnull().sum() > 0
]

[]

1. No missing values to engineer
2. No need to transform variables
3. Temporal variables dropped rather than engineered

### Rare Labels

In [7]:
# brought forward from notebook 3
cat_vars = [
    var for var in features
    if ffmlDf[var].dtype == 'O'
]
print(cat_vars)

def FindFrequentLabels(df, var, rare_perc):
    df = df.copy()
    tmp = df.groupby(var)['points'].count() / len(df)
    return tmp[tmp > rare_perc].index

# Rare Labels Romoved and Replaced!!!

for var in cat_vars:
    frequent_list = FindFrequentLabels(ffmlDf, var, 0.001)
    # i want values of 0.001 so im being a little lenient i feel
    print(var)
    print(frequent_list)
    print()
    
    X_train[var] = np.where(X_train[var].isin(frequent_list), X_train[var], 'Rare')
    X_test[var] = np.where(X_test[var].isin(frequent_list), X_test[var], 'Rare')

['playerName']
playerName
Index(['("N'Golo", 'Kanté')', '('Aaron', 'Connolly')',
       '('Aaron', 'Cresswell')', '('Aaron', 'Mooy')', '('Aaron', 'Ramsdale')',
       '('Aaron', 'Wan-Bissaka')', '('Abdoulaye', 'Doucouré')',
       '('Aboubakar', 'Kamara')', '('Adam', 'Forshaw')', '('Adam', 'Lallana')',
       ...
       '('Xherdan', 'Shaqiri')', '('Yan', 'Valery')', '('Yerry', 'Mina')',
       '('Yoshinori', 'Muto')', '('Youri', 'Tielemans')',
       '('Yves', 'Bissouma')', '('Zack', 'Steffen')',
       '('Zeze Steven', 'Sessegnon')', '('Çaglar', 'Söyüncü')',
       '('Ørjan', 'Nyland')'],
      dtype='object', name='playerName', length=601)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[var] = np.where(X_train[var].isin(frequent_list), X_train[var], 'Rare')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[var] = np.where(X_test[var].isin(frequent_list), X_test[var], 'Rare')


In [8]:
X_train[features].head()

Unnamed: 0,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,yelCards,redCards,saves,bonus,influence,creativity,threat,costGBP,playerName
9827,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,4.5,"('Owen', 'Otasowie')"
10422,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,4.0,"('Stephen', 'Henderson')"
3986,0,0,0,0,0,0,0,0,0,0,0,16.4,0.8,0.0,4.5,"('Tim', 'Ream')"
2531,90,0,0,0,1,0,0,0,0,3,0,12.2,0.0,0.0,5.5,"('Kasper', 'Schmeichel')"
1315,76,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,7.0,"('Giovani', 'Lo Celso')"


### Encoding Categorical Variables

In [9]:
# code was brought forward from notebook 3_FeatureEngineering
# Encoding Categorical Variables
def ReplaceCategories(train, test, var, target):
    # order labels in a var from low to high
    ordered_labels = train.groupby(var)[target].mean().sort_values().index
    
    # create a dictionary of ordered labels to integer values
    ordinal_label = {k: i for i, k in enumerate(ordered_labels, 0)}
    
    # use the dict to replace cat strings with integers
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)
    print(var)
    #print(ordinal_label)
    print(ordered_labels)
    print()

for var in cat_vars:
    ReplaceCategories(X_train, X_test, var, 'points')

playerName
Index(['('Neil', 'Taylor')', '('Rúnar Alex', 'Rúnarsson')', '('Jed', 'Steer')',
       '('Chris', 'Smalling')', '('Jean-Philippe', 'Gbamin')',
       '('Christian', 'Atsu')', '('Jean Michaël', 'Seri')',
       '('Jay-Roy', 'Grot')', '('Christian', 'Walton')', '('Jason', 'Steele')',
       ...
       '('Callum', 'Wilson')', '('Andrew', 'Robertson')',
       '('Marcus', 'Rashford')', '('Aaron', 'Cresswell')',
       '('Patrick', 'Bamford')', '('Emiliano', 'Martínez')',
       '('Mohamed', 'Salah')', '('Heung-Min', 'Son')',
       '('Bruno Miguel', 'Borges Fernandes')', '('Harry', 'Kane')'],
      dtype='object', name='playerName', length=602)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[var] = train[var].map(ordinal_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[var] = test[var].map(ordinal_label)


##### Check for na in test and train

use features rather than X_train.columns or X_test.columns

In [10]:
[
    var for var in features
    if X_train[var].isnull().sum() > 0
]

[]

In [11]:
[
    var for var in features
    if X_test[var].isnull().sum() > 0
]

[]

### Feature Scaling

In [12]:
# capture the targets
y_train = X_train['points']
y_test = X_test['points']

###### Setting X_train and X_test

We need to only use X_train/test = X_train/test[features] when scaling and predicting

In [13]:
X_train = X_train[features] # model has to be fit to the chosen features
X_test = X_test[features] # features has to be used here as well

In [14]:
X_train.head()

Unnamed: 0,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,yelCards,redCards,saves,bonus,influence,creativity,threat,costGBP,playerName
9827,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,4.5,248
10422,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,4.0,58
3986,0,0,0,0,0,0,0,0,0,0,0,16.4,0.8,0.0,4.5,212
2531,90,0,0,0,1,0,0,0,0,3,0,12.2,0.0,0.0,5.5,538
1315,76,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,7.0,331


In [15]:
X_test.head()

Unnamed: 0,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,yelCards,redCards,saves,bonus,influence,creativity,threat,costGBP,playerName
8833,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,4.0,151
8134,0,0,0,0,0,0,0,0,0,0,0,17.8,0.7,2.0,5.0,196
9195,63,0,0,1,0,0,0,0,0,0,0,2.4,0.3,0.0,7.0,497
8339,0,0,0,0,0,0,0,0,0,0,0,1.0,0.3,0.0,5.0,316
5426,90,0,0,0,1,0,0,0,0,0,0,11.4,3.4,6.0,5.5,349


#### fit the scaler

In [16]:
# setup scaler
scaler = MinMaxScaler()

# fit the scaler
scaler.fit(X_train)

MinMaxScaler()

In [17]:
scaler.data_max_

array([ 90. ,   4. ,   4. ,   1. ,   7. ,   1. ,   1. ,   1. ,   1. ,
        11. ,   3. , 117.2,  78.4, 161. ,  12. , 601. ])

In [18]:
scaler.data_min_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0.])

In [19]:
# transform the train and test set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Train the Linear Regression: Lasso

In [20]:
# setup the model
# remember to set the random_state/seed
lin_model = Lasso(alpha=0.005, random_state=0)
# train the model
lin_model.fit(X_train, y_train) # X_train[features]

# persist the model for future use
joblib.dump(lin_model, 'lasso_regression.pkl')

['lasso_regression.pkl']

In [21]:
# predict train using the model
pred = lin_model.predict(X_train)
# determine the mse, rmse and r2
print('train mse: {}'.format(int(mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(sqrt(mean_squared_error(y_train, pred)))))
print('train r2: {}'.format(r2_score(y_train, pred)))
print()
# predict test using the model
pred = lin_model.predict(X_test)
# determine the mse, rmse and r2
print('test mse: {}'.format(int(mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(sqrt(mean_squared_error(y_test, pred)))))
print('test r2: {}'.format(r2_score(y_test, pred)))

train mse: 0
train rmse: 0
train r2: 0.9276211534465508

test mse: 0
test rmse: 0
test r2: 0.9309082169527179
