In [1]:
# to handle datasets
import pandas as pd
from pandas import DataFrame
pd.pandas.set_option('display.max_columns',None)
import numpy as np

# divide test and train
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to build models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to evaluate models
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# to persist the model and scaler
import joblib

### Load Original Data

In [2]:
# been a while now but load the original dataset
ffmlDf = pd.read_csv('ffmlDf_20-21')
print(ffmlDf.shape)
ffmlDf.head()

(9685, 25)


Unnamed: 0.1,Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,76656,7.0,2020-09-12,"('Mesut', 'Özil')",Fulham
1,6378,6,90,0,0,1,0,0,0,0,0,0,0,0,27,26.0,0.1,2.0,2.8,0,88657,5.0,2020-09-12,"('Federico', 'Fernández')",West Ham United
2,6394,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,3326,4.5,2020-09-12,"('Ciaran', 'Clark')",West Ham United
3,6410,7,90,0,0,1,0,0,0,0,0,0,3,0,26,27.0,0.0,0.0,2.7,0,13715,5.0,2020-09-12,"('Karl', 'Darlow')",West Ham United
4,6426,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,27245,5.0,2020-09-12,"('Martin', 'Dubravka')",West Ham United


### Split into train and test set

In [3]:
# split into train and test
# dont forget the split used in earlier notebooks
# dont forget to set the seed
X_train, X_test, y_train, y_test = train_test_split(
    ffmlDf,
    ffmlDf['points'],
    test_size = 0.2, # 80:20 split
    random_state = 0 # setting the seed
)

X_train.shape, X_test.shape

((7748, 25), (1937, 25))

In [4]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
3883,7532,9,90,0,0,1,0,0,0,0,0,0,0,3,12,24.0,18.6,4.0,4.7,0,748080,4.5,2020-11-06,"('Kyle', 'Walker-Peters')",Newcastle United
6150,7382,1,21,0,0,0,0,0,0,0,0,0,0,0,5,7.2,7.2,23.0,3.7,0,54083,6.5,2020-12-07,"('Nathan', 'Redmond')",Brighton and Hove Albion
4706,3125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,109,5.0,2020-11-22,"('Jean-Philippe', 'Gbamin')",Fulham
1127,8591,2,73,0,0,0,2,0,0,0,0,0,0,0,3,4.8,12.8,9.0,2.7,0,37744,5.5,2020-09-26,"('Grady', 'Diangana')",Chelsea
4639,6338,1,16,0,0,0,0,0,0,0,0,0,0,0,3,7.0,14.0,48.0,6.9,0,21822,5.5,2020-11-21,"('Andy', 'Carroll')",Chelsea


### Load the selected Features

In [5]:
features = pd.read_csv('selected_features.csv')
# convert to list
features = features['0'].to_list()

print('Number of feature selected:',len(features))

Number of feature selected: 18


### Engineer Missing Values

In [6]:
[
    var for var in features
    if ffmlDf[var].isnull().sum() > 0
]

[]

1. No missing values to engineer
2. No need to transform variables
3. Temporal variables dropped rather than engineered

### Rare Labels

In [7]:
# brought forward from notebook 3
cat_vars = [
    var for var in features
    if ffmlDf[var].dtype == 'O'
]
print(cat_vars)

def FindFrequentLabels(df, var, rare_perc):
    df = df.copy()
    tmp = df.groupby(var)['points'].count() / len(df)
    return tmp[tmp > rare_perc].index

# Rare Labels Romoved and Replaced!!!

for var in cat_vars:
    frequent_list = FindFrequentLabels(ffmlDf, var, 0.001)
    # i want values of 0.001 so im being a little lenient i feel
    print(var)
    print(frequent_list)
    print()
    
    X_train[var] = np.where(X_train[var].isin(frequent_list), X_train[var], 'Rare')
    X_test[var] = np.where(X_test[var].isin(frequent_list), X_test[var], 'Rare')

['playerName', 'oppositionTeam']
playerName
Index(['("N'Golo", 'Kanté')', '('Aaron', 'Connolly')',
       '('Aaron', 'Cresswell')', '('Aaron', 'Mooy')', '('Aaron', 'Ramsdale')',
       '('Aaron', 'Wan-Bissaka')', '('Abdoulaye', 'Doucouré')',
       '('Aboubakar', 'Kamara')', '('Adam', 'Forshaw')', '('Adam', 'Lallana')',
       ...
       '('Xherdan', 'Shaqiri')', '('Yan', 'Valery')', '('Yerry', 'Mina')',
       '('Yoshinori', 'Muto')', '('Youri', 'Tielemans')',
       '('Yves', 'Bissouma')', '('Zack', 'Steffen')',
       '('Zeze Steven', 'Sessegnon')', '('Çaglar', 'Söyüncü')',
       '('Ørjan', 'Nyland')'],
      dtype='object', name='playerName', length=600)

oppositionTeam
Index(['Arsenal', 'Aston Villa', 'Brighton and Hove Albion', 'Burnley',
       'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Leeds',
       'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United',
       'Newcastle United', 'Sheffield United', 'Southampton',
       'Tottenham Hotspur', 'West Brom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[var] = np.where(X_train[var].isin(frequent_list), X_train[var], 'Rare')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[var] = np.where(X_test[var].isin(frequent_list), X_test[var], 'Rare')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[var] = np.where(X_train[var].isin(freque

In [8]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
3883,7532,9,90,0,0,1,0,0,0,0,0,0,0,3,12,24.0,18.6,4.0,4.7,0,748080,4.5,2020-11-06,"('Kyle', 'Walker-Peters')",Newcastle United
6150,7382,1,21,0,0,0,0,0,0,0,0,0,0,0,5,7.2,7.2,23.0,3.7,0,54083,6.5,2020-12-07,"('Nathan', 'Redmond')",Brighton and Hove Albion
4706,3125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,109,5.0,2020-11-22,"('Jean-Philippe', 'Gbamin')",Fulham
1127,8591,2,73,0,0,0,2,0,0,0,0,0,0,0,3,4.8,12.8,9.0,2.7,0,37744,5.5,2020-09-26,"('Grady', 'Diangana')",Chelsea
4639,6338,1,16,0,0,0,0,0,0,0,0,0,0,0,3,7.0,14.0,48.0,6.9,0,21822,5.5,2020-11-21,"('Andy', 'Carroll')",Chelsea


### Encoding Categorical Variables

In [9]:
# code was brought forward from notebook 3_FeatureEngineering
# Encoding Categorical Variables
def ReplaceCategories(train, test, var, target):
    # order labels in a var from low to high
    ordered_labels = train.groupby(var)[target].mean().sort_values().index
    
    # create a dictionary of ordered labels to integer values
    ordinal_label = {k: i for i, k in enumerate(ordered_labels, 0)}
    
    # use the dict to replace cat strings with integers
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)
    print(var)
    print(ordinal_label)
    print()

for var in cat_vars:
    ReplaceCategories(X_train, X_test, var, 'points')

playerName
{"('Mohammed', 'Salisu')": 0, "('Teden', 'Mengi')": 1, "('Phil', 'Jagielka')": 2, "('Chris', 'Smalling')": 3, "('Phil', 'Jones')": 4, "('Christian', 'Atsu')": 5, "('Tahith', 'Chong')": 6, "('James', 'Trafford')": 7, "('Reiss', 'Nelson')": 8, "('Christian', 'Walton')": 9, "('James', 'Garner')": 10, "('Claudio', 'Bravo')": 11, "('Cole', 'Palmer')": 12, "('Connor', 'Wickham')": 13, "('James', 'Bree')": 14, "('Jake', 'Vokins')": 15, "('Terence', 'Kongolo')": 16, "('Jack', 'Wilshere')": 17, "('Paulo', 'Gazzaniga')": 18, "('Jarrad', 'Branthwaite')": 19, "('Brandon', 'Pierrick')": 20, "('Jonas', 'Lössl')": 21, "('John', 'Ruddy')": 22, "('Bruno André', 'Cavaco Jordao')": 23, "('Joe', 'Hart')": 24, "('Timothy', 'Fosu-Mensah')": 25, "('Jesse', 'Lingard')": 26, "('Jed', 'Steer')": 27, "('Jean-Philippe', 'Gbamin')": 28, "('Calum', 'Chambers')": 29, "('Cameron', 'Carter-Vickers')": 30, "('Jean Michaël', 'Seri')": 31, "('Jay-Roy', 'Grot')": 32, "('Cedric', 'Kipre')": 33, "('Jason', 'Steel

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[var] = train[var].map(ordinal_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[var] = test[var].map(ordinal_label)


##### Check for na in test and train

use features rather than X_train.columns or X_test.columns

In [10]:
[
    var for var in features
    if X_train[var].isnull().sum() > 0
]

[]

In [11]:
[
    var for var in features
    if X_test[var].isnull().sum() > 0
]

[]

### Feature Scaling

In [12]:
# capture the targets
y_train = X_train['points']
y_test = X_test['points']

In [13]:
# setup scaler
scaler = MinMaxScaler()

# fit the scaler
scaler.fit(X_train[features])

MinMaxScaler()

In [14]:
scaler.data_max_

array([ 90. ,   4. ,   4. ,   1. ,   7. ,   1. ,   1. ,   1. ,   1. ,
        11. ,   3. ,  69. , 117.2,  78.4, 161. ,  12. , 600. ,  19. ])

In [15]:
scaler.data_min_

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -7.,  0.,
        0.,  0.,  4.,  0.,  0.])

In [16]:
# transform the train and test set
X_train = scaler.transform(X_train[features])
X_test = scaler.transform(X_test[features])

### Train the Linear Regression: Lasso

In [17]:
# setup the model
# remember to set the random_state/seed
lin_model = Lasso(alpha=0.005, random_state=0)
# train the model
lin_model.fit(X_train, y_train)

# persist the model for future use
joblib.dump(lin_model, 'lasso_regression.pkl')

['lasso_regression.pkl']

In [18]:
# predict train using the model
pred = lin_model.predict(X_train)
# determine the mse, rmse and r2
print('train mse: {}'.format(int(mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(sqrt(mean_squared_error(y_train, pred)))))
print('train r2: {}'.format(r2_score(y_train, pred)))
print()
# predict test using the model
pred = lin_model.predict(X_test)
# determine the mse, rmse and r2
print('test mse: {}'.format(int(mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(sqrt(mean_squared_error(y_test, pred)))))
print('test r2: {}'.format(r2_score(y_test, pred)))

train mse: 0
train rmse: 0
train r2: 0.9281609379705237

test mse: 0
test rmse: 0
test r2: 0.9328015241205657
