In [1]:
# to handle datasets
import pandas as pd
from pandas import DataFrame
pd.pandas.set_option('display.max_columns', None)
import numpy as np

# to plot
import matplotlib.pyplot as plt
import seaborn as sns

# divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

In [2]:
# load dataset and drop the unneeded columns in the dataFrame
# rows with no gametime played, unnamed: 0 and datetime
ffmlDf = pd.read_csv('ffmlDF_20-21')

ffmlDf.head()

Unnamed: 0.1,Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,gameDate,playerName,oppositionTeam
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,76656,7.0,2020-09-12,"('Mesut', 'Özil')",Fulham
1,6378,6,90,0,0,1,0,0,0,0,0,0,0,0,27,26.0,0.1,2.0,2.8,0,88657,5.0,2020-09-12,"('Federico', 'Fernández')",West Ham United
2,6394,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,3326,4.5,2020-09-12,"('Ciaran', 'Clark')",West Ham United
3,6410,7,90,0,0,1,0,0,0,0,0,0,3,0,26,27.0,0.0,0.0,2.7,0,13715,5.0,2020-09-12,"('Karl', 'Darlow')",West Ham United
4,6426,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,27245,5.0,2020-09-12,"('Martin', 'Dubravka')",West Ham United


In [3]:
ffmlDf.shape

(9685, 25)

In [4]:
# drop unneccessary columns
ffmlDf = ffmlDf.drop(columns=['Unnamed: 0','gameDate'])

# remove zerominsplayed - taken from 2_DataAnalysis
def RemoveZeroMinsPlayed(df):
    df = df.copy().where(df['minsPlayed'] != 0)
    df.dropna(axis=0, inplace=True)
    return df

ffmlDf = RemoveZeroMinsPlayed(ffmlDf)

ffmlDf.head()

Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,playerName,oppositionTeam
1,6.0,90.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,26.0,0.1,2.0,2.8,0.0,88657.0,5.0,"('Federico', 'Fernández')",West Ham United
3,7.0,90.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,26.0,27.0,0.0,0.0,2.7,0.0,13715.0,5.0,"('Karl', 'Darlow')",West Ham United
7,6.0,90.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21.8,0.3,0.0,2.2,0.0,219489.0,4.5,"('Jamaal', 'Lascelles')",West Ham United
11,5.0,90.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,27.0,22.4,19.7,2.0,4.4,0.0,21964.0,4.5,"('Javier', 'Manquillo')",West Ham United
16,2.0,90.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,12.8,2.0,0.0,1.5,0.0,5941.0,5.0,"('Isaac', 'Hayden')",West Ham United


In [5]:
ffmlDf.shape

(4480, 23)

In [6]:
# separate data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(ffmlDf,
                                                   ffmlDf['points'],
                                                   test_size=0.2,
                                                   random_state=0)
# 80:20 split
# target is df['points']
X_train.shape, X_test.shape #everything is looking good so far

((3584, 23), (896, 23))

In [11]:
# missing values now have to be handled
# so num_vars and cat_vars are needed

num_vars = [
    var for var in ffmlDf.columns
    if X_train[var].dtypes != 'O'
]

print(len(num_vars),'num_vars')
num_vars

21 num_vars


['points',
 'minsPlayed',
 'goalsScored',
 'assists',
 'cleanSheets',
 'goalsConceded',
 'ownGoals',
 'penSaved',
 'penMissed',
 'yelCards',
 'redCards',
 'saves',
 'bonus',
 'bonusPointSystem',
 'influence',
 'creativity',
 'threat',
 'ictIndex',
 'netTransfers',
 'selectedBy',
 'costGBP']

In [12]:
#

cat_vars = [
    var for var in ffmlDf.columns
    if X_train[var].dtypes == 'O'
]

print(len(cat_vars),'cat_vars')
cat_vars

2 cat_vars


['playerName', 'oppositionTeam']