In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import os
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import sklearn

## Load and clean the dataset

In [2]:
# Load the raw FIFA 19 player table. The regex separator also swallows any
# whitespace around each comma, which is why the Python parser engine is used.
csv_path = "FIFA19data.csv"
fifa = pd.read_csv(csv_path, engine='python', sep=r'\s*,\s*')
fifa.head()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,International Reputation,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,€110.5M,€565K,5.0,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,€77M,€405K,5.0,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,€118.5M,€290K,5.0,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,193080,De Gea,27,Spain,91,93,Manchester United,€72M,€260K,4.0,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,€102M,€355K,4.0,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [3]:
import re
from io import StringIO
# Turn wage strings such as "€565K" into plain numbers by pulling the numeric
# part out of every cell; the currency symbol and the "K" suffix are dropped, so
# values stay in the units shown in the raw column.
# This works on the cell values directly instead of round-tripping the column
# through its printed form, so it does not depend on pandas display formatting
# and gives the same result when the cell is re-run on an already numeric column.
wage_digits = fifa['Wage'].astype(str).str.extract(r'([0-9.]+)', expand=False)
fifa['Wage'] = pd.to_numeric(wage_digits)

In [4]:
# List the distinct parsed wage values as a quick check that the column is numeric.
fifa['Wage'].unique()

array([565, 405, 290, 260, 355, 340, 420, 455, 380,  94, 205, 125, 285,
       225, 145, 240, 315, 200, 130, 300, 215, 100, 255, 165, 265, 160,
       150, 245, 110,  77, 115, 210, 195, 230, 250, 135, 155, 180, 175,
       190, 185,  21,  82,  73,  92,  88,  96, 170,  66, 235,  28, 105,
        38,  81,  57,  15,  63,  22,  84, 120,  90,  72,  93,  45,  74,
        51,  42,  31,  75,  25, 140,  41,  78,  53,  95,  80,  43,  60,
        85,  64,  67,  18,  70,  91,  20,  49,  87,  86,  26,  29,  55,
        35,  33,  56,  30,  11,  59,  23,  46,  39,  32,  36,  98,  54,
        68,  58,  27,  40,  44,  19,   1,  61,  50,  99,  17,  52,  62,
        12,  10,  71,  14,  76,  48,  65,  69,  24,  34,  16,  37,  47,
        89,   0,  97,  79,  13,  83,   6,   3,   9,   8,   7,   4,   2,
         5], dtype=int64)

In [5]:
# Remove identifier, free-text and monetary columns that are not used as
# predictors, together with 'Potential'. 'Wage' is dropped here as well.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# is no longer accepted by current pandas releases.
# NOTE(review): 'Unnamed: 0' (the CSV row number shown in the preview) is not
# dropped, so it stays in the frame and later becomes a feature — confirm that
# this is intended.
fifa = fifa.drop(
    columns=[
        'ID',
        'Name',
        'Nationality',
        'Club',
        'Value',
        'Wage',
        'Body Type',
        'Potential',
    ]
)

In [6]:
# Preview the columns that remain after the drops.
fifa.head()

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Work Rate,Position,Contract Valid Until,Crossing,Finishing,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,31,94,5.0,4.0,4.0,Medium/ Medium,RF,2021,84.0,95.0,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,33,94,5.0,4.0,5.0,High/ Low,ST,2022,84.0,94.0,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,26,92,5.0,5.0,5.0,High/ Medium,LW,2022,79.0,87.0,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,27,91,4.0,3.0,1.0,Medium/ Medium,GK,2020,17.0,13.0,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,27,91,4.0,5.0,4.0,High/ High,RCM,2023,93.0,82.0,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [7]:
# Impute missing values column by column with that column's most frequent value.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is a chained in-place update that pandas warns about and that
# is not guaranteed to modify `fifa` itself.
for col in fifa.columns:
    fifa[col] = fifa[col].fillna(fifa[col].mode()[0])

# Columns treated as categorical and one-hot encoded below.
factors = ['International Reputation', 'Weak Foot', 'Skill Moves', 'Work Rate', 'Position', 'Contract Valid Until']

# Replace each factor column by its indicator columns (named "<column>_<level>"),
# appended at the end of the frame.
for var in factors:
    dummies = pd.get_dummies(fifa[var], prefix=var)
    fifa = pd.concat([fifa.drop(columns=var), dummies], axis=1)

In [8]:
# Feature matrix: every column except the target. drop() already returns a new
# frame, so no explicit copy of the whole table is needed; `columns=` replaces
# the positional axis argument that current pandas no longer accepts.
X = fifa.drop(columns='Overall')
# Target vector: the player's overall rating, detached from `fifa`.
Y = fifa['Overall'].copy()

In [9]:
# Split into training and test sets with a fixed seed for reproducibility.
# NOTE(review): test_size=0.9 puts 90% of the rows in the test split and leaves
# only 10% for training — confirm this is intended and not meant to be 0.1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.9,
    random_state=31,
)

# Basic Linear Model

In [10]:
# Fit ordinary least squares on the training split, predict the test split and
# report the resulting R^2.
lm1 = LinearRegression().fit(X_train, y_train)
lm1_predictions = lm1.predict(X_test)
lm1_r2 = r2_score(y_true=y_test, y_pred=lm1_predictions)
print(lm1_r2)

0.8904970737556855


# Cross-Validation

In [11]:
# 5-fold out-of-fold predictions for the linear model, then a single R^2 over
# all of them.
# NOTE(review): the folds are drawn from the test split (X_test / y_test), not
# from the training split — confirm this is intended.
cv_predictions = cross_val_predict(lm1, X_test, y_test, cv=5)
cv_r2 = r2_score(y_true=y_test, y_pred=cv_predictions)
print(cv_r2)

0.895506109826533


In [12]:
# Average of the five per-fold scores, using the estimator's default scorer.
cv_scores_default = cross_val_score(lm1, X_test, y_test, cv=5)
cv_scores_default.mean()

0.8954636747573896

In [13]:
# Same computation with the scorer named explicitly: asking for 'r2' gives the
# same mean as the default scorer used in the previous cell.
cv_scores_r2 = cross_val_score(lm1, X_test, y_test, cv=5, scoring='r2')
cv_scores_r2.mean()

0.8954636747573896

In [16]:
# R^2 of the fitted linear model on the test split (same value as lm1_r2 above).
lm1.score(X_test,y_test)

0.8904970737556855

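The 5-fold cross-validated R² (≈ 0.895) is slightly higher than the hold-out R² of the basic linear model (≈ 0.890). Both numbers describe the same linear regression; the difference is that the cross-validation folds are fit on parts of the test split, whereas the basic model is fit on the training split.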

In [15]:
# List every scoring name accepted by the `scoring=` argument.
# Newer scikit-learn releases expose the names through get_scorer_names() and no
# longer provide the SCORERS dictionary; older releases only have SCORERS, so
# fall back to it when the function is missing.
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

# Lasso Regression

In [17]:
# Lasso with its default settings, fitted on the training split.
lasso = Lasso().fit(X_train, y_train)
lasso1_predictions = lasso.predict(X_test)

# R^2 on each split, plus the number of features whose coefficient is non-zero.
train_score = lasso.score(X_train, y_train)
test_score = lasso.score(X_test, y_test)
coeff_used = np.sum(lasso.coef_ != 0)

In [21]:
# Report the default-Lasso scores and how many features it kept.
for label, value in (
    ("lasso training score:", train_score),
    ("lasso test score: ", test_score),
    ("number of features used: ", coeff_used),
):
    print(label, value)

lasso training score: 0.8584100702105251
lasso test score:  0.8508221432029555
number of features used:  23


In [26]:
# Adjusted R^2 comparison between the linear model and the default Lasso,
# both evaluated on the training split.
def adjusted_r2(r2, n_samples, n_predictors):
    """Return the adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1).

    r2            -- plain R^2 of the fitted model
    n_samples     -- number of observations the R^2 was computed on (n)
    n_predictors  -- number of predictors counted against the model (p)
    """
    return 1 - (1 - r2) * ((n_samples - 1) / (n_samples - n_predictors - 1))


lm_train_score = lm1.score(X_train, y_train)
lm_test_score = lm1.score(X_test, y_test)

# The linear model is charged for every feature it was fitted on.
lm_ra = adjusted_r2(lm_train_score, len(X_train), len(lm1.coef_))
print("linear regression adjusted R square : ", lm_ra)
print("linear regression training score : ", lm_train_score)
print("\n")

# The Lasso is charged only for the features with a non-zero coefficient.
lasso_ra = adjusted_r2(train_score, len(X_train), coeff_used)
print("Lasso regression adjusted R square : ", lasso_ra)
print("Lasso regression training score: ", train_score)

linear regression R square :  0.8966076450336329
linear regression training score :  0.9035421515239555


Lasso regression R square :  0.8565968361430651
Lasso regression training score:  0.8584100702105251


In [27]:
# Tune the Lasso penalty strength with a 5-fold grid search on the training split.
lasso = Lasso()

# Candidate alpha values, from practically unpenalised up to strongly penalised.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
}

lasso_regressor = GridSearchCV(estimator=lasso, param_grid=parameters, cv=5)
lasso_regressor.fit(X_train, y_train)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [28]:
# Show the alpha value that the grid search selected.
lasso_regressor.best_params_

{'alpha': 0.01}

In [33]:
# Score of the grid-searched Lasso; note this is computed on the training split,
# not on the test split.
print("using lasso regression grid search:")
lasso_regressor.score(X_train,y_train)

using lasso regression grid search:


0.900304099950968

In [32]:
# Count the features that the selected Lasso keeps (non-zero coefficients).
# This overwrites the count stored earlier for the default Lasso.
best_lasso = lasso_regressor.best_estimator_
coeff_used = np.sum(best_lasso.coef_ != 0)
print("coefficients used:", coeff_used)

coefficients used: 59


In [34]:
# Test-split predictions from the grid-searched Lasso.
lasso2_predictions = lasso_regressor.predict(X_test)

# AIC and BIC

In [35]:
def AIC(y_true, y_hat, coeff_used):
    """Return the Akaike information criterion n*ln(SSE/n) + 2*k.

    y_true      -- observed values (array, list or pandas Series)
    y_hat       -- predicted values, same length and order as y_true
    coeff_used  -- number of estimated parameters k

    Both inputs are converted to flat float arrays first, so the residuals are
    always taken position by position. Subtracting two pandas objects directly
    would instead align them on their index labels and yield NaN when the
    labels differ.
    """
    resid = np.ravel(np.asarray(y_true, dtype=float)) - np.ravel(np.asarray(y_hat, dtype=float))
    sse = np.sum(resid ** 2)
    n = resid.size
    return n * np.log(sse / n) + 2 * coeff_used

def BIC(y_true, y_hat, coeff_used):
    """Return the Bayesian information criterion n*ln(SSE/n) + ln(n)*k.

    y_true      -- observed values (array, list or pandas Series)
    y_hat       -- predicted values, same length and order as y_true
    coeff_used  -- number of estimated parameters k

    Both inputs are converted to flat float arrays first, so the residuals are
    always taken position by position. Subtracting two pandas objects directly
    would instead align them on their index labels and yield NaN when the
    labels differ.
    """
    resid = np.ravel(np.asarray(y_true, dtype=float)) - np.ravel(np.asarray(y_hat, dtype=float))
    sse = np.sum(resid ** 2)
    n = resid.size
    return n * np.log(sse / n) + np.log(n) * coeff_used

In [38]:
# AIC and BIC of the plain linear model on the test split. The parameter count
# is one per feature column plus one (presumably for the intercept).
print("aic and bic of simple linear model:")
n_params_lm1 = len(X_test.columns) + 1
aic_lm1 = AIC(y_test, lm1_predictions, n_params_lm1)
print(aic_lm1)
bic_lm1 = BIC(y_test, lm1_predictions, n_params_lm1)
print(bic_lm1)

aic and bic of simple linear model:
27300.326786055277
28247.948750890464


In [39]:
# AIC and BIC of the grid-searched Lasso on the test split. The parameter count
# is the number of non-zero coefficients plus one (presumably for the intercept).
print("aic and bic of lasso model:")
n_params_lasso2 = coeff_used + 1
aic_lasso2 = AIC(y_test, lasso2_predictions, n_params_lasso2)
print(aic_lasso2)
bic_lasso2 = BIC(y_test, lasso2_predictions, n_params_lasso2)
print(bic_lasso2)

aic and bic of lasso model:
27274.691985757767
27736.94660275054
