# Saudi League Analysis
# Part 3: Modeling

In [94]:
import numpy as np 
import pandas as pd

import seaborn as sns

from sklearn.preprocessing import LabelEncoder

# Content:
## 1- Load Data
## 2- Data Preparation
- Correlation Based
- Recursive Feature Elimination

## 3- Models Selection
- Check Different Models
- Hyper-parameters Tuning

## 4- Model Evaluation

***
# 1- ***Load Data***
***

In [49]:
# Read the two player tables and drop the 'index' column from each.
p14 = pd.read_csv('p_period 14-21').drop(columns='index')
p18 = pd.read_csv('p_period 18-21').drop(columns='index')

# p18 additionally has these suffixed columns removed.
p18_extra_cols = ['90s.1', 'Gls.2', 'PK.1', 'PKatt.1', 'Born.1', 'MP.1', '90s.2']
p18 = p18.drop(columns=p18_extra_cols)


In [50]:
# Print the (rows, columns) shape of each table, p14 first.
for frame in (p14, p18):
    print(frame.shape)

(3719, 42)
(1991, 53)


- `p14`: 3,719 rows × 42 columns = 156,198 values
- `p18`: 1,991 rows × 53 columns = 105,523 values

***
# 2- ***Data Preparation***
***

In [51]:
# Preview the first three rows of p14.
p14.head(n=3)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,Subs,Mn/Sub,unSub,PPM,onG,onGA,+/-,+/-90,On-Off,year
0,1,Malek AlAbadalmanam,KSA,F,Al-Fayha,23.0,1998.0,22,8,735,...,14,14.0,2,1.32,7.0,4.0,3.0,0.37,0.64,2021
1,2,Saleh AlAbbas,KSA,M,Al-Faisaly,27.0,1993.0,9,1,247,...,8,24.0,12,0.22,2.0,5.0,-3.0,-1.09,-0.87,2021
2,3,Hamad AlAbdan,KSA,M,Al-Hazem,21.0,2000.0,8,3,298,...,5,19.0,2,1.0,1.0,5.0,-4.0,-1.21,-0.35,2021


In [52]:
# Preview the first three rows of p18.
p18.head(n=3)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,+/-90,On-Off,Fls,Fld,Off,Crs,Int,TklW,OG,year
0,1,Malek AlAbadalmanam,KSA,F,Al-Fayha,23.0,1998.0,22,8,735,...,0.37,0.64,27,12,3,1,1,1,0,2021
1,2,Saleh AlAbbas,KSA,M,Al-Faisaly,27.0,1993.0,9,1,247,...,-1.09,-0.87,5,7,1,1,1,1,0,2021
2,3,Hamad AlAbdan,KSA,M,Al-Hazem,21.0,2000.0,8,3,298,...,-1.21,-0.35,3,0,0,2,1,2,0,2021


In [53]:
# Show the column names, then the per-column dtype / non-null summary of p14.
print(p14.columns)
print()
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
p14.info()

Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts',
       'Min', '90s', 'Gls', 'Ast', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR',
       'Gls.1', 'Ast.1', 'G+A', 'G-PK.1', 'G+A-PK', 'index.1', 'MP.1', 'Min.1',
       'Mn/MP', 'Min%', '90s.1', 'Starts.1', 'Mn/Start', 'Compl', 'Subs',
       'Mn/Sub', 'unSub', 'PPM', 'onG', 'onGA', '+/-', '+/-90', 'On-Off',
       'year'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3719 entries, 0 to 3718
Data columns (total 42 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rk        3719 non-null   int64  
 1   Player    3719 non-null   object 
 2   Nation    3719 non-null   object 
 3   Pos       3719 non-null   object 
 4   Squad     3719 non-null   object 
 5   Age       3719 non-null   float64
 6   Born      3719 non-null   float64
 7   MP        3719 non-null   int64  
 8   Starts    3719 non-null   int64  
 9   Min       3719 non-null   int64  
 10 

***Label Encoder***

- D = 0
- F = 1
- G = 2
- M = 3

In [54]:
# Encode the position labels in 'Pos' as integers.
# One encoder is fitted on the positions of BOTH tables, so a given label
# always gets the same integer code in p14 and in p18. Refitting the encoder
# separately for each table lets the codes diverge whenever the two tables do
# not contain exactly the same set of positions.
lbl = LabelEncoder()
lbl.fit(pd.concat([p14['Pos'], p18['Pos']], ignore_index=True))
p14['Pos'] = lbl.transform(p14['Pos'])
p18['Pos'] = lbl.transform(p18['Pos'])

In [64]:
# Count the rows per 'Nation' value in p14, most frequent first.
p14['Nation'].value_counts(sort=True)

KSA    2692
BRA     243
ALG      58
MAR      56
TUN      46
       ... 
GAB       1
SUD       1
BEN       1
LTU       1
LIB       1
Name: Nation, Length: 100, dtype: int64

In [73]:
def _encode_is_ksa(nation):
    """Return a 0/1 integer Series: 1 where the nation code is 'KSA', else 0.

    A column that is already numeric is returned unchanged. Without this
    guard, re-running the cell on the encoded column would compare the
    integers with 'KSA' and silently turn every value into 0.
    """
    if pd.api.types.is_numeric_dtype(nation):
        return nation
    return nation.eq('KSA').astype('int64')


# Collapse nationality into a binary flag: KSA (1) vs. any other nation (0).
p14['Nation'] = _encode_is_ksa(p14['Nation'])
p18['Nation'] = _encode_is_ksa(p18['Nation'])

In [56]:
# Remove the 'Squad' column from both tables.
p14 = p14.drop(columns='Squad')
p18 = p18.drop(columns='Squad')

In [74]:
# Preview the first five rows of p14.
p14.head(n=5)

Unnamed: 0,Rk,Player,Nation,Pos,Age,Born,MP,Starts,Min,90s,...,Subs,Mn/Sub,unSub,PPM,onG,onGA,+/-,+/-90,On-Off,year
0,1,Malek AlAbadalmanam,1,1,23.0,1998.0,22,8,735,8.2,...,14,14.0,2,1.32,7.0,4.0,3.0,0.37,0.64,2021
1,2,Saleh AlAbbas,1,3,27.0,1993.0,9,1,247,2.7,...,8,24.0,12,0.22,2.0,5.0,-3.0,-1.09,-0.87,2021
2,3,Hamad AlAbdan,1,3,21.0,2000.0,8,3,298,3.3,...,5,19.0,2,1.0,1.0,5.0,-4.0,-1.21,-0.35,2021
3,4,Ayoub Abdellaoui,0,0,28.0,1993.0,22,19,1598,17.8,...,3,13.0,0,0.95,19.0,25.0,-6.0,-0.34,-0.26,2021
4,5,Saud Abdulhamid,1,0,22.0,1999.0,13,10,970,10.8,...,3,31.0,0,2.77,29.0,6.0,23.0,2.13,1.51,2021


In [75]:
# Preview the first five rows of p18.
p18.head(n=5)

Unnamed: 0,Rk,Player,Nation,Pos,Age,Born,MP,Starts,Min,90s,...,+/-90,On-Off,Fls,Fld,Off,Crs,Int,TklW,OG,year
0,1,Malek AlAbadalmanam,1,1,23.0,1998.0,22,8,735,8.2,...,0.37,0.64,27,12,3,1,1,1,0,2021
1,2,Saleh AlAbbas,1,3,27.0,1993.0,9,1,247,2.7,...,-1.09,-0.87,5,7,1,1,1,1,0,2021
2,3,Hamad AlAbdan,1,3,21.0,2000.0,8,3,298,3.3,...,-1.21,-0.35,3,0,0,2,1,2,0,2021
3,4,Ayoub Abdellaoui,0,0,28.0,1993.0,22,19,1598,17.8,...,-0.34,-0.26,23,20,1,22,21,11,0,2021
4,5,Saud Abdulhamid,1,0,22.0,1999.0,13,10,970,10.8,...,2.13,1.51,20,15,0,43,7,24,0,2021


In [76]:
# Take independent copies so that column drops applied to pp14 / pp18
# do not touch p14 / p18.
pp14, pp18 = p14.copy(), p18.copy()

In [79]:
# List the columns currently present in pp14.
pp14.columns

Index(['Rk', 'Player', 'Nation', 'Pos', 'Age', 'Born', 'MP', 'Starts', 'Min',
       '90s', 'Gls', 'Ast', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls.1',
       'Ast.1', 'G+A', 'G-PK.1', 'G+A-PK', 'index.1', 'MP.1', 'Min.1', 'Mn/MP',
       'Min%', '90s.1', 'Starts.1', 'Mn/Start', 'Compl', 'Subs', 'Mn/Sub',
       'unSub', 'PPM', 'onG', 'onGA', '+/-', '+/-90', 'On-Off', 'year'],
      dtype='object')

In [81]:
# Columns removed from both working copies.
shared_drop_cols = ['Rk', 'Player', 'Born', 'Starts.1', 'Min.1', 'year']

pp18 = pp18.drop(columns=shared_drop_cols)
# pp14 additionally loses 'index.1' and '90s.1'.
pp14 = pp14.drop(columns=shared_drop_cols + ['index.1', '90s.1'])

In [82]:
# Preview the first five rows of pp18.
pp18.head(n=5)

Unnamed: 0,Nation,Pos,Age,MP,Starts,Min,90s,Gls,Ast,G-PK,...,+/-,+/-90,On-Off,Fls,Fld,Off,Crs,Int,TklW,OG
0,1,1,23.0,22,8,735,8.2,4,0,4,...,3.0,0.37,0.64,27,12,3,1,1,1,0
1,1,3,27.0,9,1,247,2.7,0,0,0,...,-3.0,-1.09,-0.87,5,7,1,1,1,1,0
2,1,3,21.0,8,3,298,3.3,0,0,0,...,-4.0,-1.21,-0.35,3,0,0,2,1,2,0
3,0,0,28.0,22,19,1598,17.8,0,0,0,...,-6.0,-0.34,-0.26,23,20,1,22,21,11,0
4,1,0,22.0,13,10,970,10.8,0,2,0,...,23.0,2.13,1.51,20,15,0,43,7,24,0


In [83]:
# Preview the first five rows of pp14.
pp14.head(n=5)

Unnamed: 0,Nation,Pos,Age,MP,Starts,Min,90s,Gls,Ast,G-PK,...,Compl,Subs,Mn/Sub,unSub,PPM,onG,onGA,+/-,+/-90,On-Off
0,1,1,23.0,22,8,735,8.2,4,0,4,...,2.0,14,14.0,2,1.32,7.0,4.0,3.0,0.37,0.64
1,1,3,27.0,9,1,247,2.7,0,0,0,...,0.0,8,24.0,12,0.22,2.0,5.0,-3.0,-1.09,-0.87
2,1,3,21.0,8,3,298,3.3,0,0,0,...,0.0,5,19.0,2,1.0,1.0,5.0,-4.0,-1.21,-0.35
3,0,0,28.0,22,19,1598,17.8,0,0,0,...,14.0,3,13.0,0,0.95,19.0,25.0,-6.0,-0.34,-0.26
4,1,0,22.0,13,10,970,10.8,0,2,0,...,9.0,3,31.0,0,2.77,29.0,6.0,23.0,2.13,1.51


In [93]:
# Columns of pp14 that are selected into X.
feature_cols = [
    'Nation', 'Pos', 'Age', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast',
    'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls.1', 'Ast.1', 'G+A',
    'G-PK.1', 'G+A-PK', 'MP.1', 'Mn/MP', 'Min%', 'Mn/Start', 'Compl',
    'Subs', 'Mn/Sub', 'unSub', 'PPM', 'onG', 'onGA', '+/-', '+/-90',
    'On-Off',
]
X = pp14[feature_cols]

# NOTE(review): y is the entire feature matrix as a 2-D NumPy array, not a
# separate target column -- confirm which column is meant to be predicted.
y = X.values

In [95]:
# Histogram of y (counts only, no KDE curve).
# sns.distplot is deprecated; sns.histplot is its replacement for histograms.
# bins=50 matches the cap distplot put on its automatic bin count and keeps
# the number of drawn bars bounded.
# NOTE(review): y is a 2-D array here (y = X.values), so each column is drawn
# as its own series -- confirm whether a single target column was intended.
sns.histplot(y, bins=50, kde=False);

KeyboardInterrupt: 