Linear and logistic regression are useful for:
* Analyzing the relationships between data (coefficients)
* Making predictions 

Random Forest is flexible and makes fewer assumptions about your data. It is great for predictions, but not useful for analyzing relationships between variables. Theory on page 245.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from patsy import dmatrices
from pandas import DataFrame, Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from os import path

# Directory (relative to the working directory) that holds the data files
DATA_DIR = '../resources/code-soccer-files-main/data'

# Build the CSV path first, then load it into the DataFrame used by every later cell
player_match_csv = path.join(DATA_DIR, 'player_match.csv')
df = pd.read_csv(player_match_csv)

In [2]:
# Goal: classify the player position (`pos`) from the feature columns in `xvars`.
xvars = ['shot', 'goal', 'assist', 'pass', 'pass_accurate', 'tackle', 'accel',
        'counter', 'opportunity', 'keypass', 'own_goal', 'interception',
        'smart', 'clearance', 'cross', 'air_duel', 'air_duel_won',
        'gk_leave_line', 'gk_save_attempt', 'throw', 'corner', 'started']

yvar = 'pos'

# Quick look at the model inputs and at the class balance of the target.
# NOTE: a notebook only renders the last expression of a cell, so these two
# previews are evaluated but not shown unless they are run in their own cell.
df[xvars + [yvar]].head()
df[yvar].value_counts(normalize=True)

# Split into two sets - 80% for learning, 20% held out for testing.
# random_state is fixed so the split (and every number derived from it)
# is the same after Restart & Run All.
train, test = train_test_split(df, test_size=0.20, random_state=42)

# n_estimators = number of different trees the forest builds;
# random_state pins the bootstrap samples / feature draws for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train[xvars], train[yvar])


In [5]:
# Checking how the model does on the holdout set:
# predict a position for every test row, flag the rows where the prediction
# matches the true label, and report the share of matches (accuracy).
predicted_pos = model.predict(test[xvars])
test['pos_hat'] = predicted_pos
test['correct'] = test['pos_hat'].eq(test[yvar])
test['correct'].mean()


0.7134328358208956

In [8]:
# Run each of our test samples through the forest: predict_proba returns one
# row per sample and one column per class, each value being the mean of the
# class probabilities predicted by the individual trees.
# (Computed once here; the result goes straight into a DataFrame.)
probs = DataFrame(model.predict_proba(test[xvars]),
                  index=test.index,        # must have the same index as test so the rows line up later
                  columns=model.classes_)  # column labels in the model's own class order
probs.head()


Unnamed: 0,DEF,FWD,GKP,MID
802,0.19,0.25,0.0,0.56
300,0.22,0.43,0.0,0.35
1050,0.49,0.04,0.0,0.47
639,0.41,0.07,0.0,0.52
427,0.87,0.01,0.04,0.08


In [11]:
# Comparing predictions to the actual position: put the identifying columns,
# the true/predicted labels and the per-class probabilities side by side
# (rows are aligned on the shared index).
results = pd.concat([
    test[['name', 'team', 'pos', 'pos_hat', 'correct']],
    probs], axis=1)

# Fixed random_state so the same 10 example rows are shown on every run.
print(results.sample(10, random_state=42).round(2))
print()
# Per true position: accuracy and the average probability given to each class.
results.groupby('pos')[['correct', 'FWD', 'MID', 'DEF', 'GKP']].mean()

               name      team  pos pos_hat  correct   DEF   FWD   GKP   MID
1315   M. Mandžukić   Croatia  FWD     DEF    False  0.48  0.27  0.00  0.25
1330    A. Kramarić   Croatia  FWD     MID    False  0.08  0.21  0.02  0.69
1075      C. N'Doye   Senegal  MID     FWD    False  0.19  0.43  0.01  0.37
1207       D. Sakho   Senegal  FWD     FWD     True  0.07  0.53  0.00  0.40
237    Adrien Silva  Portugal  MID     DEF    False  0.43  0.36  0.01  0.20
715   N. Milenković    Serbia  DEF     DEF     True  0.98  0.00  0.00  0.02
85        L. Suárez   Uruguay  FWD     FWD     True  0.02  0.72  0.00  0.26
1622      R. Lukaku   Belgium  FWD     FWD     True  0.08  0.52  0.02  0.38
1466     J. Giménez   Uruguay  DEF     DEF     True  0.57  0.17  0.00  0.26
1088    D. Kownacki    Poland  FWD     FWD     True  0.00  0.60  0.00  0.40



Unnamed: 0_level_0,correct,FWD,MID,DEF,GKP
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DEF,0.787879,0.060803,0.277603,0.659473,0.002121
FWD,0.532468,0.504688,0.397147,0.095308,0.002857
GKP,1.0,0.014516,0.047097,0.03129,0.907097
MID,0.695312,0.258157,0.524793,0.205175,0.011875


In [13]:
# Cross validation - reduces noise by scoring on multiple holdout sets (cv=10 folds)
# and blending the results together.
# random_state is fixed so the fold scores are reproducible across runs.
# NOTE: this rebinds `model` to a new estimator that is never fitted in this
# cell (cross_val_score works on clones), so it cannot be used for predict()
# until it is fitted again.
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, df[xvars], df[yvar], cv=10)
print(scores)
scores.mean()

[0.78571429 0.73652695 0.7245509  0.69461078 0.73652695 0.64670659
 0.77844311 0.73053892 0.66467066 0.68862275]


0.7186911890504705

In [14]:
# Feature importance (which inputs made the biggest difference in splitting correctly).
# random_state is fixed so the ranking is reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(df[xvars], df[yvar])  # running model fitting on the entire dataset
# Label each importance with its feature name and show the largest first.
Series(model.feature_importances_, index=xvars).sort_values(ascending=False)

pass_accurate      0.116191
pass               0.115827
throw              0.104585
clearance          0.081634
air_duel           0.073670
interception       0.072313
gk_leave_line      0.050324
counter            0.048219
air_duel_won       0.047913
shot               0.043471
gk_save_attempt    0.043092
cross              0.041080
opportunity        0.033653
accel              0.029428
tackle             0.024206
corner             0.023893
keypass            0.017293
started            0.017133
goal               0.009751
assist             0.005339
own_goal           0.000985
smart              0.000000
dtype: float64