# Importer les bibliothéques

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Importer dataset

In [37]:
dt=pd.read_csv("./dataCleaned.csv")

In [59]:
train=dt[dt["season"]<2019]

In [60]:
test=dt[dt["season"]>=2019]

In [61]:
x_train=train.drop(["Result_code"],axis=1)
y_train=train["Result_code"]
x_test=test.drop(["Result_code"],axis=1)
y_test=test["Result_code"]


In [62]:
x_train=x_train.drop(["Unnamed: 0"],axis=1)
x_test=x_test.drop(["Unnamed: 0"],axis=1)

In [96]:
x_train

Unnamed: 0,team_code,Opponent_code,season,Round_code,Venue_code,Referee_code,Formation_code,Day_code,GF,GF_rolling,GA,GA_rolling
0,0,5,2010,32,1,41,19,5,4,3.000000,1,0.666667
1,0,31,2010,33,0,35,12,5,1,4.000000,1,0.666667
2,0,35,2010,34,1,25,12,5,2,2.333333,3,1.000000
3,0,11,2010,35,0,27,19,6,0,2.333333,2,1.666667
4,0,2,2010,36,1,24,19,5,2,1.000000,1,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
9606,38,8,2018,28,1,6,7,5,0,1.000000,0,2.000000
9607,38,0,2018,24,1,41,7,2,3,1.000000,1,1.333333
9608,38,34,2018,29,0,40,7,5,2,1.333333,1,1.333333
9609,38,14,2018,30,1,15,7,5,1,1.666667,0,0.666667


# Training

## Random Forest

In [97]:
rf=RandomForestClassifier(n_estimators=50,min_samples_split=10,random_state=1)

In [98]:
rf.fit(x_train,y_train)

In [99]:
predict_rf=rf.predict(x_test)

### Evaluation

#### Accuracy 

In [100]:
from sklearn.metrics import accuracy_score

In [101]:
acc=accuracy_score(y_test,predict_rf)

In [102]:
acc

0.9983487450462352

## SVM

In [103]:
from sklearn.svm import SVC

In [104]:
svm = SVC(kernel='linear', C=1.0)
svm.fit(x_train, y_train)

In [105]:
pred_svm = svm.predict(x_test)

### Evaluation

#### Accuracy

In [106]:
acc_svm=accuracy_score(y_test,pred_svm)

In [107]:
acc_svm

1.0

## XGBoost

In [75]:
from xgboost import XGBClassifier

In [76]:
model = XGBClassifier()
model.fit(x_train, y_train)


In [77]:
predXgb = model.predict(x_test)

### Evaluation

#### Accuracy

In [78]:
acc_xg=accuracy_score(y_test,predXgb)

In [79]:
acc_xg

0.9993394980184941

## Light GBM

In [80]:
import lightgbm as lgb

In [81]:
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
params = {
    'objective': 'multiclass',
    'num_class': 3,  # Number of classes in the target variable
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}


In [82]:
num_round = 100
model = lgb.train(params, train_data, num_round, valid_sets=[test_data])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 227
[LightGBM] [Info] Number of data points in the train set: 6735, number of used features: 12
[LightGBM] [Info] Start training from score -1.404121
[LightGBM] [Info] Start training from score -0.977519
[LightGBM] [Info] Start training from score -0.972402


In [83]:
y_pred_prob = model.predict(x_test, num_iteration=model.best_iteration)
predGBM = np.argmax(y_pred_prob, axis=1)

### Evaluation

#### Accurracy

In [84]:
accuracy = accuracy_score(y_test, predGBM)

In [85]:
accuracy

0.9993394980184941

## CatBoost

In [86]:
from catboost import CatBoostClassifier


In [87]:
cat_features = None
y_train_cat = y_train.astype(str)
y_test_cat = y_test.astype(str)

In [88]:
model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass')

In [89]:
model.fit(x_train, y_train_cat, cat_features=cat_features, verbose=10)

0:	learn: 0.9164120	total: 4.14ms	remaining: 410ms
10:	learn: 0.2890304	total: 40.2ms	remaining: 325ms
20:	learn: 0.1197295	total: 80.4ms	remaining: 302ms
30:	learn: 0.0581303	total: 120ms	remaining: 267ms
40:	learn: 0.0314797	total: 151ms	remaining: 217ms
50:	learn: 0.0205499	total: 182ms	remaining: 175ms
60:	learn: 0.0136599	total: 214ms	remaining: 137ms
70:	learn: 0.0098498	total: 254ms	remaining: 104ms
80:	learn: 0.0079647	total: 290ms	remaining: 67.9ms
90:	learn: 0.0066220	total: 322ms	remaining: 31.9ms
99:	learn: 0.0052728	total: 356ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x157c8017640>

In [90]:
predCat = model.predict(x_test)

### Evaluation

#### Accurracy

In [91]:
accuracy = accuracy_score(y_test_cat, predCat)

In [92]:
accuracy

1.0

In [94]:
train

Unnamed: 0.1,Unnamed: 0,team_code,Opponent_code,season,Round_code,Venue_code,Referee_code,Formation_code,Day_code,Result_code,GF,GF_rolling,GA,GA_rolling
0,3,0,5,2010,32,1,41,19,5,2,4,3.000000,1,0.666667
1,4,0,31,2010,33,0,35,12,5,0,1,4.000000,1,0.666667
2,5,0,35,2010,34,1,25,12,5,1,2,2.333333,3,1.000000
3,6,0,11,2010,35,0,27,19,6,1,0,2.333333,2,1.666667
4,7,0,2,2010,36,1,24,19,5,2,2,1.000000,1,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9606,6835,38,8,2018,28,1,6,7,5,0,0,1.000000,0,2.000000
9607,6836,38,0,2018,24,1,41,7,2,2,3,1.000000,1,1.333333
9608,6837,38,34,2018,29,0,40,7,5,2,2,1.333333,1,1.333333
9609,6838,38,14,2018,30,1,15,7,5,2,1,1.666667,0,0.666667


In [95]:
test

Unnamed: 0.1,Unnamed: 0,team_code,Opponent_code,season,Round_code,Venue_code,Referee_code,Formation_code,Day_code,Result_code,GF,GF_rolling,GA,GA_rolling
339,6840,0,23,2019,0,0,24,12,6,2,1,1.333333,0,1.666667
340,6841,0,9,2019,11,1,27,12,5,2,2,1.666667,1,0.666667
341,6842,0,19,2019,22,0,2,14,5,1,1,2.000000,3,0.666667
342,6843,0,33,2019,32,1,24,16,6,0,2,1.333333,2,1.333333
343,6844,0,34,2019,33,0,2,8,6,0,2,1.666667,2,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9758,9875,38,8,2022,27,0,9,18,5,1,0,1.666667,6,0.666667
9759,9876,38,1,2022,28,1,41,18,5,2,1,1.000000,0,2.666667
9760,9877,38,21,2022,29,0,13,18,5,1,0,1.000000,2,2.000000
9761,9878,38,13,2022,30,1,9,18,5,0,1,0.333333,1,2.666667
