## **Supervised ML classification algorithm to predict next round winner team (CT & T)**
## **Algorithm**

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from joblib import dump

In [2]:
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 30)

### Data

In [3]:
df = pd.read_csv('../data/processed/5_1_base_predict_winner.csv')

In [4]:
df.head()

Unnamed: 0,file,round,wp_ct_val,wp_t_val,nade_ct_val,nade_t_val,ct_alive,t_alive,prev_ct_winner,ct_winner,prev_bomb_planted,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_pred,t_val_pred,ct_round_type,t_round_type,ct_nxt_rnd_type_pred,t_nxt_rnd_type_pred,nxt_ct_winner
0,0,1,1000.0,1166.666667,550,1200,5,5,0.5,1,0.5,0,0,0,4078.134589,3943.272665,0,0,2,1,0
1,0,2,10100.0,3687.5,1100,50,4,0,1.0,0,0.0,0,1,0,17819.702711,6290.616771,2,1,3,1,1
2,0,3,4125.0,11700.0,900,2450,0,1,0.0,0,0.0,1,0,1,7038.468589,19600.790638,2,3,1,3,1
3,0,4,1000.0,11700.0,0,1600,0,3,0.0,0,1.0,1,0,2,1452.468928,22568.098741,1,2,3,3,0
4,0,5,15500.0,12750.0,1400,1700,0,4,0.0,1,1.0,0,0,3,22676.205763,24459.855175,3,3,1,3,0


#### We are going to predict <code>nxt_ct_winner</code> whose values are 1 if CT is the winner team or 0 if T is the winner team

## Define features and target

In [5]:
df.columns

Index(['file', 'round', 'wp_ct_val', 'wp_t_val', 'nade_ct_val', 'nade_t_val',
       'ct_alive', 't_alive', 'prev_ct_winner', 'ct_winner',
       'prev_bomb_planted', 'bomb_planted', 'ct_cons_wins', 't_cons_wins',
       'ct_val_pred', 't_val_pred', 'ct_round_type', 't_round_type',
       'ct_nxt_rnd_type_pred', 't_nxt_rnd_type_pred', 'nxt_ct_winner'],
      dtype='object')

In [6]:
FEATS = ['file', 'round', 'wp_ct_val', 'wp_t_val', 'nade_ct_val', 'nade_t_val',
       'ct_alive', 't_alive', 'prev_ct_winner', 'ct_winner',
       'prev_bomb_planted', 'bomb_planted', 'ct_cons_wins', 't_cons_wins',
       'ct_val_pred', 't_val_pred', 'ct_round_type', 't_round_type',
       'ct_nxt_rnd_type_pred', 't_nxt_rnd_type_pred']
TARGET = 'nxt_ct_winner'

## Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df[FEATS], df[TARGET])

## Algorithm election

In [7]:
#Preprocessor

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, FEATS)])

In [9]:
# regressor = RandomForestClassifier(max_depth=2, random_state=0)
# regressor = SGDClassifier(early_stopping=False, loss='log')
# regressor = LinearSVC()
# regressor = LogisticRegression()
# regressor = SVC(gamma='auto', probability=True) # Too slooooow, I did not get a result
# regressor = KNeighborsClassifier()
regressor = LGBMClassifier()  # better without this (boosting_type='rf', bagging_freq=1, bagging_fraction = 0.9, n_estimators=100)
# regressor = GradientBoostingClassifier(n_estimators=100) # Same as LGBM but one-thread processing, too slow
# regressor = AdaBoostClassifier(n_estimators=100)
# regressor = ExtraTreesClassifier(n_estimators=100)
# model = RandomForestClassifier(max_depth=2, random_state=0)
# model = SGDClassifier(early_stopping=True, loss='hinge')

model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])

model.fit(X_train, y_train);

In [10]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [11]:
sk_report = classification_report(
    digits=6,
    y_true=y_test, 
    y_pred=y_pred)
print(sk_report)

              precision    recall  f1-score   support

           0   0.609173  0.738845  0.667772     43074
           1   0.558187  0.410396  0.473016     34630

    accuracy                       0.592466     77704
   macro avg   0.583680  0.574620  0.570394     77704
weighted avg   0.586450  0.592466  0.580976     77704



In [12]:
roc_auc_score(y_test, y_prob[:,1])

0.6197619449761701

| Algorithm | Accuracy Score | roc-auc-score |
|----------|---------------|------------------|
| SGDClassifier | 0.578760 | 0.5911053042759113 |
| RandomForestClassifier | 0.571669 | 0.6005791010926799 |
| LinearSVC | 0.583278 | no proba |
| LogisticRegression | 0.583175 | 0.5963322313284364 |
| SVC | Too slow, not finished |  |
| KNeighborsClassifier | 0.548157 | 0.555818468720543 |
| <font color='green'>LGBMClassifier</font> | <font color='green'>0.591952</font> | <font color='green'>0.6226857960041758</font> |
| GradientBoostingClassifier | 0.590793 | 0.6215877286341829 |
| AdaBoostClassifier | 0.583149 | 0.6086323297666735 |
| ExtraTreesClassifier | 0.569790 | 0.588826622540981 |

#### From the experimentation of various algorithms we conclude to use finally **LGBMClassifier** again

## Grid search to hyperparameter tuning

In [13]:
# To obtain the best hyperparameters of our model, we test a wider range of values and then 
# we reduce until obtaining close but different values, in order to obtain the best results.

param_grid = {
    'regressor__num_leaves': [47,48,49,50,51,52],
    'regressor__n_estimators': [350,360,370,380,390,400,410,420,430],
    'regressor__min_data_in_leaf': [4,5,6,7,8,12,15],
    'regressor__max_depth': [5,6,7],
    'regressor__learning_rate': [0.02,0.03,0.04,0.045,0.05,0.055,0.06],
    'regressor__feature_fraction': [0.75,0.8,0.85,0.9,0.95,1],
    'regressor__bagging_frequency': [0.75,0.80,0.85,0.9,0.95,1],
    'regressor__bagging_fraction': [0.75,0.8,0.85,0.9,0.95,1],
}


regressor = LGBMClassifier()  

model = Pipeline(steps=[('t_preprocessor', preprocessor),
                           ('regressor', regressor)])

# MODEL --------------------------------------

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=5, 
                                 scoring='accuracy', 
                                 n_jobs=-1,
                                 n_iter=500)


grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 42.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 53.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 66.3min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 80.0min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 92.3min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('t_preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('scaler',
                                                                                                StandardScaler())]),
                                                                               ['file',
                                                                                'round',
                                                                                'wp_ct_val',
                                                                                'wp_t_val',
                                                                                'nade_ct_val',
                                                                                'nade_t_val',
                                                   

In [14]:
#Once we have trained the model with different hyperparameter values, 
#we see which parameters have obtained the best score:
grid_search.best_params_

{'regressor__num_leaves': 49,
 'regressor__n_estimators': 360,
 'regressor__min_data_in_leaf': 5,
 'regressor__max_depth': 5,
 'regressor__learning_rate': 0.02,
 'regressor__feature_fraction': 0.8,
 'regressor__bagging_frequency': 0.8,
 'regressor__bagging_fraction': 1}

In [15]:
# see the best score that we have trained:
grid_search.best_score_

0.5931903135054399


| Parameter | Value |
|----------|---------------|
| regressor__num_leaves | 49 |
| regressor__n_estimators | 360 |
| regressor__min_data_in_leaf | 5 |
| regressor__max_depth | 5 |
| regressor__learning_rate | 0.02 |
| regressor__feature_fraction | 0.8 |
| regressor__bagging_frequency | 0.8 |
| regressor__bagging_fraction | 1 |
| RESULT | 0.5931903135054399 |

### Let's look the diference between by default and hyperparameter tuning for LGBMClassifier

In [8]:
#Preprocessor

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, FEATS)])

In [9]:
regressor = LGBMClassifier()

# regressor = LGBMClassifier(num_leaves=49,
#                           n_estimators=360,
#                           min_data_in_leaf=5,
#                           max_depth=5,
#                           learning_rate=0.02,
#                           feature_fraction=0.8,
#                           bagging_frequency=0.8,
#                           bagging_fraction=1)

# regressor = LGBMClassifier(num_leaves=50,
#                           n_estimators=350,
#                           min_data_in_leaf=15,
#                           max_depth=7,
#                           learning_rate=0.055,
#                           feature_fraction=0.8,
#                           bagging_frequency=0.75,
#                           bagging_fraction=0.75)

model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])

model.fit(X_train, y_train);

In [10]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [11]:
sk_report = classification_report(
    digits=6,
    y_true=y_test, 
    y_pred=y_pred)
print(sk_report)

              precision    recall  f1-score   support

           0   0.609604  0.742644  0.669580     43127
           1   0.558951  0.406802  0.470892     34577

    accuracy                       0.593200     77704
   macro avg   0.584278  0.574723  0.570236     77704
weighted avg   0.587064  0.593200  0.581167     77704



In [12]:
roc_auc_score(y_test, y_prob[:,1])

0.623211161615989

| Algorithm | Accuracy Score | roc-auc-score |
|----------|---------------|------------------|
| <font color='green'>LGBMC default</font> | <font color='green'>0.593200</font> | <font color='green'>0.623211161615989</font> |
| LGBMC hyp. gridsearch | 0.591861 | 0.6222019109276811 |
| LGBMC hyp. 4_3 | 0.592711 | 0.624584921501874 |

After comparing the performance of LGBMClassifier with the hyperparameters obtained from the grid search, and the parameters of the 4_3_ml_algorithm we saw that it reaches a better score with the **default parameters**.

Maybe we are not making an accurate grid search

### **TRAIN MODEL WITH FULL DATASET & SAVE**

In [8]:
regressor = LGBMClassifier()

model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])

model.fit(df[FEATS], df[TARGET]);

nxt_ct_winner_pred = model.predict(df[FEATS])

In [9]:
dump(model, '../models/nxt_ct_winner.joblib') # Save model

['../models/nxt_ct_winner.joblib']

In [10]:
df['nxt_ct_winner_pred'] = nxt_ct_winner_pred

In [11]:
df.to_csv('../data/results/predicted_nxt_ct_winner.csv', index=False) # Export the DataFrame with the prediction column added

In [12]:
df.head()

Unnamed: 0,file,round,wp_ct_val,wp_t_val,nade_ct_val,nade_t_val,ct_alive,t_alive,prev_ct_winner,ct_winner,prev_bomb_planted,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_pred,t_val_pred,ct_round_type,t_round_type,ct_nxt_rnd_type_pred,t_nxt_rnd_type_pred,nxt_ct_winner,nxt_ct_winner_pred
0,0,1,1000.0,1166.666667,550,1200,5,5,0.5,1,0.5,0,0,0,4078.134589,3943.272665,0,0,2,1,0,0
1,0,2,10100.0,3687.5,1100,50,4,0,1.0,0,0.0,0,1,0,17819.702711,6290.616771,2,1,3,1,1,0
2,0,3,4125.0,11700.0,900,2450,0,1,0.0,0,0.0,1,0,1,7038.468589,19600.790638,2,3,1,3,1,1
3,0,4,1000.0,11700.0,0,1600,0,3,0.0,0,1.0,1,0,2,1452.468928,22568.098741,1,2,3,3,0,0
4,0,5,15500.0,12750.0,1400,1700,0,4,0.0,1,1.0,0,0,3,22676.205763,24459.855175,3,3,1,3,0,0
