## **Supervised ML classification algorithm to predict the next-round winner team (CT or T)**
## **Algorithm**

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

In [2]:
# Cap DataFrame rendering at 30 columns and 30 rows.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 30

### Data

In [3]:
# Location of the CSV that is loaded into `df`, relative to the notebook.
DATA_PATH = '../data/processed/5_1_base_predict_winner.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,file,round,wp_ct_val,wp_t_val,nade_ct_val,nade_t_val,ct_alive,t_alive,prev_ct_winner,ct_winner,prev_bomb_planted,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_pred,t_val_pred,ct_round_type,t_round_type,ct_nxt_rnd_type_pred,t_nxt_rnd_type_pred,nxt_ct_winner
0,0,1,1000.0,1166.666667,550,1200,5,5,0.5,1,0.5,0,0,0,4078.134589,3943.272665,0,0,2,1,0
1,0,2,10100.0,3687.5,1100,50,4,0,1.0,0,0.0,0,1,0,17819.702711,6290.616771,2,1,3,1,1
2,0,3,4125.0,11700.0,900,2450,0,1,0.0,0,0.0,1,0,1,7038.468589,19600.790638,2,3,1,3,1
3,0,4,1000.0,11700.0,0,1600,0,3,0.0,0,1.0,1,0,2,1452.468928,22568.098741,1,2,3,3,0
4,0,5,15500.0,12750.0,1400,1700,0,4,0.0,1,1.0,0,0,3,22676.205763,24459.855175,3,3,1,3,0


#### We are going to predict <code>nxt_ct_winner</code>, which is 1 if CT wins the next round and 0 if T wins it

## Define features and target

In [5]:
# Show the column names of `df`; the feature list and target are picked from these.
df.columns

Index(['file', 'round', 'wp_ct_val', 'wp_t_val', 'nade_ct_val', 'nade_t_val',
       'ct_alive', 't_alive', 'prev_ct_winner', 'ct_winner',
       'prev_bomb_planted', 'bomb_planted', 'ct_cons_wins', 't_cons_wins',
       'ct_val_pred', 't_val_pred', 'ct_round_type', 't_round_type',
       'ct_nxt_rnd_type_pred', 't_nxt_rnd_type_pred', 'nxt_ct_winner'],
      dtype='object')

In [6]:
# Column the classifier is trained to predict.
TARGET = 'nxt_ct_winner'

# Model inputs: every column of `df` other than TARGET, in the frame's own order.
# NOTE(review): 'file' looks like an identifier of the source file/match rather
# than a predictive signal -- confirm it should stay in the feature set.
FEATS = [
    'file',
    'round',
    'wp_ct_val',
    'wp_t_val',
    'nade_ct_val',
    'nade_t_val',
    'ct_alive',
    't_alive',
    'prev_ct_winner',
    'ct_winner',
    'prev_bomb_planted',
    'bomb_planted',
    'ct_cons_wins',
    't_cons_wins',
    'ct_val_pred',
    't_val_pred',
    'ct_round_type',
    't_round_type',
    'ct_nxt_rnd_type_pred',
    't_nxt_rnd_type_pred',
]

## Split

In [7]:
# Hold out a test partition (scikit-learn's default size: 25% of the rows).
# `random_state` pins the shuffle so that every algorithm tried in this notebook
# is scored on the same rows and the results are repeatable across runs;
# `stratify` keeps the 0/1 target ratio the same in both partitions.
# NOTE(review): rows sharing the same `file` value can land in both partitions;
# if `file` identifies a match, consider a group-aware split -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATS],
    df[TARGET],
    random_state=42,
    stratify=df[TARGET],
)

## Algorithm selection

In [8]:
# Preprocessor: standardise the feature columns before they reach the estimator.

# Single-step pipeline that applies StandardScaler.
scaling_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)

# One 'num' branch that routes every column listed in FEATS through the scaler.
column_branches = [('num', numeric_transformer, FEATS)]
preprocessor = ColumnTransformer(transformers=column_branches)

In [9]:
# Candidate estimators tried so far -- enable exactly one at a time. The score
# obtained with each is recorded in the results table at the end of this section.
# regressor = RandomForestClassifier(max_depth=2, random_state=0)
# regressor = SGDClassifier(early_stopping=False, loss='log')  # NOTE: newer scikit-learn releases spell this loss 'log_loss'
# regressor = LinearSVC()
# regressor = LogisticRegression()
# regressor = SVC(gamma='auto', probability=True) # Too slooooow, I did not get a result
# regressor = KNeighborsClassifier()
# regressor = LGBMClassifier()  # better without this (boosting_type='rf', bagging_freq=1, bagging_fraction = 0.9, n_estimators=100)
# regressor = GradientBoostingClassifier(n_estimators=100) # Same as LGBM but one-thread processing, too slow
# regressor = AdaBoostClassifier(n_estimators=100)

# Active estimator. `random_state` makes the randomised trees (and therefore the
# reported metrics) repeatable; `n_jobs=-1` builds the trees on all CPU cores.
# The variable and step are still called `regressor` although this is a
# classifier: the names are kept so code that addresses the step by name
# (e.g. 'regressor__<param>' in a parameter search) keeps working.
regressor = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Full pipeline: scale the features, then classify.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', regressor)])

# Trailing semicolon suppresses the fitted-pipeline repr in the cell output.
model.fit(X_train, y_train);

In [10]:
# Hard class labels and per-class probabilities for the held-out rows.
y_pred, y_prob = model.predict(X_test), model.predict_proba(X_test)

In [11]:
# Per-class precision, recall and F1 on the test set, formatted with six decimals.
sk_report = classification_report(y_test, y_pred, digits=6)
print(sk_report)

              precision    recall  f1-score   support

           0   0.597914  0.694247  0.642490     43267
           1   0.518350  0.413422  0.459978     34437

    accuracy                       0.569790     77704
   macro avg   0.558132  0.553834  0.551234     77704
weighted avg   0.562653  0.569790  0.561604     77704



In [12]:
# ROC AUC on the test set, scored with column 1 of the predicted probabilities.
roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

0.588826622540981

| Algorithm | Accuracy Score | roc-auc-score |
|----------|---------------|------------------|
| SGDClassifier | 0.578760 | 0.5911053042759113 |
| RandomForestClassifier | 0.571669 | 0.6005791010926799 |
| LinearSVC | 0.583278 | no proba |
| LogisticRegression | 0.583175 | 0.5963322313284364 |
| SVC | Too slow, not finished |  |
| KNeighborsClassifier | 0.548157 | 0.555818468720543 |
| <font color='green'>LGBMClassifier</font> | <font color='green'>0.591952</font> | 0.6226857960041758 |
| GradientBoostingClassifier | 0.590793 | 0.6215877286341829 |
| AdaBoostClassifier | 0.583149 | 0.6086323297666735 |
| ExtraTreesClassifier | 0.569790 | 0.588826622540981 |