## **Supervised ML regression algorithm to predict next round team value (CT & T)**
## **Algorithm**

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

In [2]:
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 30)

### Data

In [3]:
ct_df = pd.read_csv('../data/processed/4_base_predict_next_rnd_ct_type.csv')
t_df = pd.read_csv('../data/processed/4_base_predict_next_rnd_t_type.csv')

### Encoding

We use a manual encoding to be sure of being the same values for round_type and nxt_rnd_type.

If we use LabelEncoder() it could be possible to have different values for the same label.

In [4]:
round_type_dic = {'PISTOL_ROUND':0, 'ECO':1, 'MEDIUM':2, 'FULL':3, 'LAST':4}

In [5]:
ct_df['round_type'] = ct_df['round_type'].apply(lambda x: round_type_dic[x])
ct_df['nxt_rnd_type'] = ct_df['nxt_rnd_type'].apply(lambda x: round_type_dic[x])

t_df['round_type'] = t_df['round_type'].apply(lambda x: round_type_dic[x])
t_df['nxt_rnd_type'] = t_df['nxt_rnd_type'].apply(lambda x: round_type_dic[x])

## Define features and target

In [6]:
CT_FEATS = ['file', 'round', 'wp_ct_val', 'nade_ct_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins', 'ct_val_pred', 'round_type']
T_FEATS = ['file', 'round', 'wp_t_val', 'nade_t_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins', 't_val_pred', 'round_type']
TARGET = 'nxt_rnd_type'

In [7]:
#Preprocessor

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor_ct = ColumnTransformer(transformers=[('num', numeric_transformer, CT_FEATS)])
preprocessor_t = ColumnTransformer(transformers=[('num', numeric_transformer, T_FEATS)])

### Beggining of stackoverflow solution

In [None]:
X_train, X_test, y_train, y_test = train_test_split(ct_df[CT_FEATS], ct_df[TARGET])


# To obtain the best hyperparameters of our model, we test a wider range of values and then 
# we reduce until obtaining close but different values, in order to obtain the best results.

param_grid = {
    'regressor__num_leaves': [47,48,49,50,51,52],
    'regressor__n_estimators': [350,360,370,380,390,400,410,420,430],
    'regressor__min_data_in_leaf': [4,5,6,7,8,12,15],
    'regressor__max_depth': [5,6,7],
    'regressor__learning_rate': [0.02,0.03,0.04,0.045,0.05,0.055,0.06],
    'regressor__feature_fraction': [0.75,0.8,0.85,0.9,0.95,1],
    'regressor__bagging_frequency': [0.75,0.80,0.85,0.9,0.95,1],
    'regressor__bagging_fraction': [0.75,0.8,0.85,0.9,0.95,1],
}


regressor = LGBMClassifier()  # better without this (boosting_type='rf', bagging_freq=1, bagging_fraction = 0.9, n_estimators=100)

model = Pipeline(steps=[('ct_preprocessor', preprocessor_ct),
                           ('regressor', regressor)])

# MODEL --------------------------------------

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=5, 
                                 scoring='accuracy', 
                                 n_jobs=7,
                                 n_iter=20)


grid_search.fit(X_train, y_train)


# model.fit(X_train, y_train);


# sk_report = classification_report(
#     digits=6,
#     y_true=y_test, 
#     y_pred=model.predict(X_test))
# print(sk_report)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.


In [None]:
#Once we have trained the model with different hyperparameter values, 
#we see which parameters have obtained the best score:
grid_search.best_params_

In [None]:
# see the best score that we have trained:
grid_search.best_score_

| Parameter | Value |
|----------|---------------|
| regressor__num_leaves | 48 |
| regressor__n_estimators | 430 |
| regressor__min_data_in_leaf | 7 |
| regressor__max_depth | 7 |
| regressor__learning_rate | 0.055 |
| regressor__feature_fraction | 0.75 |
| regressor__bagging_frequency | 1 |
| regressor__bagging_fraction | 1 |
| RESULT | 0.6447327118410268 |



In [None]:
# model.score(X_test, y_test)

| Algorithm | Score |
|----------|---------------|
| SGDClassifier | 0.5366519098115927 |
| RandomForestClassifier | 0.5304102748893236 |
| LinearSVC | 0.5668691444455883 |
| LogisticRegression | 0.5623648718212705 |
| SVC | 000 |
| KNeighborsClassifier | 0.5497271697724699 |
| LGBMClassifier | 0.644432719036343 |
| GradientBoostingClassifier | 0.644432719036343 |
| AdaBoostClassifier | 0.5766241120148254 |
| ExtraTreesClassifier | 0.6142669618037682 |

                  precision    recall  f1-score   support

               0   0.998375  0.908311  0.951216      3381
               1   0.541842  0.542639  0.542240     16335
               2   0.468735  0.065367  0.114733     14564
               3   0.667474  0.905000  0.768298     40516
               4   0.529908  0.237620  0.328110      2908

        accuracy                       0.646621     77704
       macro avg   0.641267  0.531787  0.540919     77704
    weighted avg   0.613064  0.646621  0.589764     77704



                  precision    recall  f1-score   support

               0   0.997674  0.911631  0.952713      3293
               1   0.538167  0.541853  0.540004     16355
               2   0.456726  0.059926  0.105951     14618
               3   0.665389  0.906518  0.767459     40393
               4   0.555121  0.233169  0.328400      3045

        accuracy                       0.644330     77704
       macro avg   0.642615  0.530619  0.538905     77704
    weighted avg   0.609118  0.644330  0.585784     77704

