## **Supervised ML classification algorithm to predict next round type (CT & T)**
## **Algorithm**

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

In [2]:
# Cap how many columns and rows pandas renders when a frame is displayed.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 30)

### Data

In [3]:
# Load the two processed datasets (one CSV per side, CT and T) in one pass.
ct_df, t_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/4_base_predict_next_rnd_ct_type.csv',
        '../data/processed/4_base_predict_next_rnd_t_type.csv',
    )
)

### Encoding

We use a manual encoding to make sure that `round_type` and `nxt_rnd_type` share exactly the same integer codes.

If we fitted a separate `LabelEncoder()` on each column, the same label could end up with a different integer code in each column.

In [4]:
# Fixed label -> integer code mapping, shared by `round_type` and `nxt_rnd_type`.
round_type_dic = {
    'PISTOL_ROUND': 0,
    'ECO': 1,
    'MEDIUM': 2,
    'FULL': 3,
    'LAST': 4,
}

In [5]:
# Replace the round-type labels with their integer codes in both frames.
# The lookup is strict: a label missing from `round_type_dic` raises KeyError.
for side_df in (ct_df, t_df):
    for type_col in ('round_type', 'nxt_rnd_type'):
        side_df[type_col] = side_df[type_col].apply(lambda label: round_type_dic[label])

## Define features and target

In [6]:
# Columns that appear, in the same position, in both feature lists.
_SHARED_STATE_FEATS = ['ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins']

# Feature columns fed to the CT model and to the T model respectively.
CT_FEATS = ['file', 'round', 'wp_ct_val', 'nade_ct_val', *_SHARED_STATE_FEATS, 'ct_val_pred', 'round_type']
T_FEATS = ['file', 'round', 'wp_t_val', 'nade_t_val', *_SHARED_STATE_FEATS, 't_val_pred', 'round_type']

# Column the models are trained to predict.
TARGET = 'nxt_rnd_type'

In [7]:
# Preprocessing: every selected feature column is treated as numeric and
# standardised with StandardScaler.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One ColumnTransformer per side, each restricted to that side's feature list.
preprocessor_ct, preprocessor_t = (
    ColumnTransformer(transformers=[('num', numeric_transformer, side_feats)])
    for side_feats in (CT_FEATS, T_FEATS)
)

### Beginning of Stack Overflow solution

In [None]:
# Quick baseline on the CT data: hold out a test split, fit ONE candidate
# classifier behind the shared CT preprocessor, and score it in the next cell.
#
# The split is seeded and stratified on the target so that the scores of the
# different candidates can be compared from run to run (an unseeded split gives
# every candidate a different test set).
X_train, X_test, y_train, y_test = train_test_split(
    ct_df[CT_FEATS],
    ct_df[TARGET],
    stratify=ct_df[TARGET],
    random_state=0,
)

# Candidate estimators -- keep exactly one assignment uncommented.
# The name `regressor` is kept for compatibility with the pipeline step name,
# but every candidate here is a classifier.
# regressor = RandomForestClassifier(max_depth=2, random_state=0)
# regressor = SGDClassifier(early_stopping=True, loss='hinge')
# regressor = LinearSVC()
# regressor = LogisticRegression()
# `probability=True` was dropped: this cell only needs `predict`/`score`, and
# enabling it makes scikit-learn run an extra internal cross-validation to
# calibrate probabilities, which slows `fit` down considerably. Re-enable it
# only if `predict_proba` is required.
regressor = SVC(gamma='auto')
# regressor = KNeighborsClassifier()
# regressor = LGBMClassifier()  # better without this (boosting_type='rf', bagging_freq=1, bagging_fraction = 0.9, n_estimators=100)
# regressor = GradientBoostingClassifier(n_estimators=100) # Same as LGBM but one-thread processing, too slow
# regressor = AdaBoostClassifier(n_estimators=100)
# regressor = ExtraTreesClassifier(n_estimators=100)

# Scale the CT features, then classify.
model = Pipeline(steps=[('ct_preprocessor', preprocessor_ct),
                        ('regressor', regressor)])

model.fit(X_train, y_train);

# Optional per-class report (precision / recall / F1); uncomment to print it.
# sk_report = classification_report(
#     digits=6,
#     y_true=y_test,
#     y_pred=model.predict(X_test))
# print(sk_report)


In [9]:
# Score of the fitted baseline pipeline on the held-out split.
model.score(X=X_test, y=y_test)

0.5623648718212705

| Algorithm | Score |
|----------|---------------|
| SGDClassifier | 0.5366519098115927 |
| RandomForestClassifier | 0.5304102748893236 |
| LinearSVC | 0.5668691444455883 |
| LogisticRegression | 0.5623648718212705 |
| SVC | 000 |
| KNeighborsClassifier | 0.5497271697724699 |
| LGBMClassifier | 0.644432719036343 |
| GradientBoostingClassifier | 0.644432719036343 |
| AdaBoostClassifier | 0.5766241120148254 |
| ExtraTreesClassifier | 0.6142669618037682 |

#### average = 'macro'

                    precision    recall  f1-score   support

               0   0.047330  0.032713  0.038687      3332
               1   0.145251  0.001603  0.003170     16224
               2   0.383403  0.056582  0.098611     14616
               3   0.539235  0.968083  0.692653     40511
               4   0.017857  0.001986  0.003575      3021

        accuracy                       0.517168     77704
        macro avg  0.226615  0.212193  0.167339     77704
        weighted avg 0.386299  0.517168  0.382123     77704


-----
#### average = 'micro'  original code


                  precision    recall  f1-score   support

               0   0.000000  0.000000  0.000000      3305
               1   0.225766  0.932514  0.363522     16359
               2   0.166667  0.000068  0.000136     14655
               3   0.673026  0.157701  0.255528     40266
               4   0.017316  0.003847  0.006296      3119

        accuracy                       0.278210     77704
       macro avg   0.216555  0.218826  0.125096     77704
    weighted avg   0.428419  0.278210  0.209224     77704

---- 
#### average = 'micro' not original code

              precision    recall  f1-score   support

           0   0.000000  0.000000  0.000000      3399
           1   0.000000  0.000000  0.000000     16312
           2   0.187268  0.994156  0.315168     14546
           3   0.723992  0.008447  0.016698     40371
           4   0.000000  0.000000  0.000000      3076

    accuracy                       0.190492     77704
    macro avg   0.182252  0.200521  0.066373     77704
    weighted avg   0.411205  0.190492  0.067674     77704

### **SPLITS**

In [11]:
# Hold-out split for each side. Seeding makes the split (and every metric
# computed from it) reproducible; stratifying on the target keeps the class mix
# of the train and test parts the same.
# NOTE(review): rows are split individually; if rows sharing the same `file`
# are correlated, a group-aware split may be more appropriate -- confirm.
ct_train, ct_test = train_test_split(ct_df, stratify=ct_df[TARGET], random_state=0)
t_train, t_test = train_test_split(t_df, stratify=t_df[TARGET], random_state=0)

In [12]:
# Sanity check: (rows, columns) of the train and test parts, CT first, then T.
for train_part, test_part in ((ct_train, ct_test), (t_train, t_test)):
    print(train_part.shape, test_part.shape)

(233109, 13) (77704, 13)
(233109, 13) (77704, 13)


### **MODEL**

In [13]:
# Final estimator used by the CT pipeline. The name `regressor` is historical:
# the target is a class label, so the estimator must be a classifier.
#
# LightGBM alternative. It has to be the classifier variant: `LGBMRegressor`
# is not imported in this notebook and has no `predict_proba`, which the
# scoring cells rely on.
# regressor = LGBMClassifier(boosting_type='gbdt',
#                            bagging_freq=1,
#                            bagging_fraction=0.9,
#                            n_estimators=100)

# Seeded so that repeated runs grow the same forest and report the same score.
regressor = RandomForestClassifier(random_state=0)

In [14]:
# CT pipeline: scale the CT feature columns, then pass them to the estimator.
ct_steps = [
    ('ct_preprocessor', preprocessor_ct),
    ('regressor', regressor),
]
ct_model = Pipeline(steps=ct_steps)

# The T-side pipeline is currently disabled:
# t_model = Pipeline(steps=[('t_preprocessor', preprocessor_t),
#                           ('regressor', regressor)])

In [15]:
# Train the CT pipeline on the CT training split.
ct_model.fit(
    X=ct_train[CT_FEATS],
    y=ct_train[TARGET],
);

# T-side training is disabled together with its pipeline:
# t_model.fit(t_train[T_FEATS], t_train[TARGET]);

In [16]:
# Class-probability predictions for the CT test and train splits.
#
# Keep the full (n_samples, n_classes) matrix: the target has five classes and
# the one-vs-rest ROC AUC computed afterwards needs one score column per class.
# Slicing out a single column (`[:, 1]`, the binary-classification habit)
# leaves a 1-D array that the multiclass ROC AUC cannot consume.
y_ct_test = ct_model.predict_proba(ct_test[CT_FEATS])
y_ct_train = ct_model.predict_proba(ct_train[CT_FEATS])

# T-side predictions are disabled together with the T pipeline:
# y_t_test = t_model.predict_proba(t_test[T_FEATS])
# y_t_train = t_model.predict_proba(t_train[T_FEATS])

### **CHECK PERFORMANCE**

In [None]:
# Macro-averaged one-vs-rest ROC AUC of the CT model on both splits.
# This is a score, not an error: higher is better, and a large gap between the
# train and test values points to over-fitting.
# `roc_auc_score` expects the per-class probability matrix as its second
# argument when `multi_class='ovr'` is used.
print(f"CT test ROC AUC: {roc_auc_score(ct_test[TARGET].to_numpy(), y_ct_test, average='macro', multi_class='ovr')}")
print(f"CT train ROC AUC: {roc_auc_score(ct_train[TARGET].to_numpy(), y_ct_train, average='macro', multi_class='ovr')}")
print()
# T-side metrics are disabled together with the T pipeline. Note that
# `roc_auc_score` takes `y_score` (not `y_pred`) and that the target column
# name is `TARGET` for both sides:
# print(f"T test ROC AUC: {roc_auc_score(t_test[TARGET].to_numpy(), y_t_test, average='macro', multi_class='ovr')}")
# print(f"T train ROC AUC: {roc_auc_score(t_train[TARGET].to_numpy(), y_t_train, average='macro', multi_class='ovr')}")