## **2nd Supervised ML LightGBM regressor algorithm**

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from lightgbm import LGBMRegressor
from joblib import dump, load

In [2]:
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 30)

### Data

In [3]:
df = pd.read_csv('../data/processed/base_to_ml_predicted_team_value.csv')

### Preprocessing

Erase column 'round_type' because we cannot know this value from the live records

In [4]:
df.drop(['round_type'], axis=1, inplace=True)

In [5]:
df.tail()

Unnamed: 0,file,round,wp_ct_val,wp_t_val,nade_ct_val,nade_t_val,ct_alive,t_alive,ct_winner,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_real,t_val_real
310808,esea_match_13829171.dem,20,15500.0,13500.0,600,1900,0,4,0.0,0.0,0,3,23250,28150
310809,esea_match_13829171.dem,21,17562.5,16062.5,3200,1400,5,0,1.0,0.0,1,0,30950,25450
310810,esea_match_13829171.dem,22,23750.0,13500.0,2700,1400,2,0,1.0,0.0,2,0,32900,16600
310811,esea_match_13829171.dem,23,14312.5,13500.0,1600,2600,2,4,0.0,0.0,0,1,25950,24450
310812,esea_match_13829171.dem,24,23750.0,20333.333333,1200,2700,0,2,0.0,1.0,0,2,14300,27500


We want to predict 'ct_val' and 't_val' so we will split the dataframe into 2 dataframes, one with CT data and other with T data

**ct_df** = ['file',
 'round',
 'wp_ct_val',
 'nade_ct_val',
 'ct_alive',
 't_alive',
 'ct_winner',
 'bomb_planted',
 'ct_cons_wins',
 't_cons_wins',
 'ct_val_real']


**t_df** = ['file',
 'round',
 'wp_t_val',
 'nade_t_val',
 'ct_alive',
 't_alive',
 'ct_winner',
 'bomb_planted',
 'ct_cons_wins',
 't_cons_wins',
 't_val_real']

In [6]:
# Two DataFrames, one per CT another for T

ct_df = df[['file', 'round', 'wp_ct_val', 'nade_ct_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins', 'ct_val_real']]
t_df = df[['file', 'round', 'wp_t_val', 'nade_t_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins', 't_val_real']]

In [7]:
display(ct_df.head())
t_df.tail()

Unnamed: 0,file,round,wp_ct_val,nade_ct_val,ct_alive,t_alive,ct_winner,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_real
0,esea_match_13779704.dem,1,1000.0,550,5,5,0.5,0.5,0,0,4550
1,esea_match_13779704.dem,2,10100.0,1100,4,0,1.0,0.0,1,0,18450
2,esea_match_13779704.dem,3,4125.0,900,0,1,0.0,0.0,0,1,9550
3,esea_match_13779704.dem,4,1000.0,0,0,3,0.0,1.0,0,2,1600
4,esea_match_13779704.dem,5,15500.0,1400,0,4,0.0,1.0,0,3,23350


Unnamed: 0,file,round,wp_t_val,nade_t_val,ct_alive,t_alive,ct_winner,bomb_planted,ct_cons_wins,t_cons_wins,t_val_real
310808,esea_match_13829171.dem,20,13500.0,1900,0,4,0.0,0.0,0,3,28150
310809,esea_match_13829171.dem,21,16062.5,1400,5,0,1.0,0.0,1,0,25450
310810,esea_match_13829171.dem,22,13500.0,1400,2,0,1.0,0.0,2,0,16600
310811,esea_match_13829171.dem,23,13500.0,2600,2,4,0.0,0.0,0,1,24450
310812,esea_match_13829171.dem,24,20333.333333,2700,0,2,0.0,1.0,0,2,27500


In [8]:
# LabelEncoder to categorical feature -> file
le = LabelEncoder()

ct_df['file'] = le.fit_transform(ct_df.file)
t_df['file'] = le.fit_transform(t_df.file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [9]:
display(ct_df.head())
t_df.tail()

Unnamed: 0,file,round,wp_ct_val,nade_ct_val,ct_alive,t_alive,ct_winner,bomb_planted,ct_cons_wins,t_cons_wins,ct_val_real
0,0,1,1000.0,550,5,5,0.5,0.5,0,0,4550
1,0,2,10100.0,1100,4,0,1.0,0.0,1,0,18450
2,0,3,4125.0,900,0,1,0.0,0.0,0,1,9550
3,0,4,1000.0,0,0,3,0.0,1.0,0,2,1600
4,0,5,15500.0,1400,0,4,0.0,1.0,0,3,23350


Unnamed: 0,file,round,wp_t_val,nade_t_val,ct_alive,t_alive,ct_winner,bomb_planted,ct_cons_wins,t_cons_wins,t_val_real
310808,12184,20,13500.0,1900,0,4,0.0,0.0,0,3,28150
310809,12184,21,16062.5,1400,5,0,1.0,0.0,1,0,25450
310810,12184,22,13500.0,1400,2,0,1.0,0.0,2,0,16600
310811,12184,23,13500.0,2600,2,4,0.0,0.0,0,1,24450
310812,12184,24,20333.333333,2700,0,2,0.0,1.0,0,2,27500


## Define features and target

In [10]:
# Define all the features

CT_FEATS = ['file', 'round', 'wp_ct_val', 'nade_ct_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins']
T_FEATS = ['file', 'round', 'wp_t_val', 'nade_t_val', 'ct_alive', 't_alive', 'ct_winner', 'bomb_planted', 'ct_cons_wins', 't_cons_wins']
CT_TARGET = 'ct_val_real'
T_TARGET = 't_val_real'

In [11]:
#Preprocessor

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor_ct = ColumnTransformer(transformers=[('num', numeric_transformer, CT_FEATS)])
preprocessor_t = ColumnTransformer(transformers=[('num', numeric_transformer, T_FEATS)])

### **SPLITS**
We are going to use cross-validation, but we still hide 5% of the data to make a final prediction and be aware of possible overfitting.

In [12]:
ct_train, ct_test = train_test_split(ct_df, train_size=0.95)
t_train, t_test = train_test_split(t_df, train_size=0.95)

In [13]:
print(ct_train.shape, ct_test.shape)
print(t_train.shape, t_test.shape)

(295272, 11) (15541, 11)
(295272, 11) (15541, 11)


### **MODEL**

In [14]:
regressor = LGBMRegressor()

In [15]:
# ct_model = Pipeline(steps=[('ct_preprocessor', preprocessor_ct),
#                            ('regressor', regressor)])

t_model = Pipeline(steps=[('t_preprocessor', preprocessor_t),
                           ('regressor', regressor)])

In [16]:
# To obtain the best hyperparameters of our model, we test a wider range of values and then 
# we reduce until obtaining close but different values, in order to obtain the best results.

param_grid = {
    'regressor__num_leaves': [47,48,49,50,51,52],
    'regressor__n_estimators': [350,360,370,380,390,400,410,420,430],
    'regressor__min_data_in_leaf': [4,5,6,7,8,12,15],
    'regressor__max_depth': [5,6,7],
    'regressor__learning_rate': [0.02,0.03,0.04,0.045,0.05,0.055,0.06],
    'regressor__feature_fraction': [0.75,0.8,0.85,0.9,0.95,1],
    'regressor__bagging_frequency': [0.75,0.80,0.85,0.9,0.95,1],
    'regressor__bagging_fraction': [0.75,0.8,0.85,0.9,0.95,1],
}

#CT MODEL --------------------------------------

# grid_search = RandomizedSearchCV(ct_model, 
#                                  param_grid, 
#                                  cv=5, 
#                                  verbose=5, 
#                                  scoring='r2', 
#                                  n_jobs=-1,
#                                  n_iter=700)

# grid_search.fit(ct_train[CT_FEATS], ct_train[CT_TARGET])


# T MODEL ---------------------------------------

grid_search = RandomizedSearchCV(t_model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=5, 
                                 scoring='r2', 
                                 n_jobs=-1,
                                 n_iter=500)

grid_search.fit(t_train[T_FEATS], t_train[T_TARGET])

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 38.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 48.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 59.8min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 72.2min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 83.2min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('t_preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('scaler',
                                                                                                StandardScaler())]),
                                                                               ['file',
                                                                                'round',
                                                                                'wp_t_val',
                                                                                'nade_t_val',
                                                                                'ct_alive',
                                                                                't_alive',
                                                        

In [18]:
#Once we have trained the model with different hyperparameter values, 
#we see which parameters have obtained the best score:
grid_search.best_params_

{'regressor__num_leaves': 51,
 'regressor__n_estimators': 400,
 'regressor__min_data_in_leaf': 15,
 'regressor__max_depth': 7,
 'regressor__learning_rate': 0.06,
 'regressor__feature_fraction': 0.85,
 'regressor__bagging_frequency': 0.95,
 'regressor__bagging_fraction': 1}

In [19]:
# see the best score that we have trained:
grid_search.best_score_

-2691.897964099603

### Best hyperparameter values for **ct_model**

In [20]:
{'regressor__num_leaves': 51,
 'regressor__n_estimators': 400,
 'regressor__min_data_in_leaf': 15,
 'regressor__max_depth': 7,
 'regressor__learning_rate': 0.06,
 'regressor__feature_fraction': 0.85,
 'regressor__bagging_frequency': 0.95,
 'regressor__bagging_fraction': 1}

{'regressor__num_leaves': 51,
 'regressor__n_estimators': 400,
 'regressor__min_data_in_leaf': 15,
 'regressor__max_depth': 7,
 'regressor__learning_rate': 0.06,
 'regressor__feature_fraction': 0.85,
 'regressor__bagging_frequency': 0.95,
 'regressor__bagging_fraction': 1}

### Best hyperparameter values for **t_model**

In [17]:
{'regressor__num_leaves': 51,
 'regressor__n_estimators': 400,
 'regressor__min_data_in_leaf': 15,
 'regressor__max_depth': 7,
 'regressor__learning_rate': 0.06,
 'regressor__feature_fraction': 0.85,
 'regressor__bagging_frequency': 0.95,
 'regressor__bagging_fraction': 1}

{'regressor__num_leaves': 51,
 'regressor__n_estimators': 400,
 'regressor__min_data_in_leaf': 15,
 'regressor__max_depth': 7,
 'regressor__learning_rate': 0.06,
 'regressor__feature_fraction': 0.85,
 'regressor__bagging_frequency': 0.95,
 'regressor__bagging_fraction': 1}

### CT Regressor

In [16]:
regressor = LGBMRegressor(num_leaves=51,
                          n_estimators=400,
                          min_data_in_leaf=15,
                          max_depth=7,
                          learning_rate=0.06,
                          feature_fraction=0.85,
                          bagging_frequency=0.95,
                          bagging_fraction=1)

### T Regressor

In [15]:
regressor = LGBMRegressor(num_leaves=51,
                          n_estimators=400,
                          min_data_in_leaf=15,
                          max_depth=7,
                          learning_rate=0.06,
                          feature_fraction=0.85,
                          bagging_frequency=0.95,
                          bagging_fraction=1)

### Models

In [15]:
# ct_model = Pipeline(steps=[('ct_preprocessor', preprocessor_ct),
#                            ('regressor', regressor)])

t_model = Pipeline(steps=[('t_preprocessor', preprocessor_t),
                           ('regressor', regressor)])

### Test hyperparameter values from gridsearch

In [16]:
# ct_model.fit(ct_train[CT_FEATS], ct_train[CT_TARGET]);

In [17]:
t_model.fit(t_train[T_FEATS], t_train[T_TARGET]);

### **CHECK PERFORMANCE**

In [18]:
# y_ct_test = ct_model.predict(ct_test[CT_FEATS])
# y_ct_train = ct_model.predict(ct_train[CT_FEATS])

y_t_test = t_model.predict(t_test[T_FEATS])
y_t_train = t_model.predict(t_train[T_FEATS])

In [19]:
# Mean_squared_error

# print(f"CT test error: {mean_squared_error(y_pred=y_ct_test, y_true=ct_test[CT_TARGET], squared=False)}")
# print(f"CT train error: {mean_squared_error(y_pred=y_ct_train, y_true=ct_train[CT_TARGET], squared=False)}")
# print()
print(f"T test error: {mean_squared_error(y_pred=y_t_test, y_true=t_test[T_TARGET], squared=False)}")
print(f"T train error: {mean_squared_error(y_pred=y_t_train, y_true=t_train[T_TARGET], squared=False)}")

T test error: 2704.993275624408
T train error: 2660.0813278751643


In [20]:
# r2_score

# print(f"CT test error: {r2_score(y_pred=y_ct_test, y_true=ct_test[CT_TARGET])}")
# print(f"CT train error: {r2_score(y_pred=y_ct_train, y_true=ct_train[CT_TARGET])}")
print()
print(f"T test error: {r2_score(y_pred=y_t_test, y_true=t_test[T_TARGET])}")
print(f"T train error: {r2_score(y_pred=y_t_train, y_true=t_train[T_TARGET])}")


T test error: 0.9065775597739754
T train error: 0.9093794975918977


--------------------------------
### LGBM REGRESSOR
CT test error:______Base: 2710.808 || 0.93391______<font color='green'>Hyp. tuning: 2672.174 || 0.93586</font>

CT train error:______Base: 2698.860 || 0.93466______<font color='green'>Hyp. tuning: 2628.522 || 0.93788</font>

T test error:______Base: 2725.320 || 0.90388______<font color='green'>Hyp. tuning: 2704.993 || 0.90657</font>

T train error:______Base: 2728.544 || 0.90519______<font color='green'>Hyp. tuning: 2660.081 || 0.90937</font>

###  **Train model with full dataset**

#### **CT Model. Train & Save**

In [14]:
regressor = LGBMRegressor(num_leaves=51,
                          n_estimators=400,
                          min_data_in_leaf=15,
                          max_depth=7,
                          learning_rate=0.06,
                          feature_fraction=0.85,
                          bagging_frequency=0.95,
                          bagging_fraction=1)

ct_model = Pipeline(steps=[('ct_preprocessor', preprocessor_ct),
                           ('regressor', regressor)])

ct_model.fit(ct_df[CT_FEATS], ct_df[CT_TARGET])

ct_df_pred_val = ct_model.predict(ct_df[CT_FEATS])

In [27]:
dump(ct_model, '../models/ct_team_value.joblib') # Save model

['../models/ct_team_value.joblib']

In [29]:
ct_df['ct_val_pred'] = ct_df_pred_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
ct_df.to_csv('../data/results/ct_predicted_value') # Export the ct DataFrame with the prediction column added

#### **T Model. Train & Save**

In [14]:
regressor = LGBMRegressor(num_leaves=51,
                          n_estimators=400,
                          min_data_in_leaf=15,
                          max_depth=7,
                          learning_rate=0.06,
                          feature_fraction=0.85,
                          bagging_frequency=0.95,
                          bagging_fraction=1)

t_model = Pipeline(steps=[('t_preprocessor', preprocessor_t),
                           ('regressor', regressor)])

t_model.fit(t_df[T_FEATS], t_df[T_TARGET])

t_df_pred_val = t_model.predict(t_df[T_FEATS])

In [15]:
dump(t_model, '../models/t_team_value.joblib') # Save model

['../models/t_team_value.joblib']

In [16]:
t_df['t_val_pred'] = t_df_pred_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
t_df.to_csv('../data/results/t_predicted_value') # Export the t DataFrame with the prediction column added