In [1]:
# input needed are X_train, y_train_le, X_test
# remaining questions
# better way to convert np results to df
# any visualization

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import random

In [4]:
# lgbm accepts df for X and np for y (m,)
# lgbmclassifier() requires y to be labelled

In [5]:
X_train = pd.read_csv("X_train.csv")
X_train.shape

(7905, 25)

In [6]:
y_train_le = pd.read_csv("y_train_le.csv").to_numpy().ravel()
y_train_le.shape

(7905,)

In [7]:
X_test = pd.read_csv("X_test.csv")
X_test.shape

(5271, 25)

## set up cross validation

In [8]:
def cross_validate(model):
    """
    cross-validate model with StratifiedKFold()
    """
    skf = StratifiedKFold(shuffle=True,random_state=20231212) # default n_splits=5
    oof_pred = np.full((len(X_train),3),np.nan)
    cross_entropy_list = []
    
    for fold,(idx_tr,idx_va) in enumerate(skf.split(X_train,y_train_le)):
        X_tr = X_train.iloc[idx_tr]
        X_va = X_train.iloc[idx_va]
        y_tr = y_train_le[idx_tr]
        y_va = y_train_le[idx_va]
        model.fit(X_tr,y_tr)
        y_va_pred = model.predict_proba(X_va)
        oof_pred[idx_va] = y_va_pred
        cross_entropy = log_loss(y_va,y_va_pred)
        cross_entropy_list.append(cross_entropy)
    
    mean_cross_entropy = round(np.array(cross_entropy_list).mean(),10)
    # each fold produces y_va_pred, oof_pred, cross_entropy
    # after all folds produces cross_entropy_list, mean_cross_entropy
    return mean_cross_entropy, model.get_params()

## baseline multinomial logistic regression model

In [9]:
mlr = cross_validate(LogisticRegression(multi_class="multinomial",random_state=20231212))
mlr
# 0.5165245092

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(0.5165245092,
 {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 100,
  'multi_class': 'multinomial',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 20231212,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False})

## baseline lgbm model

In [10]:
lgbm = cross_validate(LGBMClassifier(random_state=20231221))
lgbm
# 0.4690187363

(0.4690187363,
 {'boosting_type': 'gbdt',
  'class_weight': None,
  'colsample_bytree': 1.0,
  'importance_type': 'split',
  'learning_rate': 0.1,
  'max_depth': -1,
  'min_child_samples': 20,
  'min_child_weight': 0.001,
  'min_split_gain': 0.0,
  'n_estimators': 100,
  'n_jobs': -1,
  'num_leaves': 31,
  'objective': None,
  'random_state': 20231221,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'silent': 'warn',
  'subsample': 1.0,
  'subsample_for_bin': 200000,
  'subsample_freq': 0})

## tune lgbm models - random search
inspired by https://www.kaggle.com/code/willkoehrsen/intro-to-model-tuning-grid-and-random-search/notebook

In [11]:
# search domain
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}

In [12]:
def random_search(param_grid, num_evals):
    """
    random search for hyperparameter tuning
    """
    # empty dataframe for results
    results = pd.DataFrame(columns = ['cross_entropy', 'hyper_params'], index = list(range(num_evals)))
    
    # keep searching until reach num_evals
    random.seed(20231231)
    for i in range(num_evals):
        
        # init random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']

        # eval random hyperparameters
        cross_entropy, hyper_params = cross_validate(LGBMClassifier(**hyperparameters,random_state=20231231))
        results.loc[i, :] = cross_entropy, hyper_params
    
    # sort with lowest cross_entropy on top
    results.sort_values('cross_entropy', ascending = True, inplace = True)
    results.reset_index(inplace = True)
    return results

In [13]:
random_results = random_search(param_grid, num_evals = 1000)
random_results.head()

Unnamed: 0,index,cross_entropy,hyper_params
0,25,0.44176,"{'boosting_type': 'gbdt', 'class_weight': None..."
1,300,0.44329,"{'boosting_type': 'gbdt', 'class_weight': None..."
2,198,0.443475,"{'boosting_type': 'dart', 'class_weight': None..."
3,392,0.443586,"{'boosting_type': 'dart', 'class_weight': None..."
4,3,0.443741,"{'boosting_type': 'dart', 'class_weight': None..."


In [20]:
random_results["hyper_params"]

0      {'boosting_type': 'gbdt', 'class_weight': None...
1      {'boosting_type': 'gbdt', 'class_weight': None...
2      {'boosting_type': 'dart', 'class_weight': None...
3      {'boosting_type': 'dart', 'class_weight': None...
4      {'boosting_type': 'dart', 'class_weight': None...
                             ...                        
995    {'boosting_type': 'dart', 'class_weight': None...
996    {'boosting_type': 'dart', 'class_weight': None...
997    {'boosting_type': 'dart', 'class_weight': None...
998    {'boosting_type': 'dart', 'class_weight': None...
999    {'boosting_type': 'dart', 'class_weight': None...
Name: hyper_params, Length: 1000, dtype: object

In [23]:
param_df = pd.DataFrame(random_results["hyper_params"].tolist())
param_df.head(10)

Unnamed: 0,boosting_type,class_weight,colsample_bytree,importance_type,learning_rate,max_depth,min_child_samples,min_child_weight,min_split_gain,n_estimators,...,num_leaves,objective,random_state,reg_alpha,reg_lambda,silent,subsample,subsample_for_bin,subsample_freq,is_unbalance
0,gbdt,,0.6,split,0.059162,-1,60,0.001,0.0,100,...,93,,20231231,0.734694,0.489796,warn,0.686869,200000,0,False
1,gbdt,,0.6,split,0.086339,-1,225,0.001,0.0,100,...,66,,20231231,0.061224,0.081633,warn,0.585859,240000,0,True
2,dart,,0.6,split,0.211152,-1,160,0.001,0.0,100,...,45,,20231231,0.367347,0.020408,warn,0.712121,40000,0,False
3,dart,,0.733333,split,0.189037,-1,120,0.001,0.0,100,...,144,,20231231,0.183673,0.632653,warn,0.580808,260000,0,True
4,dart,,0.644444,split,0.148061,-1,90,0.001,0.0,100,...,100,,20231231,0.816327,1.0,warn,0.969697,100000,0,True
5,gbdt,,0.6,split,0.036461,-1,40,0.001,0.0,100,...,44,,20231231,0.857143,0.897959,warn,0.914141,20000,0,True
6,dart,,0.688889,split,0.207294,-1,135,0.001,0.0,100,...,126,,20231231,1.0,0.510204,warn,0.989899,140000,0,True
7,gbdt,,0.733333,split,0.063985,-1,40,0.001,0.0,100,...,27,,20231231,0.081633,0.653061,warn,0.555556,200000,0,False
8,dart,,0.777778,split,0.137534,-1,60,0.001,0.0,100,...,69,,20231231,0.469388,0.612245,warn,0.762626,100000,0,False
9,dart,,0.688889,split,0.225229,-1,190,0.001,0.0,100,...,43,,20231231,0.387755,0.326531,warn,0.550505,160000,0,True


In [41]:
max(np.log10(param_df.head(10)["learning_rate"]))

-0.6473763420103276

In [40]:
np.log10(0.02)

-1.6989700043360187

In [48]:
np.log10(0.24)

-0.619788758288394

In [49]:
# search domain (updated after one round of random_search)
param_grid_1 = {
    'boosting_type': ['gbdt', 'dart'],
    'num_leaves': list(range(27, 145)),
    'learning_rate': list(np.logspace(np.log10(0.02), np.log10(0.24), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 280000, 20000)),
    'min_child_samples': list(range(40, 230, 5)),
    'reg_alpha': list(np.linspace(0.06, 1)),
    'reg_lambda': list(np.linspace(0.02, 1)),
    'colsample_bytree': list(np.linspace(0.6, 0.8, 10)),
    'subsample': list(np.linspace(0.55, 1, 100)),
    'is_unbalance': [True, False]
}

In [None]:
random_results_1 = random_search(param_grid_1, num_evals = 1000)
random_results_1.head()

In [None]:
#random_results.loc[882, "cross_entropy"]

0.4421671665
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.6,
 'importance_type': 'split',
 'learning_rate': 0.04830087399761322,
 'max_depth': -1,
 'min_child_samples': 110,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 141,
 'objective': None,
 'random_state': 20231231,
 'reg_alpha': 0.8775510204081632,
 'reg_lambda': 0.42857142857142855,
 'silent': 'warn',
 'subsample': 0.5555555555555556,
 'subsample_for_bin': 120000,
 'subsample_freq': 0,
 'is_unbalance': True}

In [None]:
#def grid_search(param_grid, num_evals):
    #"""grid search for hyperparameter tuning"""
    # empty dataframe for results
    #results = pd.DataFrame(columns = ['cross_entropy', 'hyper_params'], index = list(range(num_evals)))
    
    # https://codereview.stackexchange.com/questions/171173/list-all-possible-permutations-from-a-python-dictionary-of-lists
    #keys, values = zip(*param_grid.items()) 
    #i = 0
    
    # iterate through every possible combination of hyperparameters
    #for v in itertools.product(*values):
        
        # create a hyperparameter dictionary
        #hyperparameters = dict(zip(keys, v))
        
        # set the subsample ratio accounting for boosting type
        #hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        
        # eval the hyperparameters
        #cross_entropy, hyper_params = cross_validate(LGBMClassifier(**hyperparameters,random_state=20231231))
        #results.loc[i, :] = cross_entropy, hyper_params
        #i += 1
        
        # normally would not limit iterations
        #if i > num_evals:
            #break
       
    # sort with lowest cross_entropy on top
    #results.sort_values('cross_entropy', ascending = True, inplace = True)
    #results.reset_index(inplace = True)
    #return results

In [None]:
#random_hyper_params = random_results.loc[882, 'hyper_params']
#random_hyper_params

In [None]:
# fit, train, test model
#model = LGBMClassifier(**random_hyper_params)
#model.fit(X_train, y_train_le)
#y_test_pred = model.predict_proba(X_test)

In [None]:
#y_test_pred

In [None]:
#sample_submission = pd.read_csv("sample_submission.csv")
#sample_submission.head()

In [None]:
#y_test_pred_df = pd.DataFrame(y_test_pred, columns =['Status_C', 'Status_CL', 'Status_D'])
#y_test_pred_df.head()

In [None]:
#y_test_pred_df = pd.concat([sample_submission["id"],y_test_pred_df],axis=1)
#y_test_pred_df.head()

In [None]:
#y_test_pred_df.to_csv("y_test_pred_df_20231231.csv",index=False)