In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss

In [3]:
from collections import defaultdict

In [4]:
import lightgbm as lgbm

In [5]:
def get_summary_grid(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append('{}_{}'.format(typ, scoring))
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [6]:
def get_summary_randomized(grid):
    parms = list(grid.param_distributions.keys())
    columns = []
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append('{}_{}'.format(typ, scoring))
    columns.extend(['fit_time', 'score_time'])
    for param in parms:
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[0:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
#     for k, v in grid.param_distributions.items():
#         items *= len(v)
#     df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [7]:
train = pd.read_csv('./data/train_cleaned.csv')

In [8]:
train.head()

Unnamed: 0,department_Analytics,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_R&D,department_Sales & Marketing,department_Technology,region_region_1,...,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,work_fraction,start_year,dep_avg_training_score,reg_avg_training_score,edu_avg_training_score,gen_avg_training_score
0,0,0,0,0,0,0,0,1,0,0,...,8,1,0,49,0.228571,27,50.261698,63.940946,64.06124,63.889897
1,0,0,0,0,1,0,0,0,0,0,...,4,0,0,60,0.133333,26,60.226648,65.69944,63.422046,63.17355
2,0,0,0,0,0,0,0,1,0,0,...,7,0,0,50,0.205882,27,50.261698,61.270023,63.422046,63.17355
3,0,0,0,0,0,0,0,1,0,0,...,10,0,0,50,0.25641,29,50.261698,64.330213,63.422046,63.17355
4,0,0,0,0,0,0,0,0,1,0,...,2,0,0,73,0.044444,43,79.928692,65.084071,63.422046,63.17355


In [9]:
y = np.load('./data/y.npy')

In [10]:
test = pd.read_csv('./data/test_cleaned.csv')

In [11]:
eid = pd.read_csv('./data/test.csv')['employee_id']

## GridSearch

In [6]:
params = {
    'max_depth': [2, 4, 6, 8, 10], 
    'subsample': [0.25, 0.5, 0.75, 1.0], # Row sampling like Random Forest 
    'colsample_bytree': [0.3, 0.5, 0.7, 1.0],  # Column sampling like Random Forest
    'num_leaves': [2**2, 2**4, 2**6, 2**8]
}

In [11]:
estimator = lgbm.LGBMClassifier(random_state=42, 
                                silent=False, 
                                n_jobs=1, 
                                class_weight='balanced',
                                objective='binary')

In [12]:
grid = RandomizedSearchCV(estimator=estimator,
                    param_distributions=params,
                    scoring={'accuracy', 'neg_log_loss', 'f1', 'recall', 'roc_auc'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=StratifiedKFold(5),
                    return_train_score=True,
                    verbose=2,
                    n_jobs=-1)

In [13]:
grid.fit(train, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:  1.5min remaining:   10.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
          error_score='raise-deprecating',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=1, num_leaves=31,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=False, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'subsample': [0.25, 0.5, 0.75, 1.0], 'colsample_bytree': [0.3, 0.5, 0.7, 1.0], 'max_depth': [2, 4, 6, 8, 10], 'num_leaves': [4, 16, 64, 256]},
          pre_dispatch='2*n_jobs', random_state=None, refit='f1',
          return_train_score=True,
          scoring={'recall', 'accuracy', 'f1', 'neg_log_loss', 'roc_auc'},


In [19]:
grid_res = get_summary_randomized(grid)

In [20]:
grid_res.sort_values('test_f1', ascending=False).head()

Unnamed: 0,train_recall,test_recall,train_accuracy,test_accuracy,train_f1,test_f1,train_neg_log_loss,test_neg_log_loss,train_roc_auc,test_roc_auc,fit_time,score_time,subsample,colsample_bytree,max_depth,num_leaves
7,0.978685,0.817271,0.830513,0.801635,0.495919,0.412403,-0.274431,-0.308938,0.96799,0.908288,14.594561,0.640953,0.5,0.7,10,64
0,0.970758,0.824982,0.823429,0.79649,0.48363,0.408513,-0.289042,-0.319805,0.963563,0.908278,14.360805,0.617024,1.0,0.5,8,256
3,0.960315,0.87811,0.78224,0.766512,0.428974,0.390488,-0.320756,-0.339335,0.944955,0.908304,57.65098,0.508825,1.0,1.0,6,256
8,0.960315,0.87811,0.78224,0.766512,0.428974,0.390488,-0.320756,-0.339335,0.944955,0.908304,12.799671,0.475912,0.75,1.0,6,256
5,0.954424,0.917526,0.75374,0.746935,0.397672,0.381836,-0.349659,-0.357434,0.930814,0.91266,22.281309,0.456421,0.25,0.5,10,16


In [24]:
params = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'learning_rate': [0.001, 0.01, 0.1]
}

In [25]:
estimator = grid.best_estimator_

In [26]:
grid2 = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'neg_log_loss', 'f1', 'recall', 'roc_auc'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=StratifiedKFold(5),
                    return_train_score=True,
                    verbose=2,
                    n_jobs=-1)

In [27]:
grid2.fit(train, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=0.7, importance_type='split', learning_rate=0.1,
        max_depth=10, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=1, num_leaves=64,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=False, subsample=0.5, subsample_for_bin=200000,
        subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 200, 300, 500, 700, 1000], 'learning_rate': [0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'accuracy', 'f1', 'neg_log_loss', 'roc_auc'},
       verbose=2)

In [32]:
grid_res = get_summary_grid(grid2)

In [33]:
grid_res.sort_values('test_f1', ascending=False).head()

Unnamed: 0,Iter #,train_recall,test_recall,train_accuracy,test_accuracy,train_f1,test_f1,train_neg_log_loss,test_neg_log_loss,train_roc_auc,test_roc_auc,fit_time,score_time,n_estimators,learning_rate
17,18,1.0,0.503433,0.981996,0.902751,0.904436,0.468631,-0.061938,-0.240877,0.999849,0.887778,11.307185,9.889362,1000,0.1
16,17,0.999786,0.553559,0.964782,0.890728,0.828706,0.463294,-0.096222,-0.238637,0.999282,0.89155,7.823344,7.790822,700,0.1
15,16,0.998929,0.593619,0.946012,0.877208,0.759187,0.451777,-0.129053,-0.243773,0.997938,0.894975,5.83796,4.677376,500,0.1
14,15,0.995341,0.678452,0.910761,0.85524,0.655171,0.444034,-0.180662,-0.261727,0.992973,0.900143,4.032319,2.431508,300,0.1
13,14,0.990788,0.73715,0.880601,0.836301,0.58567,0.434136,-0.218437,-0.27845,0.985926,0.904037,2.589913,1.388452,200,0.1


In [35]:
params = {
    'reg_alpha': [0.1, 1, 10],
    'reg_lambda': [0.1, 1, 10]
}

In [37]:
estimator = grid2.best_estimator_

In [38]:
grid3 = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall', 'roc_auc'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=StratifiedKFold(5),
                    return_train_score=True,
                    verbose=2,
                    n_jobs=-1)

In [39]:
grid3.fit(train, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  37 out of  45 | elapsed:  4.7min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  4.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=0.7, importance_type='split', learning_rate=0.1,
        max_depth=10, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=1000, n_jobs=1, num_leaves=64,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=False, subsample=0.5, subsample_for_bin=200000,
        subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'reg_alpha': [0.1, 1, 10], 'reg_lambda': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'f1', 'accuracy', 'roc_auc', 'precision'},
       verbose=2)

In [41]:
grid_res = get_summary_grid(grid3)

In [42]:
grid_res.sort_values('test_f1', ascending=False).head()

Unnamed: 0,Iter #,train_recall,test_recall,train_f1,test_f1,train_accuracy,test_accuracy,train_roc_auc,test_roc_auc,train_precision,test_precision,fit_time,score_time,reg_alpha,reg_lambda
0,1,1.0,0.509861,0.906727,0.472095,0.982475,0.902879,0.999854,0.892015,0.829384,0.439582,9.511136,12.207341,0.1,0.1
3,4,1.0,0.525281,0.895422,0.471379,0.980103,0.89965,0.9998,0.895805,0.810659,0.427552,10.405811,9.868875,1.0,0.1
1,2,1.0,0.532782,0.874199,0.468708,0.975483,0.897114,0.999715,0.895828,0.77654,0.4184,9.725919,12.251292,0.1,1.0
4,5,1.0,0.542636,0.863947,0.467597,0.973175,0.894742,0.999621,0.896868,0.760484,0.410833,10.285917,9.023313,1.0,1.0
2,3,0.999679,0.6084,0.761765,0.461656,0.946719,0.879142,0.99762,0.901192,0.615395,0.371975,10.124613,13.493482,0.1,10.0


Submission 1 with the existing model

In [55]:
pred = grid3.best_estimator_.predict(test)

In [56]:
pred

array([0, 0, 0, ..., 0, 0, 1])

In [58]:
pd.DataFrame({'employee_id': eid, 'is_promoted': pred}).to_csv('./data/sub1.csv', index=False)

In [59]:
!ls data

sub1.csv  test_cleaned.csv  test.csv  train_cleaned.csv  train.csv  y.npy


### Leaderboard Score - 0.4498229044, Rank - 897

Improving the score by changing the threshold

[H[2J