In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, log_loss, 
    confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score)

In [3]:
from collections import defaultdict

In [4]:
import lightgbm as lgbm
import xgboost as xgb

In [5]:
def get_summary_grid(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append('{}_{}'.format(typ, scoring))
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [6]:
def get_summary_randomized(grid):
    parms = list(grid.param_distributions.keys())
    columns = []
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append('{}_{}'.format(typ, scoring))
    columns.extend(['fit_time', 'score_time'])
    for param in parms:
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[0:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
#     for k, v in grid.param_distributions.items():
#         items *= len(v)
#     df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [7]:
dtrain = pd.read_csv('./data/dtrain.csv')
dtest = pd.read_csv('./data/dtest.csv')

In [8]:
dtest.drop('is_promoted', inplace=True, axis=1)

In [9]:
y = dtrain.is_promoted.values
X = dtrain.drop('is_promoted', axis=1)

In [10]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
X.drop('employee_id', axis=1, inplace=True)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42)

## GridSearch

In [13]:
# params = {
#     'max_depth': [2, 4, 6, 8, 10], 
#     'subsample': [0.25, 0.5, 0.75, 1.0], # Row sampling like Random Forest 
#     'colsample_bytree': [0.3, 0.5, 0.7, 1.0],  # Column sampling like Random Forest
#     'num_leaves': [2**2, 2**4, 2**6, 2**8]
# }

params = {
  'min_child_weight':[5, 6, 8],
  'max_depth': range(3,10,2),
  'n_estimators':[200, 300, 400, 500],
#   'scale_pos_weight':[1, 2, 3, 4],
#   'colsample_bytree':[0.7, 0.8], 
#   'subsample':[0.5, 0.7, 0.8, 0.9],
#   'gamma':[0, 0.2, 0.4, 0.6]
}

In [14]:
estimator = lgbm.LGBMClassifier(random_state=42, 
                                silent=False, 
                                n_jobs=1, 
                                class_weight='balanced',
                                objective='binary')
# estimator = xgb.XGBClassifier(
#     learning_rate=0.1,
#     objective= 'binary:logistic',
#     seed=42,
#     nthread=4
# )

In [15]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'neg_log_loss', 'f1', 'recall', 'roc_auc'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=StratifiedKFold(5, shuffle=True,),
                    return_train_score=True,
                    verbose=2,
                    n_jobs=-1)

In [16]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   51.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 13.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=1, num_leaves=31,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=False, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400, 500], 'min_child_weight': [5, 6, 8], 'max_depth': range(3, 10, 2)},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'roc_auc', 'recall', 'neg_log_loss', 'f1', 'accuracy'},
       verbose=2)

In [17]:
grid_res = get_summary_grid(grid)

In [18]:
grid_res.sort_values('test_f1', ascending=False).head()

Unnamed: 0,Iter #,train_roc_auc,test_roc_auc,train_recall,test_recall,train_neg_log_loss,test_neg_log_loss,train_f1,test_f1,train_accuracy,test_accuracy,fit_time,score_time,n_estimators,min_child_weight,max_depth
39,40,0.997668,0.900655,0.999801,0.624468,-0.145711,-0.245394,0.743987,0.456559,0.940964,0.872531,35.459476,8.688395,500,5,9
43,44,0.99722,0.900663,0.999668,0.628192,-0.14998,-0.247799,0.736755,0.45465,0.938723,0.870775,34.683436,8.436816,500,6,9
47,48,0.996526,0.902379,0.999668,0.6375,-0.157111,-0.250326,0.722972,0.454298,0.934259,0.868677,32.830574,6.155426,500,8,9
46,47,0.994018,0.903836,0.998737,0.675532,-0.179474,-0.259948,0.678237,0.452412,0.918687,0.859805,29.722765,6.759035,400,8,9
38,39,0.995495,0.901978,0.998271,0.660904,-0.169483,-0.255429,0.69261,0.451472,0.923995,0.862291,30.04693,7.789849,400,5,9


In [19]:
params = {
      'colsample_bytree':[0.7, 0.8], 
      'subsample':[0.5, 0.7, 0.8, 0.9],
      'reg_lambda':[0, 0.2, 0.4, 0.6, 0.8],
      'reg_alpha':[0, 0.2, 0.4, 0.6, 0.8],
}

In [20]:
estimator = grid.best_estimator_

In [21]:
grid2 = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'neg_log_loss', 'f1', 'recall', 'roc_auc'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=StratifiedKFold(2),
                    return_train_score=True,
                    verbose=2,
                    n_jobs=-1)

In [22]:
grid2.fit(X_train, y_train)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 20.8min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 24.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
        max_depth=9, min_child_samples=20, min_child_weight=5,
        min_split_gain=0.0, n_estimators=500, n_jobs=1, num_leaves=31,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=False, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'colsample_bytree': [0.7, 0.8], 'reg_lambda': [0, 0.2, 0.4, 0.6, 0.8], 'reg_alpha': [0, 0.2, 0.4, 0.6, 0.8], 'subsample': [0.5, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'roc_auc', 'recall', 'neg_log_loss', 'f1', 'accuracy'},
       verbose=2)

In [23]:
grid_res = get_summary_grid(grid2)

In [24]:
grid_res.sort_values('test_f1', ascending=False).head()

Unnamed: 0,Iter #,train_roc_auc,test_roc_auc,train_recall,test_recall,train_neg_log_loss,test_neg_log_loss,train_f1,test_f1,train_accuracy,test_accuracy,fit_time,score_time,colsample_bytree,reg_lambda,reg_alpha,subsample
84,85,0.999764,0.896027,1.0,0.538032,-0.093582,-0.224847,0.877851,0.47123,0.976098,0.896479,21.823719,18.141407,0.7,0.2,0.8,0.5
85,86,0.999764,0.896027,1.0,0.538032,-0.093582,-0.224847,0.877851,0.47123,0.976098,0.896479,21.437577,18.04717,0.7,0.2,0.8,0.7
86,87,0.999764,0.896027,1.0,0.538032,-0.093582,-0.224847,0.877851,0.47123,0.976098,0.896479,21.742348,18.677173,0.7,0.2,0.8,0.8
87,88,0.999764,0.896027,1.0,0.538032,-0.093582,-0.224847,0.877851,0.47123,0.976098,0.896479,21.120753,17.616878,0.7,0.2,0.8,0.9
183,184,0.99978,0.896345,1.0,0.539894,-0.091333,-0.224562,0.878557,0.471022,0.976281,0.896022,22.578263,19.695053,0.8,0.0,0.8,0.9


In [25]:
grid2.best_params_

{'colsample_bytree': 0.7,
 'reg_alpha': 0.8,
 'reg_lambda': 0.2,
 'subsample': 0.5}

In [68]:
model_xgb = xgb.XGBClassifier(learning_rate=0.1, 
                          n_estimators=200, 
                          max_depth=4, 
                          min_child_weight=7, 
                          gamma=0.4,
                          nthread=16, 
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective= 'binary:logistic',
                          scale_pos_weight=3,
                          seed=29)
model_xgb.fit(X_train, y_train)
y_xg = model_xgb.predict(X_test)

In [78]:
print(confusion_matrix(y_test, y_xg))
predictions = [value for value in y_xg]
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print("Accuracy_score: %.2f%% on test dataset" % (accuracy * 100.0))
print("precision_score: %.2f%% on test dataset" % (precision * 100.0))
print("recall_score: %.2f%% on test dataset" % (recall * 100.0))
print("f1_score: %.2f%% on test dataset" % (f1 * 100.0))
print("roc_auc test set", roc_auc_score(y_test, model_xgb.predict_proba(X_test)[:,1]))
print("roc_auc training set", roc_auc_score(y_train, model_xgb.predict_proba(X_train)[:,1]))

[[9832  222]
 [ 527  381]]
Accuracy_score: 93.17% on test dataset
precision_score: 63.18% on test dataset
recall_score: 41.96% on test dataset
f1_score: 50.43% on test dataset
roc_auc test set 0.9123738420459038
roc_auc training set 0.9415925009898929


In [79]:
model_xgb.fit(X, y)
# y_xg = model_xgb.predict(X_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=7, missing=None,
       n_estimators=200, n_jobs=1, nthread=16, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=3,
       seed=29, silent=True, subsample=0.8)

## Submission 2 

In [29]:
dtest_eid = dtest.employee_id
dtest.drop('employee_id', inplace=True, axis=1)

In [100]:
y_valid = model_xgb.predict(dtest)
submission = dtest.copy()
submission['employee_id'] = dtest_eid
submission['is_promoted'] = y_valid

In [103]:
submission[['employee_id', 'is_promoted']].to_csv('./data/sub2.csv', index=False)

## Leaderboard Score: 0.5220077220, Rank: 60

In [27]:
lgbm_model = grid2.best_estimator_.fit(X, y)

In [31]:
y_valid = lgbm_model.predict(dtest)
submission = dtest.copy()
submission['employee_id'] = dtest_eid
submission['is_promoted'] = y_valid

In [32]:
submission[['employee_id', 'is_promoted']].to_csv('./data/sub3.csv', index=False)