# This notebook shows how to train complete models with some popular tools: random forest regressor, XGBoost and LightGBM

    Instructor: Yimin Nie
    Email: ymnie888@gmail.com
    
    In this notebook, I show you the entire pipeline using the taxi trip data set, and show how to put all the working 
    code into a Python project so that you can run it

## 1. Import useful libs 

In [3]:
import datetime
import gc
import math
import os
try:
   import cPickle as pickle
except:
   import pickle
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [4]:
#Function to reduce dataframe memory footprint, reduce float and int to the minimum dtype
def reducemem(self):
    """Downcast the int64 / float64 columns of a DataFrame in place.

    Each 64-bit column is tried against progressively narrower dtypes
    (int32 -> int16 -> int8, or float32 -> float16). A cast is applied only
    when the column's min and max lie strictly inside the target dtype's
    range, so a value equal to a type's limit is treated as not fitting.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    self : pandas.DataFrame
        Frame whose columns are reassigned in place.

    Returns
    -------
    None
    """
    narrower_ints = (np.int32, np.int16, np.int8)
    narrower_floats = (np.float32, np.float16)

    for col in self:
        if self[col].dtype == 'int64':
            for target in narrower_ints:
                limits = np.iinfo(target)
                # Bounds are re-read from the (possibly already cast) column each step.
                if self[col].max() < limits.max and self[col].min() > limits.min:
                    self[col] = self[col].astype(target)

        if self[col].dtype == 'float64':
            for target in narrower_floats:
                limits = np.finfo(target)
                # Re-reading after the float32 cast matters: rounding can move the extremes.
                if self[col].max() < limits.max and self[col].min() > limits.min:
                    self[col] = self[col].astype(target)


## 2. Load data

In [5]:
# Presumably the base directory of the input files; '' leaves file names
# relative to the current working directory -- confirm intended use.
data_path = ""

In [7]:
# START
# Load the pickled dictionary that is handed to read_csv as its `dtype`
# argument, then read the training CSV with those column types.
# Both files are resolved against `data_path`; with the default '' this is
# the same as opening the bare file names.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# dictionary file that comes from a trusted source.
with open(os.path.join(data_path, 'ctdict2.pkl'), 'rb') as handle:
    cvsdict = pickle.load(handle)

df = pd.read_csv(os.path.join(data_path, 'df_application_train_new2.csv'), dtype=cvsdict)


## 3. Process the data and extract features

In [8]:
# Replace non-finite entries (+/-inf, then NaN) with 0.
# np.inf is used because the capitalised alias np.Inf was removed in NumPy 2.0
# and raises AttributeError there.
df = df.replace([-np.inf, np.inf], 0)
df = df.replace(np.nan, 0)
def trim(st):
    """Turn a raw column label into an identifier-friendly name.

    Leading/trailing spaces are stripped (spaces only, not other whitespace),
    commas are removed, and every remaining space or colon becomes '_'.
    """
    # One translation pass: ' ' and ':' map to '_', ',' is deleted.
    table = str.maketrans(' :', '__', ',')
    return st.strip(" ").translate(table)
# Clean every column label with trim(), then downcast numeric columns in place.
df = df.rename(columns=trim)
reducemem(df)
gc.collect()
# Placeholders only; both names are rebound when a sampling helper is called.
y_train = []
X_train = []

def allsample(df):
    """Split a frame into features and label using every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'TARGET' column.

    Returns
    -------
    tuple
        ``(X_train, y_train)``: the frame without 'TARGET', and the 'TARGET'
        column on its own.
    """
    labels = df['TARGET']
    features = df.drop(columns=['TARGET'])
    return features, labels

def equaltargetsplit(df):
    """Build a class-balanced training set from a frame with a 'TARGET' column.

    All rows with TARGET == 1 are kept. Rows with TARGET == 0 are sampled
    *with replacement* (fixed seed 1) until there are as many of them as
    there are TARGET == 1 rows. The two parts are stacked (positives first)
    and re-indexed from 0.

    Returns
    -------
    tuple
        ``(X_train, y_train)``: the balanced frame without 'TARGET', and its
        'TARGET' column.
    """
    positives = df[df['TARGET'] == 1]
    negatives = df[df['TARGET'] == 0].sample(
        n=positives.shape[0], replace=True, random_state=1
    )
    balanced = pd.concat([positives, negatives], axis=0).reset_index(drop=True)
    labels = balanced['TARGET']
    features = balanced.drop(columns=['TARGET'])
    return features, labels

# Call your sampling method.
# allsample(df) uses every row; swap in the commented equaltargetsplit(df) call
# to train on a resample with equal numbers of TARGET == 1 and TARGET == 0 rows.
X_train , y_train = allsample(df)
#X_train , y_train = equaltargetsplit(df)

# Run a garbage-collection pass now that the split is done.
gc.collect()

0

## 4. Build your models


    Before building your models, decide on:
        (1) your target (regression or classification)
        (2) the evaluation metric appropriate for that target
        (3) how to train your model (here I use 5-fold cross-validation)
        

### Define some models 

In [9]:
def model_lgb():
    """Create the unfitted LightGBM estimator used in the CV loop.

    Returns
    -------
    lgb.LGBMRegressor
        Configured with a 'binary' objective, learning rate 0.01 and an upper
        bound of 50000 boosting rounds.
    """
    return lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.01,
        feature_fraction=0.9,
        bagging_fraction=0.7,
        bagging_freq=5,
        verbose=-1,
        silent=-1,
        max_depth=20,
        num_leaves=250,
        max_bin=2500,
        n_estimators=50000,
    )

def model_xgb():
    """Create an unfitted ``xgb.XGBRegressor`` with fixed hyper-parameters.

    Returns
    -------
    xgb.XGBRegressor
        Depth-3 trees, learning rate 0.07, up to 10000 estimators, seed 42.
    """
    settings = {
        'colsample_bytree': 0.4,
        'gamma': 0,
        'learning_rate': 0.07,
        'max_depth': 3,
        'min_child_weight': 1.5,
        'n_estimators': 10000,
        'reg_alpha': 0.75,
        'reg_lambda': 0.45,
        'subsample': 0.6,
        'seed': 42,
    }
    return xgb.XGBRegressor(**settings)

def model_rf():
    """Create an unfitted ``RandomForestRegressor`` from values held on ``config``.

    Reads ``config.n_estimator``, ``config.max_depth``, ``config.seed`` and
    ``config.n_jobs``.

    NOTE(review): ``config`` is neither defined nor imported in the visible
    cells -- confirm where it comes from before calling this function.
    """
    forest_kwargs = {
        'n_estimators': config.n_estimator,
        'max_depth': config.max_depth,
        'random_state': config.seed,
        'n_jobs': config.n_jobs,
    }
    return RandomForestRegressor(**forest_kwargs)


### Use k-fold CV

In [None]:
# Train one model per fold with 5-fold cross-validation and pickle each fitted
# model to '<model_name>_model_<fold>.pkl' in the working directory.
gc.collect()
kf = KFold(5)
cv_scores = []  # NOTE(review): never filled in this cell -- append a per-fold score here if needed
model_name = 'lgb'  # one of 'lgb', 'rf', 'xgb'
for i, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
    print('FOLD {} \n'.format(i))
    # KFold yields positional row numbers, so select by position (.iloc).
    # Label-based lookup is only equivalent when the index is exactly 0..n-1.
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_vl, y_vl = X_train.iloc[vl_idx], y_train.iloc[vl_idx]

    if model_name == 'lgb':
        model = model_lgb()
        model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_vl, y_vl)],
                  eval_metric='auc', verbose=200, early_stopping_rounds=500)

    elif model_name == 'rf':
        model = model_rf()
        model.fit(X_tr, y_tr)

    elif model_name == 'xgb':
        # The native xgb.train API is used here, so no sklearn wrapper is built.
        train_data = xgb.DMatrix(X_tr, label=y_tr)
        valid_data = xgb.DMatrix(X_vl, label=y_vl)
        evallist = [(train_data, 'train'), (valid_data, 'valid')]
        # Keys must match XGBoost's parameter names exactly: the trailing
        # spaces previously present in 'lambda ' and 'colsample_bytree ' made
        # them different keys from the real parameter names.
        parms = {'max_depth': 15,  # maximum depth of a tree 8 12
                 # NOTE(review): 'reg:linear' is the legacy spelling of
                 # 'reg:squarederror'; switch once the installed version requires it.
                 'objective': 'reg:linear',
                 'eta': 0.05,  # 0.3
                 'subsample': 0.9,  # SGD will use this percentage of data 0.8 0.99
                 'lambda': 3,  # L2 regularization term, >1 more conservative 4
                 'colsample_bytree': 0.6,  # 0.9
                 'colsample_bylevel': 0.7,  # 1 0.7
                 'min_child_weight': 0.5,  # 10 0.5
                 # 'nthread': 3 ... number of cpu cores to use, default is max cores
                 'eval_metric': 'rmse'}
        # Run for up to 2k iterations, stopping early after 50 rounds without
        # improvement on the last entry of `evals` (the validation fold).
        model = xgb.train(parms, train_data, num_boost_round=2000, evals=evallist,
                          early_stopping_rounds=50, maximize=False,
                          verbose_eval=100)

    else:
        raise ValueError("Unknown model_name: {!r} (expected 'lgb', 'rf' or 'xgb')".format(model_name))

    # Name the file after the model that was actually trained; the xgb branch
    # used to write to 'rf_model_<fold>.pkl'.
    with open('{}_model_{}.pkl'.format(model_name, i), 'wb') as handle:
        pickle.dump(model, handle)
    del model, X_tr, X_vl
    gc.collect()

FOLD 0 

Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.884712	training's binary_logloss: 0.207723	valid_1's auc: 0.76691	valid_1's binary_logloss: 0.242939
[400]	training's auc: 0.945511	training's binary_logloss: 0.176518	valid_1's auc: 0.777762	valid_1's binary_logloss: 0.23814
[600]	training's auc: 0.974707	training's binary_logloss: 0.153694	valid_1's auc: 0.78238	valid_1's binary_logloss: 0.236516
[800]	training's auc: 0.988598	training's binary_logloss: 0.135554	valid_1's auc: 0.784364	valid_1's binary_logloss: 0.235885
[1000]	training's auc: 0.994883	training's binary_logloss: 0.120789	valid_1's auc: 0.785131	valid_1's binary_logloss: 0.235761
[1200]	training's auc: 0.997781	training's binary_logloss: 0.108158	valid_1's auc: 0.7858	valid_1's binary_logloss: 0.235974
