## Notebook 3 : *Data Modelling* 
This notebook is for modelling. 

---------------------------------------

## Table of contents 

### 1. Data preparation
 - 1.1 Load Data
 - 1.2 Optimize memory
 
### 2. Building  Light Gradient Boosting model

---

## 1. Data preparation
## 1.1 Load feature engineered Data

In [1]:
import pandas as pd
from pandas import Series
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.1)
sns.set_style('whitegrid')
from datetime import datetime 

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('C:/Users/dj.lee/Desktop/data/train/train_preprocessing.csv')
print('Train data:', df_train.shape)

df_test = pd.read_csv('C:/Users/dj.lee/Desktop/data/test/test_preprocessing.csv')
print('Test data:', df_test.shape)

del df_train['Date']
del df_train['MachineIdentifier']

del df_test['Date']
del df_test['MachineIdentifier']

Train data: (8921483, 60)
Test data: (7853253, 59)


In [3]:
df_train['Lag_from_Start'].fillna(df_train['Lag_from_Start'].mode()[0], inplace=True)
df_test['Lag_from_Start'].fillna(df_test['Lag_from_Start'].mode()[0], inplace=True)

df_train['Hard_Ratio_for_OS'].fillna(df_train['Hard_Ratio_for_OS'].mode()[0], inplace=True)
df_test['Hard_Ratio_for_OS'].fillna(df_test['Hard_Ratio_for_OS'].mode()[0], inplace=True)

## 1.2 Optimize memory

In [4]:
def reduce_memory(df,f):
    mx = df[f].max()
    if mx < 256:
            df[f] = df[f].astype('uint8')
    elif mx < 65536:
        df[f] = df[f].astype('uint16')
    else:
        df[f] = df[f].astype('uint32')

In [5]:
used_features = list(df_train.columns)
used_features.remove('HasDetections')

In [6]:
print('Optimizing memory...')
for f in used_features:
    reduce_memory(df_train, f)
    reduce_memory(df_test, f)

Optimizing memory...


## 2. Building Light Gradient Boosting model
In this project, I build light gradient boosting model.  

In [7]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

pred_val = np.zeros(len(df_test))
folds = StratifiedKFold(n_splits=5, shuffle=True)
selected_features = used_features

ct = 0
for idxT, idxV in folds.split(df_train[selected_features], df_train['HasDetections']):
    # TRAIN LGBM
    ct += 1; print('####### FOLD ',ct,'#########')
    df_trainA = df_train.loc[idxT]
    df_trainB = df_train.loc[idxV]
    model = lgb.LGBMClassifier(n_estimators=10000, colsample_bytree=0.5, objective='binary', num_leaves=2048,
            max_depth=-1, learning_rate=0.04)
    h=model.fit(df_trainA[selected_features], df_trainA['HasDetections'], eval_metric='auc',
            eval_set=[(df_trainB[selected_features], df_trainB['HasDetections'])], verbose=200,
            early_stopping_rounds=100)
    
    # PREDICT TEST
    del df_trainA, df_trainB; x=gc.collect()
    idx = 0; ct2 = 1; chunk = 200
    print('Predicting test...')
    while idx < len(df_test):
        idx2 = min(idx + chunk, len(df_test))
        idx = range(idx, idx2)
        pred_val[idx] += model.predict_proba(df_test.iloc[idx][selected_features])[:,1]
        #print('Finished predicting part',ct2)
        ct2 += 1; idx = idx2

pred_val_final = pred_val/5

####### FOLD  1 #########
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.72292	valid_0's binary_logloss: 0.608365
[400]	valid_0's auc: 0.724495	valid_0's binary_logloss: 0.606954
[600]	valid_0's auc: 0.724797	valid_0's binary_logloss: 0.60669
[800]	valid_0's auc: 0.724868	valid_0's binary_logloss: 0.606636
Early stopping, best iteration is:
[831]	valid_0's auc: 0.724884	valid_0's binary_logloss: 0.606624
Predicting test...
####### FOLD  2 #########
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.722532	valid_0's binary_logloss: 0.6088
[400]	valid_0's auc: 0.724062	valid_0's binary_logloss: 0.607432
[600]	valid_0's auc: 0.724332	valid_0's binary_logloss: 0.607178
[800]	valid_0's auc: 0.724383	valid_0's binary_logloss: 0.607119
Early stopping, best iteration is:
[754]	valid_0's auc: 0.724395	valid_0's binary_logloss: 0.607114
Predicting test...
####### FOLD  3 #########
Training until validation scores don't imp

In [8]:
Id_data = pd.read_csv('C:/Users/dj.lee/Desktop/data/test/test.csv', usecols=['MachineIdentifier'])

pre = pd.DataFrame({'MachineIdentifier': Id_data['MachineIdentifier'], 'HasDetections': pred_val_final})
pre.to_csv('C:/Users/dj.lee/Desktop/data/test/pre_lgbm.csv', index=False)

In [139]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random

In [286]:
chunksize = 3000000
SGD = SGDClassifier(loss='log')

for df_train in pd.read_csv('C:/Users/dj.lee/Desktop/data/train/train_preprocessing.csv', chunksize=chunksize, iterator=True):
    del df_train['Date']
    del df_train['MachineIdentifier']
    df_train['Lag_from_Start'].fillna(df_train['Lag_from_Start'].mode()[0], inplace=True)
    df_train['Hard_Ratio_for_OS'].fillna(df_train['Hard_Ratio_for_OS'].mode()[0], inplace=True)
    used_features = df_train.columns.tolist()
    used_features.remove('HasDetections')
    
    X = df_train[used_features]
    Y = df_train['HasDetections']
    SGD.partial_fit(X, Y, classes=np.unique(Y))

In [287]:
pred_val = SGD.predict_proba(df_test)
result_sgdc = pred_val[:,1].tolist()

In [288]:
#result_lgbm = pd.read_csv('C:/Users/dj.lee/Desktop/data/test/pre_lgbm.csv')
result = pd.DataFrame({'MachineIdentifier':result_lgbm['MachineIdentifier'] ,'lgbm':result_lgbm['HasDetections'], 'sgdc':result_sgdc })

In [289]:
lgbm = result_lgbm['HasDetections'] > 0.5
lgbm = np.array(lgbm).astype(float)
result['lgbm_01'] = lgbm

sgdc = np.array(result_sgdc)  > 0.5
sgdc = sgdc.astype(float)
result['sgdc_01'] = sgdc

In [290]:
sum(result['lgbm_01'] == result['sgdc_01']) / len(df_test)

0.591196412493014