# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt
import os
import csv
from datetime import datetime
from log import _check_log_directory,_initialise_model_log



In [2]:
# Notebook-wide locations, relative to the working directory.
data_dir = './data/'  # handed to load_features when the features are read
log_dir = "./log"     # parent folder of the per-run log directories

In [3]:
# Set up logging for this run: one sub-folder of log_dir per run, named
# after the current date and time, containing a single CSV model log.
log_name = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
run_log_dir = os.path.join(log_dir, log_name)
log_filepath = os.path.join(run_log_dir, 'lighgb.csv')

# Project helpers from log.py; presumably they create the directory and
# start the CSV log file -- confirm against log.py.
_check_log_directory(run_log_dir)
_initialise_model_log(log_filepath)

Attempting to make log directory at ./log/05-02-2018_17-35-56


### Load data & features

In [4]:
from load_features import load_features

In [5]:
# load_features is a project helper: it receives the data directory and its
# three return values are unpacked as train features, test features and the
# training data (data_train supplies the `is_duplicate` labels further down).
features_train, features_test, data_train = load_features(data_dir)

### Create train and test matrices

In [6]:
# Training matrix: every feature column except the target.
# In both matrices +/-inf is first mapped to NaN, then every NaN is set to 0.
X_train = (
    features_train
    .drop(['is_duplicate'], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(value=0)
)
X_test = (
    features_test
    .replace([np.inf, -np.inf], np.nan)
    .fillna(value=0)
)

# Target vector, read from data_train rather than from features_train.
# NOTE(review): assumes data_train and features_train share the same row order -- confirm.
Y_train = data_train["is_duplicate"].values

In [7]:
# Display the names of the columns that make up the training matrix.
X_train.columns

Index(['len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
       'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
       'fuzz_WRatio',
       ...
       'num_a_q1', 'num_a_q2', 'num_y_q1', 'num_y_q2', 'num_r_q1', 'num_r_q2',
       'num_b_q1', 'num_b_q2', 'num_p_q1', 'num_p_q2'],
      dtype='object', length=155)

# Classifier: LightGBM

In [8]:
from lgb_train import lgb_train

In [9]:
# Sanity check: print the column count of the train and the test matrix
# so a mismatch between the two is visible before training.
n_train_features = len(X_train.columns)
n_test_features = len(X_test.columns)
print('Number of features on train matrix: ', n_train_features)
print('Number of features on test matrix: ', n_test_features)

Number of features on train matrix:  155
Number of features on test matrix:  155


## A. Grid search

In [None]:
# Exhaustive search over (num_leaves, lambda_l2); all other settings are fixed.
# Each combination is printed and passed to lgb_train together with the log path.
RANDOM_SEED = 2017
search_grid = [
    (leaves, l2)
    for leaves in [120, 130, 140]
    for l2 in [1.8, 1.9, 2]
]

for num_leaves, lambda_l2 in search_grid:
    # A fresh parameter dict is built for every combination.
    # NOTE(review): per the LightGBM parameter docs 'colsample_bytree' is an
    # alias of 'feature_fraction' (and 'subsample' of 'bagging_fraction');
    # the two are given different values here (0.486 vs 1) -- confirm which
    # one lgb_train / LightGBM actually honours.
    lgb_params = {
        # task
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        # searched / tuned values
        'feature_fraction': 0.486,
        'num_leaves': num_leaves,
        'lambda_l2': lambda_l2,
        # training schedule
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 50,
        # tree shape and sampling
        'max_depth': 25,
        'min_data_in_leaf': 15,
        'subsample': 1,
        'colsample_bytree': 1,
        'verbose': 1,
        # seeds
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }
    print(lgb_params)
    lgb_train(X_train, X_test, Y_train, lgb_params, log_filepath)

## B. Prediction

In [10]:
# Best parameters (found by Abderrahim). num_leaves and lambda_l2 are the
# middle values of the grid explored in section A; everything else matches
# the fixed settings used there.
RANDOM_SEED = 2017
lgb_params = {
    # task
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'device': 'cpu',
    # tuned values
    'feature_fraction': 0.486,
    'num_leaves': 130,
    'lambda_l2': 1.9,
    # training schedule
    'learning_rate': 0.01,
    'num_boost_round': 5000,
    'early_stopping_rounds': 50,
    # tree shape and sampling
    'max_depth': 25,
    'min_data_in_leaf': 15,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbose': 1,
    # seeds
    'bagging_fraction_seed': RANDOM_SEED,
    'feature_fraction_seed': RANDOM_SEED,
}

In [11]:
# Final run with the selected parameters: 5 folds, test prediction enabled.
# The value returned by lgb_train is kept as feat_imp and inspected below.
feat_imp = lgb_train(
    X_train,
    X_test,
    Y_train,
    lgb_params,
    log_filepath,
    test_prediction=True,
    num_folds=5,
)



Fold 1: 1430 rounds, training loss 0.038775, validation loss 0.116121
Fold 2: 1323 rounds, training loss 0.041690, validation loss 0.121930
Fold 3: 1335 rounds, training loss 0.040846, validation loss 0.123064
Fold 4: 1594 rounds, training loss 0.034066, validation loss 0.121190
Fold 5: 1176 rounds, training loss 0.045657, validation loss 0.127973
Final CV val score: [0.11612060288900782, 0.12192976287900659, 0.12306416845550262, 0.12119010626641454, 0.1279730785951698]
Final mean CV val score: 0.12205554381702026

Make submission file...
Submission file written !


In [12]:
# Show the row(s) of feat_imp whose 'column' entry is 'q2_neigh'.
feat_imp[feat_imp['column']=='q2_neigh']

Unnamed: 0,column,importance
57,q2_neigh,632


## C. Manual validation (single hold-out split)

In [9]:
# Single hold-out split by position: the first 60000 rows are used for
# fitting, all remaining rows for validation. No shuffling happens here.
holdout_start = 60000
X_train_feat = X_train
X_train_values, X_fold_val = X_train[:holdout_start].values, X_train[holdout_start:].values
y_train_values, y_fold_val = Y_train[:holdout_start], Y_train[holdout_start:]

In [11]:
# Number of rows in the fitting part of the hold-out split.
len(X_train_values)

60000

In [14]:
import lightgbm as lgb
# Train on a shallow copy of the current parameter dict.
lgb_params = lgb_params.copy()

# Wrap both parts of the hold-out split as LightGBM datasets.
lgb_data_train = lgb.Dataset(X_train_values, label=y_train_values)
lgb_data_val = lgb.Dataset(X_fold_val, label=y_fold_val)

# Passed to lgb.train, which records the per-round metric values in it.
evals_result = {}

# Both sets are evaluated each round; round count and early-stopping
# patience are taken from the parameter dict itself.
model = lgb.train(
    lgb_params,
    lgb_data_train,
    valid_sets=[lgb_data_train, lgb_data_val],
    evals_result=evals_result,
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    verbose_eval=False,
)

# Metric history per evaluated set, looked up under the 'training' and
# 'valid_1' keys of evals_result.
metric_name = lgb_params['metric']
fold_train_scores = evals_result['training'][metric_name]
fold_val_scores = evals_result['valid_1'][metric_name]



In [15]:
# Last recorded metric value on the fitting and on the validation part.
# NOTE(review): with early stopping the last round is not necessarily the
# best one -- confirm this is the number that should be reported.
final_train_loss = fold_train_scores[-1]
final_val_loss = fold_val_scores[-1]
print(final_train_loss)
print(final_val_loss)

# One row per feature with the importance reported by the model,
# sorted by ascending importance.
importance_table = pd.DataFrame({
    'column': list(X_train.columns),
    'importance': model.feature_importance(),
})
feat_imp = importance_table.sort_values(by='importance')

0.038801477712497756
0.12940948953510495


In [15]:
# Show the row(s) of feat_imp whose 'column' entry is 'core3'.
feat_imp[feat_imp['column']=='core3']

Unnamed: 0,column,importance
37,core3,496
