# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt
import os
import csv
from datetime import datetime
from log import _check_log_directory,_initialise_model_log



In [2]:
# Input data location and root folder for training logs.
data_dir = "./data/"
log_dir = './log'

# Every run gets its own sub-directory named after the start time
# (dd-mm-YYYY_HH-MM-SS), holding the CSV log passed to the training helper.
log_name = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
run_log_dir = os.path.join(log_dir, log_name)
log_filepath = os.path.join(run_log_dir, 'lighgb.csv')

# Project helpers from log.py; presumably they create the run directory and
# the empty log file -- verify against log.py.
_check_log_directory(run_log_dir)
_initialise_model_log(log_filepath)

Attempting to make log directory at ./log/29-01-2018_19-19-06


### Load data

In [3]:
def _read_indexed(filename):
    """Read a feature CSV from data_dir, using its first column as the row index."""
    return pd.read_csv(data_dir + filename, sep=',', index_col=0)


# Glove features: the raw question text (and, for test, the id columns) is
# dropped so that only feature columns remain.
features_train = pd.read_csv(
    data_dir + 'train_features_glove.csv', sep=',', encoding='latin-1'
).drop(['question1', 'question2'], axis=1)
features_test = pd.read_csv(
    data_dir + 'test_features_glove.csv', sep=',', encoding='latin-1'
).drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis=1)

# Raw training pairs; column names are supplied explicitly
# (assumes train.csv has no header row -- TODO confirm).
data_train = pd.read_csv(
    data_dir + 'train.csv',
    sep=',',
    names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
)

# Pagerank features (read with the default integer index, unlike the files below)
pagerank_feats_train = pd.read_csv(data_dir + "train_pagerank.csv", sep=',')
pagerank_feats_test = pd.read_csv(data_dir + "test_pagerank.csv", sep=',')

# Question frequency
train_question_freq = _read_indexed('train_question_freq.csv')
test_question_freq = _read_indexed('test_question_freq.csv')

# Intersection of questions
train_question_inter = _read_indexed('train_question_inter.csv')
test_question_inter = _read_indexed('test_question_inter.csv')

# K-cores
train_kcores = _read_indexed('train_kcores.csv')
test_kcores = _read_indexed('test_kcores.csv')

# Question K-cores
train_question_kcores = _read_indexed('train_question_kcores.csv')
test_question_kcores = _read_indexed('test_question_kcores.csv')

# TF-IDF
train_tfidf = _read_indexed('train_tfidf.csv')
test_tfidf = _read_indexed('test_tfidf.csv')

# Graph features
train_graph_feat = _read_indexed('train_graph_feat.csv')
test_graph_feat = _read_indexed('test_graph_feat.csv')

# Bigram features
train_bigram_feat = _read_indexed('train_2gram_feat.csv')
test_bigram_feat = _read_indexed('test_2gram_feat.csv')

# 3-gram features
train_3gram_feat = _read_indexed('train_3gram_feat.csv')
test_3gram_feat = _read_indexed('test_3gram_feat.csv')

# spaCy features
train_spacy_feat = _read_indexed('train_spacy_features.csv')
test_spacy_feat = _read_indexed('test_spacy_features.csv')

# Graph features v2 -- DO NOT USE !!!
#train_graph_feat2 = pd.read_csv(data_dir+'train_graph_feat2.csv', sep=',', index_col=0)
#test_graph_feat2 = pd.read_csv(data_dir+'test_graph_feat2.csv', sep=',', index_col=0)

# Word features
train_word_feat = _read_indexed('train_word_feat.csv')
test_word_feat = _read_indexed('test_word_feat.csv')

### Add features

In [4]:
# Each entry is (train source, test source, columns to copy). The same column
# list is applied to both matrices, in this order, so train and test always
# end up with identical feature columns.
_feature_groups = [
    # Pagerank
    (pagerank_feats_train, pagerank_feats_test,
     ["q1_pr", "q2_pr"]),
    # Question frequency
    (train_question_freq, test_question_freq,
     ["q1_hash", "q2_hash", "q1_freq", "q2_freq"]),
    # Intersection of questions
    (train_question_inter, test_question_inter,
     ['q1_q2_intersect']),
    # K-cores
    (train_kcores, test_kcores,
     ["core1", "core2", "core3"]),
    # Question K-cores
    (train_question_kcores, test_question_kcores,
     ['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff',
      'q1_q2_kcores_diff_normed']),
    # TF-IDF
    (train_tfidf, test_tfidf,
     ['word_match', 'tfidf_wm', 'tfidf_wm_stops', 'jaccard', 'wc_diff',
      'wc_ratio', 'wc_diff_unique', 'wc_ratio_unique', 'wc_diff_unq_stop',
      'wc_ratio_unique_stop', 'same_start', 'char_diff', 'char_diff_unq_stop',
      'total_unique_words', 'total_unq_words_stop', 'char_ratio']),
    # Graph features
    (train_graph_feat, test_graph_feat,
     ['q1_neigh', 'q2_neigh', 'common_neigh', 'distinct_neigh', 'clique_size',
      'shortest_path']),
    # Bigram features
    (train_bigram_feat, test_bigram_feat,
     ['bigram_coocurence', 'bigram_distinct', 'bigram_nostpwrd_coocurence',
      'bigram_nostpwrd_distinct']),
    # 3-gram features
    (train_3gram_feat, test_3gram_feat,
     ['3gram_cooccurence', '3gram_distinct', '3gram_nostpwrd_cooccurence',
      '3gram_nostpwrd_distinct']),
    # spaCy features
    (train_spacy_feat, test_spacy_feat,
     ['spacy_similarity']),
    # Graph features v2 are deliberately left out:
    #features_train[['shortest_path']] = train_graph_feat2[['shortest_path']]
    #features_test[['shortest_path']] = test_graph_feat2[['shortest_path']]
    # Word features (question words, capital-letter counts, exact match)
    (train_word_feat, test_word_feat,
     ['q1_how', 'q2_how', 'how_both', 'q1_what', 'q2_what', 'what_both',
      'q1_which', 'q2_which', 'which_both', 'q1_who', 'q2_who', 'who_both',
      'q1_where', 'q2_where', 'where_both', 'q1_when', 'q2_when', 'when_both',
      'q1_why', 'q2_why', 'why_both', 'caps_count_q1', 'caps_count_q2',
      'diff_caps', 'exactly_same']),
]

# The assignment aligns rows on the DataFrame index, so each source must share
# its index with the target matrix (assumed to be the row number -- TODO confirm).
for _train_src, _test_src, _cols in _feature_groups:
    features_train[_cols] = _train_src[_cols]
    features_test[_cols] = _test_src[_cols]

### Create train and test matrices

In [5]:
# Infinite values are turned into NaN and every NaN is then replaced by 0,
# so both matrices end up fully finite.
_non_finite = [np.inf, -np.inf]

# Training matrix: all assembled features except the label column.
X_train = (
    features_train
    .drop(['is_duplicate'], axis=1)
    .replace(_non_finite, np.nan)
    .fillna(value=0)
)

# Test matrix: same cleaning, no label column to remove.
X_test = features_test.replace(_non_finite, np.nan).fillna(value=0)

# Labels are taken from the raw training file rather than from features_train
# (assumes both files list the pairs in the same row order -- TODO confirm).
Y_train = data_train["is_duplicate"].values

In [6]:
# Display the column index of the training matrix to check which features were assembled.
X_train.columns

Index(['len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
       'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
       'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
       'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
       'fuzz_token_sort_ratio', 'wmd', 'norm_wmd', 'cosine_distance',
       'cityblock_distance', 'jaccard_distance', 'canberra_distance',
       'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
       'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec', 'q1_pr', 'q2_pr',
       'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq', 'q1_q2_intersect', 'core1',
       'core2', 'core3', 'q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio',
       'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed', 'word_match',
       'tfidf_wm', 'tfidf_wm_stops', 'jaccard', 'wc_diff', 'wc_ratio',
       'wc_diff_unique', 'wc_ratio_unique', 'wc_diff_unq_stop',
       'wc_ratio_unique_stop', 'same_start', 'char_diff', 'char_diff_unq_

# Classifier

### A. LightGBM

In [7]:
from lgb_train import lgb_train

In [8]:
# Sanity check: train and test matrices must expose the same number of features.
n_train_features = X_train.shape[1]
n_test_features = X_test.shape[1]
print('Number of features on train matrix: ', n_train_features)
print('Number of features on test matrix: ', n_test_features)

Number of features on train matrix:  99
Number of features on test matrix:  99


### Grid search

In [None]:
RANDOM_SEED = 2017


def _lgb_grid_params(num_leaves, lr, lambda_l2):
    """Build the LightGBM parameter dict for one point of the grid.

    Only num_leaves, learning_rate and lambda_l2 vary; every other setting is
    fixed. Key order is kept stable so the printed dict is comparable across runs.
    """
    # NOTE(review): LightGBM documents 'colsample_bytree' as an alias of
    # 'feature_fraction' and 'subsample' as an alias of 'bagging_fraction';
    # both spellings of the former are set here with different values --
    # confirm which one the training helper ends up using.
    return {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        'num_leaves': num_leaves,
        'lambda_l2': lambda_l2,
        'learning_rate': lr,
        'num_boost_round': 5000,
        'early_stopping_rounds': 50,
        'max_depth': 25,
        'min_data_in_leaf': 15,
        'subsample': 1,
        'colsample_bytree': 1,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }


# Full Cartesian grid, enumerated with lambda_l2 varying fastest and
# num_leaves slowest.
_grid = [
    (n_leaves, rate, l2)
    for n_leaves in [130, 140, 150, 160]
    for rate in [0.01, 0.005, 0.001]
    for l2 in [1, 1.5, 2]
]

for num_leaves, lr, lambda_l2 in _grid:
    lgb_params = _lgb_grid_params(num_leaves, lr, lambda_l2)
    print(lgb_params)
    # Each run is passed the shared log file path along with its parameters.
    lgb_train(X_train, X_test, Y_train, lgb_params, log_filepath)

#### Prediction

In [9]:
# Abderrahim best parameters
RANDOM_SEED = 2017
lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        'num_leaves': 140,
        'lambda_l2': 2,
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 50,
        'max_depth': 25,
        'min_data_in_leaf': 15,
        'subsample': 1,
        'colsample_bytree': 1,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }

In [22]:
#lgb_params = {
#    'learning_rate': 0.005, 'colsample_bytree': 1, 'boosting': 'gbdt', 'feature_fraction': 0.486, 
#    'metric': 'binary_logloss', 'min_data_in_leaf': 15, 'verbose': 1, 'subsample': 1, 'bagging_fraction_seed': 2017, 
#    'objective': 'binary', 'num_leaves': 130, 'max_depth': 25, 'early_stopping_rounds': 50, 'lambda_l2': 1.5, 
#    'feature_fraction_seed': 2017, 'device': 'cpu', 'num_boost_round': 5000}

In [10]:
# Final run with the selected parameters: 5 folds, with test-set prediction
# enabled. The returned table is kept for inspecting feature importances.
feat_imp = lgb_train(
    X_train,
    X_test,
    Y_train,
    lgb_params,
    log_filepath,
    test_prediction=True,
    num_folds=5
)

Fitting fold {fold_num + 1} of {kfold.n_splits}




Fold 1: 1672 rounds, training loss 0.046193, validation loss 0.133867
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 2: 1453 rounds, training loss 0.052660, validation loss 0.142120
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 3: 1546 rounds, training loss 0.049528, validation loss 0.144587
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 4: 1780 rounds, training loss 0.043762, validation loss 0.141389
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 5: 1300 rounds, training loss 0.056807, validation loss 0.150178
Final CV val score: [0.13386710621237824, 0.14211991103984803, 0.14458690691737858, 0.14138932543157473, 0.15017844553049162]
Final mean CV val score: 0.14242833902633426

Make submission file...
Submission file written !


In [24]:
# Show the importance row recorded for the 'shortest_path' feature.
_is_shortest_path = feat_imp['column'] == 'shortest_path'
feat_imp.loc[_is_shortest_path]

Unnamed: 0,column,importance
64,shortest_path,828


In [None]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
import xgboost as xgb


def _rebalance_classes(X, y, pos_keep=0.8):
    """Shift the class balance of (X, y) towards the negative class.

    The negative rows are included twice and only the first ``pos_keep``
    fraction of the positive rows is kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : array-like of 0/1 labels, one per row of X.
    pos_keep : float
        Fraction of positive rows to keep (taken from the start).

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Rows ordered as negatives, kept positives, negatives, with the
        matching label vector.
    """
    positives = X[y == 1]
    negatives = X[y == 0]
    kept_positives = positives.iloc[:int(pos_keep * len(positives))]
    X_balanced = pd.concat((negatives, kept_positives, negatives))
    y_balanced = np.array(
        [0] * len(negatives) + [1] * len(kept_positives) + [0] * len(negatives))
    return X_balanced, y_balanced


# Hold out 10% of the training rows for validation.
# The split starts from X_train (the original referenced an undefined
# `X_train1`) and the fitting part is bound to a new name, X_fit, so the
# shared X_train matrix is never overwritten and this cell can be re-run.
X_fit, X_valid, y_train, y_valid = train_test_split(X_train, Y_train, test_size=0.1, random_state=4242)

# Up/down-sampling: applied to both splits; the printed value is the share of
# positives after rebalancing.
X_fit, y_train = _rebalance_classes(X_fit, y_train)
print(np.mean(y_train))

X_valid, y_valid = _rebalance_classes(X_valid, y_valid)
print(np.mean(y_valid))

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 7,
    'subsample': 0.6,
    'base_score': 0.2,
    # 'scale_pos_weight': 0.2,
}

d_train = xgb.DMatrix(X_fit, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

# The last entry of the watchlist (the validation set) drives early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
# NOTE(review): depending on the xgboost version, predict() may use every
# boosted round rather than the best early-stopped iteration -- confirm.
print(log_loss(y_valid, bst.predict(d_valid)))
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)

In [None]:
# Scratch check: displays log_dir joined with the sub-paths 'test', 'train' and '2'; nothing is created on disk.
os.path.join(log_dir,'test','train','2')