# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt



### Load data

In [2]:
# --- GloVe-based pairwise features ------------------------------------------
# The raw question text is removed from both frames; the test file additionally
# carries id columns, which are removed as well.
features_train = pd.read_csv('./data/train_features_glove.csv', encoding='latin-1')
features_test = pd.read_csv('./data/test_features_glove.csv', encoding='latin-1')
features_train = features_train.drop(['question1', 'question2'], axis='columns')
features_test = features_test.drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis='columns')

# --- Labels -----------------------------------------------------------------
# train.csv is read with explicit column names; only the target is kept.
train_csv_columns = ["id", "qid1", "qid2", "question1", "question2", "is_duplicate"]
data_train = pd.read_csv('./data/train.csv', names=train_csv_columns)
Y_train = data_train["is_duplicate"].values

# --- PageRank of each question ----------------------------------------------
pagerank_feats_train = pd.read_csv("data/train_pagerank.csv")
pagerank_feats_test = pd.read_csv("data/test_pagerank.csv")

# All remaining feature files use their first column as the row index.

# --- Question frequency -----------------------------------------------------
train_question_freq = pd.read_csv('./data/train_question_freq.csv', index_col=0)
test_question_freq = pd.read_csv('./data/test_question_freq.csv', index_col=0)

# --- Intersection of questions ----------------------------------------------
train_question_inter = pd.read_csv('./data/train_question_inter.csv', index_col=0)
test_question_inter = pd.read_csv('./data/test_question_inter.csv', index_col=0)

# --- K-cores ----------------------------------------------------------------
train_kcores = pd.read_csv('./data/train_kcores.csv', index_col=0)
test_kcores = pd.read_csv('./data/test_kcores.csv', index_col=0)

# --- Per-question K-cores ---------------------------------------------------
train_question_kcores = pd.read_csv('./data/train_question_kcores.csv', index_col=0)
test_question_kcores = pd.read_csv('./data/test_question_kcores.csv', index_col=0)

# --- TF-IDF -----------------------------------------------------------------
train_tfidf = pd.read_csv('./data/train_tfidf.csv', index_col=0)
test_tfidf = pd.read_csv('./data/test_tfidf.csv', index_col=0)

# --- Graph features ---------------------------------------------------------
train_graph_feat = pd.read_csv('./data/train_graph_feat.csv', index_col=0)
test_graph_feat = pd.read_csv('./data/test_graph_feat.csv', index_col=0)

In [3]:
#train_question_kcores.dtypes

### Add features

In [4]:
# Each entry lists the columns of one feature family together with the frames
# they are copied from: (columns, training source, test source).
feature_groups = [
    # Pagerank features
    (["q1_pr", "q2_pr"],
     pagerank_feats_train, pagerank_feats_test),
    # Question frequency features
    (["q1_hash", "q2_hash", "q1_freq", "q2_freq"],
     train_question_freq, test_question_freq),
    # Intersection of questions features
    (['q1_q2_intersect'],
     train_question_inter, test_question_inter),
    # K-cores
    (["core1", "core2", "core3"],
     train_kcores, test_kcores),
    # Question K-cores features
    (['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff',
      'q1_q2_kcores_diff_normed'],
     train_question_kcores, test_question_kcores),
    # TF-IDF features
    (['word_match', 'tfidf_wm', 'tfidf_wm_stops', 'jaccard', 'wc_diff', 'wc_ratio',
      'wc_diff_unique', 'wc_ratio_unique', 'wc_diff_unq_stop', 'wc_ratio_unique_stop',
      'same_start', 'char_diff', 'char_diff_unq_stop', 'total_unique_words',
      'total_unq_words_stop', 'char_ratio'],
     train_tfidf, test_tfidf),
    # Graph features
    (['q1_neigh', 'q2_neigh', 'common_neigh', 'distinct_neigh', 'clique_size'],
     train_graph_feat, test_graph_feat),
]

# Copy every column into the train and test feature frames. Column assignment
# aligns the source on the row index of the destination frame.
for group_columns, train_source, test_source in feature_groups:
    for column in group_columns:
        features_train[column] = train_source[column]
        features_test[column] = test_source[column]




In [5]:
# Build cleaned copies of the feature frames: infinities are first turned into
# NaN, then every NaN is replaced by 0.
non_finite_values = [np.inf, -np.inf]
X_train = features_train.replace(non_finite_values, np.nan).fillna(value=0)
X_test = features_test.replace(non_finite_values, np.nan).fillna(value=0)

In [6]:
# Inspect the column labels of the cleaned training frame.
X_train.columns

Index(['is_duplicate', 'len_q1', 'len_q2', 'diff_len', 'len_char_q1',
       'len_char_q2', 'len_word_q1', 'len_word_q2', 'common_words',
       'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'wmd', 'norm_wmd',
       'cosine_distance', 'cityblock_distance', 'jaccard_distance',
       'canberra_distance', 'euclidean_distance', 'minkowski_distance',
       'braycurtis_distance', 'skew_q1vec', 'skew_q2vec', 'kur_q1vec',
       'kur_q2vec', 'q1_pr', 'q2_pr', 'q1_hash', 'q2_hash', 'q1_freq',
       'q2_freq', 'q1_q2_intersect', 'core1', 'core2', 'core3', 'q1_kcores',
       'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff',
       'q1_q2_kcores_diff_normed', 'word_match', 'tfidf_wm', 'tfidf_wm_stops',
       'jaccard', 'wc_diff', 'wc_ratio', 'wc_diff_unique', 'wc_ratio_unique',
       'wc_diff_unq_stop', 'wc_ratio_unique_stop', 'same_start', 'char_diff',

# Classifier

### Feature selection

In [7]:
# Every column except the target is used as a predictor.
predictors = [column for column in X_train.columns if column != 'is_duplicate']

# Baseline gradient-boosting configuration.
xgb1_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_settings)

# Cross-validation settings.
cv_folds = 5
early_stopping_rounds = 50

# `alg` is another name for the same estimator object as `xgb1`.
alg = xgb1
dtrain = X_train.copy()
xgb_param = alg.get_xgb_params()

# Native xgboost matrix built from the predictor values and the target column.
xgtrain = xgb.DMatrix(
    dtrain[predictors].values,
    label=dtrain['is_duplicate'].values,
)

In [9]:
# Cross-validate with AUC and early stopping, starting from the estimator's
# current n_estimators as the maximum number of boosting rounds.
max_boost_rounds = alg.get_params()['n_estimators']
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=max_boost_rounds,
    nfold=cv_folds,
    metrics='auc',
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

# One row of `cvresult` per retained boosting round: use that count as the
# estimator's n_estimators.
alg.set_params(n_estimators=len(cvresult))

[0]	train-auc:0.967541+0.000310861	test-auc:0.966784+0.00115294
[1]	train-auc:0.970366+0.000606985	test-auc:0.969575+0.0011814
[2]	train-auc:0.9712+0.000666022	test-auc:0.970474+0.00103991
[3]	train-auc:0.972347+0.000246138	test-auc:0.971358+0.00102423
[4]	train-auc:0.972876+0.000335763	test-auc:0.971885+0.00109519
[5]	train-auc:0.973467+0.00046871	test-auc:0.972317+0.00124049
[6]	train-auc:0.974111+0.000331248	test-auc:0.973105+0.000576025
[7]	train-auc:0.974745+0.0003369	test-auc:0.973563+0.000859831
[8]	train-auc:0.975273+0.000279726	test-auc:0.974172+0.00108289
[9]	train-auc:0.975357+0.00028988	test-auc:0.974294+0.00109535
[10]	train-auc:0.975637+0.000309198	test-auc:0.974563+0.00108223
[11]	train-auc:0.975689+0.000343951	test-auc:0.974639+0.00107698
[12]	train-auc:0.975814+0.000276285	test-auc:0.974756+0.00100001
[13]	train-auc:0.975993+0.000178627	test-auc:0.974954+0.000862948
[14]	train-auc:0.976029+0.000209001	test-auc:0.974964+0.000821516
[15]	train-auc:0.976251+0.000160263	te

[125]	train-auc:0.988452+0.000165105	test-auc:0.983119+0.000560455
[126]	train-auc:0.988493+0.000164218	test-auc:0.983141+0.000557126
[127]	train-auc:0.988539+0.000151702	test-auc:0.983154+0.000553873
[128]	train-auc:0.988592+0.000144421	test-auc:0.983169+0.000566975
[129]	train-auc:0.988645+0.000144183	test-auc:0.983197+0.000572923
[130]	train-auc:0.988728+0.000152016	test-auc:0.983207+0.000568486
[131]	train-auc:0.988775+0.000151401	test-auc:0.983221+0.000568468
[132]	train-auc:0.988815+0.000158391	test-auc:0.98323+0.000559415
[133]	train-auc:0.988868+0.000145784	test-auc:0.98325+0.000564509
[134]	train-auc:0.98891+0.000129979	test-auc:0.983253+0.000558079
[135]	train-auc:0.988968+0.000129164	test-auc:0.983266+0.000567393
[136]	train-auc:0.989008+0.000124811	test-auc:0.983284+0.000575359
[137]	train-auc:0.98906+0.000131662	test-auc:0.9833+0.000575554
[138]	train-auc:0.989119+0.000133989	test-auc:0.983323+0.000570985
[139]	train-auc:0.989164+0.000142092	test-auc:0.98334+0.000564237
[1

[248]	train-auc:0.993366+4.26915e-05	test-auc:0.984116+0.000607082
[249]	train-auc:0.993386+3.43709e-05	test-auc:0.984113+0.000606877
[250]	train-auc:0.993415+3.58608e-05	test-auc:0.984123+0.000606029
[251]	train-auc:0.99344+3.30491e-05	test-auc:0.984131+0.000603815
[252]	train-auc:0.993473+3.44523e-05	test-auc:0.984134+0.000597111
[253]	train-auc:0.993501+3.29217e-05	test-auc:0.984141+0.000594626
[254]	train-auc:0.993531+4.2926e-05	test-auc:0.984144+0.000589774
[255]	train-auc:0.99357+4.45349e-05	test-auc:0.984143+0.000587278
[256]	train-auc:0.993598+4.50821e-05	test-auc:0.984145+0.000585532
[257]	train-auc:0.993627+4.63396e-05	test-auc:0.984154+0.000589205
[258]	train-auc:0.993663+5.24198e-05	test-auc:0.984163+0.000581672
[259]	train-auc:0.993697+5.06296e-05	test-auc:0.984167+0.000580617
[260]	train-auc:0.993727+5.12117e-05	test-auc:0.984172+0.000578395
[261]	train-auc:0.993745+5.24481e-05	test-auc:0.984172+0.000579176
[262]	train-auc:0.993781+4.7579e-05	test-auc:0.984188+0.000585211

[372]	train-auc:0.996247+8.1094e-05	test-auc:0.984454+0.000546497
[373]	train-auc:0.996269+8.23374e-05	test-auc:0.984451+0.000542301
[374]	train-auc:0.996284+8.31803e-05	test-auc:0.984456+0.000538582
[375]	train-auc:0.996298+8.5448e-05	test-auc:0.984453+0.000535859
[376]	train-auc:0.996314+8.57499e-05	test-auc:0.984452+0.000534479
[377]	train-auc:0.996329+8.04627e-05	test-auc:0.984453+0.000538436
[378]	train-auc:0.996345+7.62952e-05	test-auc:0.984451+0.000540627
[379]	train-auc:0.996362+7.80733e-05	test-auc:0.984457+0.000536831
[380]	train-auc:0.996382+8.29988e-05	test-auc:0.984463+0.000531973
[381]	train-auc:0.996398+8.35904e-05	test-auc:0.984467+0.000528771
[382]	train-auc:0.996414+8.36502e-05	test-auc:0.984466+0.000528252
[383]	train-auc:0.996433+8.82655e-05	test-auc:0.984466+0.000531774
[384]	train-auc:0.996447+9.42749e-05	test-auc:0.984468+0.000531347
[385]	train-auc:0.996463+9.65671e-05	test-auc:0.98447+0.000532833
[386]	train-auc:0.996484+9.64104e-05	test-auc:0.984482+0.00053481

[496]	train-auc:0.997931+4.62731e-05	test-auc:0.984634+0.000586929
[497]	train-auc:0.997939+4.7756e-05	test-auc:0.984635+0.000586352
[498]	train-auc:0.997949+4.58022e-05	test-auc:0.984633+0.00058531
[499]	train-auc:0.99796+4.5208e-05	test-auc:0.984636+0.000583512
[500]	train-auc:0.997971+4.56745e-05	test-auc:0.984631+0.000592525
[501]	train-auc:0.99798+4.52637e-05	test-auc:0.98463+0.000590723
[502]	train-auc:0.997991+4.76336e-05	test-auc:0.984634+0.00059319
[503]	train-auc:0.998001+4.63526e-05	test-auc:0.984641+0.000591081
[504]	train-auc:0.99801+4.58109e-05	test-auc:0.984641+0.000591856
[505]	train-auc:0.998022+4.74359e-05	test-auc:0.984644+0.000590587
[506]	train-auc:0.998028+4.57191e-05	test-auc:0.984641+0.000589718
[507]	train-auc:0.998039+4.56535e-05	test-auc:0.984643+0.000586692
[508]	train-auc:0.99805+4.71041e-05	test-auc:0.984649+0.000582836
[509]	train-auc:0.998059+4.84908e-05	test-auc:0.984644+0.000577843
[510]	train-auc:0.998069+4.84908e-05	test-auc:0.984645+0.000576636
[511

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=523,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)

In [None]:
# Training inputs: the predictor columns and the target column of `dtrain`.
train_inputs = dtrain[predictors]
train_target = dtrain['is_duplicate']

# Fit the estimator on the training data.
alg.fit(train_inputs, train_target, eval_metric='auc', verbose=True)

# Predictions on the training set itself: hard labels and the second
# probability column.
dtrain_predictions = alg.predict(train_inputs)
dtrain_predprob = alg.predict_proba(train_inputs)[:, 1]

In [18]:
# Metrics are computed on the training data the model was fitted on.
train_labels = dtrain['is_duplicate']
train_accuracy = metrics.accuracy_score(train_labels.values, dtrain_predictions)
train_auc = metrics.roc_auc_score(train_labels, dtrain_predprob)

print("\nModel Report")
print("Accuracy : %.4g" % train_accuracy)
print("AUC Score (Train): %f" % train_auc)


Model Report
Accuracy : 0.9741
AUC Score (Train): 0.997372


In [20]:
# Show the concrete class of the estimator.
type(alg)
# The feature-importance bar chart below is disabled, so this cell does not
# define `feat_imp`.
#feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#feat_imp.plot(kind='bar', title='Feature Importances')
#plt.ylabel('Feature Importance Score')
#plt.show()

xgboost.sklearn.XGBClassifier

In [None]:
# `feat_imp` was only ever defined in commented-out code, so printing it raised
# NameError on a fresh kernel. Compute it here from the fitted model:
# get_fscore() reports, per feature, how often it is used to split; the Series
# is sorted from most to least used.
feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
print(feat_imp)

In [None]:
# Columns excluded from the final model: the target and a set of features that
# are dropped after the feature-selection step.
train_columns_to_drop = [
    'question1', 'question2', 'is_duplicate',
    'cosine_distance', 'jaccard_distance', 'euclidean_distance', 'norm_wmd',
    'fuzz_WRatio', 'len_word_q2', 'len_word_q1', 'minkowski_distance',
    'braycurtis_distance',
]
# errors='ignore': 'question1'/'question2' were already removed when the
# features were loaded, so a strict drop raises KeyError.
# NOTE(review): this starts from `features_train`, not from the inf/NaN-cleaned
# frame built earlier - confirm that skipping the cleaning is intended.
X_train = features_train.drop(train_columns_to_drop, axis=1, errors='ignore')

# The cross-validation cells read the training features as `X_train1`, which no
# cell defined; bind it to the same reduced frame.
X_train1 = X_train

In [None]:
# Columns excluded from the test features, mirroring the training selection.
test_columns_to_drop = [
    'qid1', 'id', 'qid2', 'question1', 'question2',
    'cosine_distance', 'jaccard_distance', 'euclidean_distance', 'norm_wmd',
    'fuzz_WRatio', 'len_word_q2', 'len_word_q1', 'minkowski_distance',
    'braycurtis_distance',
]
# errors='ignore': the id and question-text columns were already removed when
# the features were loaded, so a strict drop raises KeyError.
# NOTE(review): this starts from `features_test`, not from the inf/NaN-cleaned
# frame built earlier - confirm that skipping the cleaning is intended.
X_test = features_test.drop(test_columns_to_drop, axis=1, errors='ignore')

# The prediction and cross-validation cells read the test features as
# `X_test1`, which no cell defined; bind it to the same reduced frame.
X_test1 = X_test

In [None]:
# Fit the classifier on the current X_train against the training labels.
xgb1.fit(X_train,Y_train)

In [None]:
# Score the test features; the second probability column is what gets written.
y_pred1 = xgb1.predict_proba(X_test1)

# One "Id,Score" line per test row, with the row position used as the id.
with open("submission_file.csv", 'w') as submission:
    submission.write("Id,Score\n")
    for row_id, row_probabilities in enumerate(y_pred1):
        submission.write(str(row_id) + ',' + str(row_probabilities[1]) + '\n')

## Cross-validation

#### LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# Cross-validation configuration: fold count and the seed used for both the
# NumPy global generator and the fold shuffling.
NUM_FOLDS = 5
RANDOM_SEED = 2017

np.random.seed(RANDOM_SEED)

In [None]:
# Shuffled, stratified splitter with a fixed seed.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# One column of test predictions per fold, plus the per-fold validation scores.
n_test_rows = len(X_test1)
y_test_pred = np.zeros(shape=(n_test_rows, NUM_FOLDS))
cv_scores = list()

In [None]:
# LightGBM is fed plain arrays. This rebinds X_train / X_test to NumPy arrays;
# X_train1 / X_test1 keep the DataFrames.
X_train = X_train1.values
X_test = X_test1.values

# The hyper-parameters are the same for every fold, so they are built once
# instead of on every iteration.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'device': 'cpu',
    'feature_fraction': 0.486,
    'num_leaves': 158,
    'lambda_l2': 50,
    'learning_rate': 0.01,
    'num_boost_round': 5000,
    'early_stopping_rounds': 10,
    'verbose': 1,
    'bagging_fraction_seed': RANDOM_SEED,
    'feature_fraction_seed': RANDOM_SEED,
}

for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, Y_train)):
    # The original message was a plain string without the f-prefix, so it
    # printed the placeholders literally; format the values explicitly.
    print('Fitting fold {} of {}'.format(fold_num + 1, kfold.n_splits))

    print(len(ix_train))
    print(X_train.shape)
    X_fold_train = X_train[ix_train, :]
    X_fold_val = X_train[ix_val, :]

    y_fold_train = Y_train[ix_train]
    y_fold_val = Y_train[ix_val]

    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)
    # Filled by lgb.train with the metric history of each validation set.
    evals_result = {}

    model = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_val],
        evals_result=evals_result,
        num_boost_round=lgb_params['num_boost_round'],
        early_stopping_rounds=lgb_params['early_stopping_rounds'],
        verbose_eval=False,
    )

    # 'training' is the first valid set (the fold's training data) and
    # 'valid_1' the second (the held-out part of the fold).
    fold_train_scores = evals_result['training'][lgb_params['metric']]
    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]

    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
        fold_num + 1,
        len(fold_train_scores),
        fold_train_scores[-1],
        fold_val_scores[-1],
    ))
    print()

    # Keep the last recorded validation loss and this fold's test predictions.
    cv_scores.append(fold_val_scores[-1])
    y_test_pred[:, fold_num] = model.predict(X_test).reshape(-1)

In [None]:
# Feature importances of the model from the last fold, least important first.
# The column names come from X_train1: X_train was rebound to a NumPy array
# for LightGBM and has no `.columns`.
pd.DataFrame({
    'column': list(X_train1.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance')

In [None]:
# `final_cv_score` was never assigned anywhere; define it as the mean of the
# per-fold validation losses collected in `cv_scores`.
final_cv_score = np.mean(cv_scores)
print('Final CV score:', final_cv_score)

In [None]:
# Average the per-fold test predictions row by row.
y_test = y_test_pred.mean(axis=1)

# One "Id,Score" line per test row, with the row position used as the id.
with open("submission_file.csv", 'w') as submission:
    submission.write("Id,Score\n")
    for row_id, score in enumerate(y_test):
        submission.write(str(row_id) + ',' + str(score) + '\n')

#### XGBoost

In [None]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
import xgboost as xgb

def _rebalance(features, labels):
    """Resample one split so that negatives are duplicated and positives trimmed.

    Returns the negatives, the first 80% of the positives, then the negatives
    again (in that row order), together with the matching 0/1 label array.
    """
    positives = features[labels == 1]
    negatives = features[labels == 0]
    kept_positives = positives.iloc[:int(0.8 * len(positives))]
    resampled = pd.concat((negatives, kept_positives, negatives))
    resampled_labels = np.array(
        [0] * negatives.shape[0] + [1] * kept_positives.shape[0] + [0] * negatives.shape[0])
    return resampled, resampled_labels


# Hold out 10% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(X_train1, Y_train, test_size=0.1, random_state=4242)

# Apply the same resampling to both splits and print the resulting share of
# positives in each.
X_train, y_train = _rebalance(X_train, y_train)
print(np.mean(y_train))

X_valid, y_valid = _rebalance(X_valid, y_valid)
print(np.mean(y_valid))

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 7,
    'subsample': 0.6,
    'base_score': 0.2,
    # 'scale_pos_weight': 0.2,
}

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 2500 rounds, stopping after 50 rounds without improvement on the last
# watchlist entry; progress is printed every 50 rounds.
bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
print(log_loss(y_valid, bst.predict(d_valid)))

# Build the test matrix from the X_test1 DataFrame so that it carries the same
# feature names as the training matrix. `X_test` is a bare NumPy array once the
# LightGBM cell has run, which makes predict fail on a feature-name mismatch.
d_test = xgb.DMatrix(X_test1)
p_test = bst.predict(d_test)