# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt



### Load data

In [2]:
# Glove features
features_train = pd.read_csv('./data/train_features_glove.csv', sep=',', encoding='latin-1')
features_test = pd.read_csv('./data/test_features_glove.csv', sep=',', encoding='latin-1')
features_train= features_train.drop(['question1', 'question2'], axis=1)
features_test = features_test.drop(['id','qid1','qid2','question1', 'question2'], axis=1)
data_train = pd.read_csv('./data/train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
Y_train=data_train["is_duplicate"].values

# Pagerank features
pagerank_feats_train = pd.read_csv("data/train_pagerank.csv", sep=',')
pagerank_feats_test = pd.read_csv("data/test_pagerank.csv", sep=',')

# Question frequency
train_question_freq = pd.read_csv('./data/train_question_freq.csv', sep=',', index_col=0)
test_question_freq = pd.read_csv('./data/test_question_freq.csv', sep=',', index_col=0)

# Intersection of questions
train_question_inter= pd.read_csv('./data/train_question_inter.csv', sep=',', index_col=0)
test_question_inter = pd.read_csv('./data/test_question_inter.csv', sep=',', index_col=0)

# K-cores
train_kcores = pd.read_csv('./data/train_kcores.csv', sep=',', index_col=0)
test_kcores = pd.read_csv('./data/test_kcores.csv', sep=',', index_col=0)

# question K-cores
train_question_kcores = pd.read_csv('./data/train_question_kcores.csv', sep=',',names = ['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed'], index_col=0)
test_question_kcores = pd.read_csv('./data/test_question_kcores.csv', sep=',',names = ['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed'], index_col=0)

# TF-IDF
train_tfidf = pd.read_csv('./data/train_tfidf.csv', sep=',', index_col=0)
test_tfidf = pd.read_csv('./data/test_tfidf.csv', sep=',', index_col=0)

# Graph features
train_graph_feat = pd.read_csv('./data/train_graph_feat.csv', sep=',', index_col=0)
test_graph_feat = pd.read_csv('./data/test_graph_feat.csv', sep=',', index_col=0)

### Add features

In [3]:
# Add Pagerank features
features_train[["q1_pr","q2_pr"]]=pagerank_feats_train[["q1_pr","q2_pr"]]
features_test[["q1_pr","q2_pr"]]=pagerank_feats_test[["q1_pr","q2_pr"]]

# Add question frequency features
features_train[["q1_hash","q2_hash","q1_freq","q2_freq"]]=train_question_freq[["q1_hash","q2_hash","q1_freq","q2_freq"]]
features_test[["q1_hash","q2_hash","q1_freq","q2_freq"]]=test_question_freq[["q1_hash","q2_hash","q1_freq","q2_freq"]]

# Add intersection of questions features
features_train['q1_q2_intersect']=train_question_inter['q1_q2_intersect']
features_test['q1_q2_intersect']=test_question_inter['q1_q2_intersect']

# Add K-cores
features_train[["core1","core2","core3"]] = train_kcores[["core1","core2","core3"]]
features_test[["core1","core2","core3"]] = test_kcores[["core1","core2","core3"]]

# Add question K-cores features
features_train[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 
                'q1_q2_kcores_diff_normed']]=train_question_kcores[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]
features_test[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 
               'q1_q2_kcores_diff_normed']]=test_question_kcores[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]

# Add TF-IDF features
features_train[['word_match','tfidf_wm','tfidf_wm_stops','jaccard','wc_diff','wc_ratio','wc_diff_unique','wc_ratio_unique','wc_diff_unq_stop','wc_ratio_unique_stop','same_start',
 'char_diff','char_diff_unq_stop','total_unique_words','total_unq_words_stop','char_ratio']]=train_tfidf[['word_match','tfidf_wm','tfidf_wm_stops','jaccard','wc_diff','wc_ratio','wc_diff_unique','wc_ratio_unique','wc_diff_unq_stop','wc_ratio_unique_stop','same_start',
 'char_diff','char_diff_unq_stop','total_unique_words','total_unq_words_stop','char_ratio']]
features_test[['word_match','tfidf_wm','tfidf_wm_stops','jaccard','wc_diff','wc_ratio','wc_diff_unique','wc_ratio_unique','wc_diff_unq_stop','wc_ratio_unique_stop','same_start',
 'char_diff','char_diff_unq_stop','total_unique_words','total_unq_words_stop','char_ratio']]=test_tfidf[['word_match','tfidf_wm','tfidf_wm_stops','jaccard','wc_diff','wc_ratio','wc_diff_unique','wc_ratio_unique','wc_diff_unq_stop','wc_ratio_unique_stop','same_start',
 'char_diff','char_diff_unq_stop','total_unique_words','total_unq_words_stop','char_ratio']]

# Add graph features
features_train[['q1_neigh','q2_neigh','common_neigh', 'distinct_neigh', 'clique_size']] = train_graph_feat[['q1_neigh','q2_neigh','common_neigh', 'distinct_neigh', 'clique_size']]
features_test[['q1_neigh','q2_neigh','common_neigh', 'distinct_neigh', 'clique_size']] = test_graph_feat[['q1_neigh','q2_neigh','common_neigh', 'distinct_neigh', 'clique_size']]




In [4]:
X_train= features_train
X_test = features_test
X_train=X_train.replace([np.inf, -np.inf], np.nan)
X_train=X_train.fillna(value=0)
X_test=X_test.replace([np.inf, -np.inf], np.nan)
X_test=X_test.fillna(value=0)

In [5]:
X_train.columns

Index(['is_duplicate', 'len_q1', 'len_q2', 'diff_len', 'len_char_q1',
       'len_char_q2', 'len_word_q1', 'len_word_q2', 'common_words',
       'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'wmd', 'norm_wmd',
       'cosine_distance', 'cityblock_distance', 'jaccard_distance',
       'canberra_distance', 'euclidean_distance', 'minkowski_distance',
       'braycurtis_distance', 'skew_q1vec', 'skew_q2vec', 'kur_q1vec',
       'kur_q2vec', 'q1_pr', 'q2_pr', 'q1_hash', 'q2_hash', 'q1_freq',
       'q2_freq', 'q1_q2_intersect', 'core1', 'core2', 'core3', 'q1_kcores',
       'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff',
       'q1_q2_kcores_diff_normed', 'word_match', 'tfidf_wm', 'tfidf_wm_stops',
       'jaccard', 'wc_diff', 'wc_ratio', 'wc_diff_unique', 'wc_ratio_unique',
       'wc_diff_unq_stop', 'wc_ratio_unique_stop', 'same_start', 'char_diff',

# Classifier

### Feature selection

In [7]:
#Choose all predictors except target & IDcols
predictors = [x for x in X_train.columns if x not in ['is_duplicate']]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [None]:
cv_folds=5
early_stopping_rounds=50
# modelfit(xgb1, X_train, predictors)
alg=xgb1
dtrain=X_train.copy()
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['is_duplicate'].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
    metrics='auc', early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
alg.set_params(n_estimators=cvresult.shape[0])

[0]	train-auc:0.966703+0.000878149	test-auc:0.966005+0.00213427
[1]	train-auc:0.969954+0.000715794	test-auc:0.969145+0.00140189
[2]	train-auc:0.971849+0.000865594	test-auc:0.97084+0.00119053
[3]	train-auc:0.97266+0.000557033	test-auc:0.971826+0.000487138
[4]	train-auc:0.973458+0.000505536	test-auc:0.972703+0.000789194
[5]	train-auc:0.973858+0.00063108	test-auc:0.973033+0.00105882
[6]	train-auc:0.97424+0.000616404	test-auc:0.973414+0.00119456
[7]	train-auc:0.974784+0.000714409	test-auc:0.97384+0.00131763
[8]	train-auc:0.975004+0.000752476	test-auc:0.974053+0.00136541
[9]	train-auc:0.975407+0.000700337	test-auc:0.974378+0.00117257
[10]	train-auc:0.975645+0.000541227	test-auc:0.974561+0.00115697
[11]	train-auc:0.975685+0.000532617	test-auc:0.974602+0.00113407
[12]	train-auc:0.975821+0.000601537	test-auc:0.974698+0.00121001
[13]	train-auc:0.976203+0.000558074	test-auc:0.975054+0.000803324
[14]	train-auc:0.976454+0.000488776	test-auc:0.975247+0.000787558
[15]	train-auc:0.976676+0.000603194	

[125]	train-auc:0.988186+0.000125653	test-auc:0.983024+0.000557119
[126]	train-auc:0.988228+0.000119657	test-auc:0.983032+0.000552043
[127]	train-auc:0.988278+0.00010771	test-auc:0.983062+0.000549257
[128]	train-auc:0.988337+0.0001153	test-auc:0.983077+0.000551679
[129]	train-auc:0.988381+0.000106571	test-auc:0.983097+0.000548042
[130]	train-auc:0.98842+0.000109176	test-auc:0.983105+0.000545822
[131]	train-auc:0.988471+9.39396e-05	test-auc:0.983117+0.000550429
[132]	train-auc:0.988528+0.000115235	test-auc:0.983136+0.000550649
[133]	train-auc:0.988562+0.000107763	test-auc:0.983153+0.00055531
[134]	train-auc:0.988614+0.000108023	test-auc:0.98317+0.000534418
[135]	train-auc:0.988677+9.37132e-05	test-auc:0.983187+0.000515949
[136]	train-auc:0.988719+8.83344e-05	test-auc:0.983195+0.000506357
[137]	train-auc:0.988763+9.26987e-05	test-auc:0.983203+0.000500426
[138]	train-auc:0.988803+9.25669e-05	test-auc:0.983213+0.000505526
[139]	train-auc:0.988866+0.000108609	test-auc:0.983237+0.000499921
[

[249]	train-auc:0.993153+0.000154605	test-auc:0.983872+0.000547203
[250]	train-auc:0.993185+0.000162182	test-auc:0.983876+0.000546117
[251]	train-auc:0.993219+0.000157476	test-auc:0.983881+0.000545979
[252]	train-auc:0.993252+0.000166981	test-auc:0.983889+0.000544525
[253]	train-auc:0.993276+0.000166707	test-auc:0.983897+0.000546579
[254]	train-auc:0.993307+0.000172376	test-auc:0.983905+0.00055053
[255]	train-auc:0.993333+0.000170449	test-auc:0.983908+0.000567591
[256]	train-auc:0.993369+0.000168469	test-auc:0.983914+0.000569162
[257]	train-auc:0.993407+0.000162435	test-auc:0.983913+0.000573405
[258]	train-auc:0.993437+0.000159326	test-auc:0.983916+0.000569056
[259]	train-auc:0.993458+0.000166675	test-auc:0.983914+0.000575173
[260]	train-auc:0.993486+0.000163344	test-auc:0.983914+0.000575773
[261]	train-auc:0.993509+0.00015541	test-auc:0.983921+0.00056692
[262]	train-auc:0.993534+0.000153449	test-auc:0.983929+0.000576386
[263]	train-auc:0.99356+0.000161933	test-auc:0.983928+0.000572157

[372]	train-auc:0.996075+0.000154581	test-auc:0.98421+0.000542341
[373]	train-auc:0.996096+0.000152371	test-auc:0.984217+0.000535151
[374]	train-auc:0.996109+0.000151591	test-auc:0.984222+0.000540016
[375]	train-auc:0.996128+0.000152287	test-auc:0.984219+0.000539904
[376]	train-auc:0.996147+0.000151371	test-auc:0.984216+0.000537727
[377]	train-auc:0.996163+0.000150686	test-auc:0.984222+0.000532458
[378]	train-auc:0.996188+0.000150437	test-auc:0.984224+0.000527821
[379]	train-auc:0.996208+0.000148902	test-auc:0.984221+0.000523686
[380]	train-auc:0.996223+0.000144404	test-auc:0.984226+0.000527497
[381]	train-auc:0.996241+0.000141727	test-auc:0.984221+0.000521132
[382]	train-auc:0.996265+0.000141809	test-auc:0.984223+0.000517333
[383]	train-auc:0.996283+0.000136574	test-auc:0.984226+0.000518649
[384]	train-auc:0.996302+0.000133425	test-auc:0.984222+0.000517144
[385]	train-auc:0.996322+0.000128752	test-auc:0.984213+0.00051685
[386]	train-auc:0.996338+0.000125604	test-auc:0.98422+0.00051338

In [22]:
#Fit the algorithm on the data
alg.fit(dtrain[predictors], dtrain['is_duplicate'],eval_metric='auc'',verbose=True')

#Predict training set:
dtrain_predictions = alg.predict(dtrain[predictors])
dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['is_duplicate'].values, dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['is_duplicate'], dtrain_predprob))

feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields q1_kcores, q2_kcores, q1_q2_kcores_ratio, q1_q2_kcores_diff, q1_q2_kcores_diff_normed

In [None]:
print(feat_imp)

In [None]:
X_train= features_train.drop(['question1','question2','is_duplicate','cosine_distance','jaccard_distance','euclidean_distance','norm_wmd','fuzz_WRatio','len_word_q2','len_word_q1','minkowski_distance','braycurtis_distance'], axis=1)

In [None]:
X_test= features_test.drop(['qid1', 'id', 'qid2','question1','question2','cosine_distance','jaccard_distance','euclidean_distance','norm_wmd','fuzz_WRatio','len_word_q2','len_word_q1','minkowski_distance','braycurtis_distance'], axis=1)

In [None]:
xgb1.fit(X_train,Y_train)

In [None]:
y_pred1 = xgb1.predict_proba(X_test1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred1.shape[0]):
        f.write(str(i)+','+str(y_pred1[i][1])+'\n')

## Cross val

#### light gb

In [None]:
from sklearn.model_selection import StratifiedKFold
NUM_FOLDS = 5
RANDOM_SEED = 2017
np.random.seed(RANDOM_SEED)
import lightgbm as lgb

In [None]:
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

In [None]:
y_test_pred = np.zeros((len(X_test1), NUM_FOLDS))
cv_scores = []

In [None]:
X_train=X_train1.values
X_test=X_test1.values
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, Y_train)):
    print('Fitting fold {fold_num + 1} of {kfold.n_splits}')
    
    print(len(ix_train))
    print(X_train.shape)
    X_fold_train = X_train[ix_train,:]
    X_fold_val = X_train[ix_val,:]

    y_fold_train = Y_train[ix_train]
    y_fold_val = Y_train[ix_val]
    
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        'num_leaves': 158,
        'lambda_l2': 50,
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 10,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }
    
    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)    
    evals_result = {}
    
    model = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_val],
        evals_result=evals_result,
        num_boost_round=lgb_params['num_boost_round'],
        early_stopping_rounds=lgb_params['early_stopping_rounds'],
        verbose_eval=False,
    )
    
    fold_train_scores = evals_result['training'][lgb_params['metric']]
    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]
    
    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
        fold_num + 1,
        len(fold_train_scores),
        fold_train_scores[-1],
        fold_val_scores[-1],
    ))
    print()
    
    cv_scores.append(fold_val_scores[-1])
    y_test_pred[:, fold_num] = model.predict(X_test).reshape(-1)

In [None]:
pd.DataFrame({
    'column': list(X_train.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance')

In [None]:
print('Final CV score:', final_cv_score)

In [None]:
y_test = np.mean(y_test_pred, axis=1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_test.shape[0]):
        f.write(str(i)+','+str(y_test[i])+'\n')

#### Xgboost

In [None]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
import xgboost as xgb

X_train, X_valid, y_train, y_valid = train_test_split(X_train1, Y_train, test_size=0.1, random_state=4242)

# UPDownSampling
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
y_train = np.array(
    [0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] + [0] * neg_train.shape[0])
print(np.mean(y_train))
del pos_train, neg_train

pos_valid = X_valid[y_valid == 1]
neg_valid = X_valid[y_valid == 0]
X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
y_valid = np.array(
    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
print(np.mean(y_valid))
del pos_valid, neg_valid

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.6
params['base_score'] = 0.2
# params['scale_pos_weight'] = 0.2

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
print(log_loss(y_valid, bst.predict(d_valid)))
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)