# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt



### Load data

In [3]:
features_train = pd.read_csv('./data/train_features_glove.csv', sep=',', encoding='latin-1')
features_test = pd.read_csv('./data/test_features_glove.csv', sep=',', encoding='latin-1')
features_train= features_train.drop(['question1', 'question2'], axis=1)
features_test = features_test.drop(['id','qid1','qid2','question1', 'question2'], axis=1)
data_train = pd.read_csv('./data/train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
Y_train=data_train["is_duplicate"].values

# Pagerank features
pagerank_feats_train = pd.read_csv("data/pagerank_train.csv", sep=',')
pagerank_feats_test = pd.read_csv("data/pagerank_test.csv", sep=',')

# Question frequency
train_comb = pd.read_csv('./data/train_magic.csv', sep=',')
test_comb = pd.read_csv('./data/test_magic.csv', sep=',')

# Intersection of questions
train_feat= pd.read_csv('./data/train_magic_v2.csv', sep=',')
test_feat = pd.read_csv('./data/test_magic_v2.csv', sep=',')

# K cores
#df_train = pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","label"])
#df_test = pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

In [4]:
# Add Pagerank features
features_test["q1_pr"]=pagerank_feats_test["q1_pr"]
features_test["q2_pr"]=pagerank_feats_test["q2_pr"]
features_train["q1_pr"]=pagerank_feats_train["q1_pr"]
features_train["q2_pr"]=pagerank_feats_train["q2_pr"]

# Add question frequency features
features_test["q1_hash"]=test_comb["q1_hash"]
features_test["q2_hash"]=test_comb["q2_hash"]
features_test["q1_freq"]=test_comb["q1_freq"]
features_test["q2_freq"]=test_comb["q2_freq"]
features_train["q1_hash"]=train_comb["q1_hash"]
features_train["q2_hash"]=train_comb["q1_hash"]
features_train["q1_freq"]=train_comb["q1_freq"]
features_train["q2_freq"]=train_comb["q2_freq"]

# Add intersection of questions features
features_train['q1_q2_intersect']=train_feat['q1_q2_intersect']
features_test['q1_q2_intersect']=test_feat['q1_q2_intersect']

# Add Kcores features
#features_train[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]=df_train
#features_test[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]=df_test

### Feature selection

In [5]:
#Choose all predictors except target & IDcols
predictors = [x for x in X_train.columns if x not in ['is_duplicate']]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

cv_folds=5
early_stopping_rounds=50
# modelfit(xgb1, X_train, predictors)
alg=xgb1
dtrain=X_train.copy()
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['is_duplicate'].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
    metrics='auc', early_stopping_rounds=early_stopping_rounds)
alg.set_params(n_estimators=cvresult.shape[0])

#Fit the algorithm on the data
alg.fit(dtrain[predictors], dtrain['is_duplicate'],eval_metric='auc')

#Predict training set:
dtrain_predictions = alg.predict(dtrain[predictors])
dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['is_duplicate'].values, dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['is_duplicate'], dtrain_predprob))

feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()

NameError: name 'X_train' is not defined

In [None]:
print(feat_imp)

In [5]:
X_train= features_train.drop(['question1','question2','is_duplicate','cosine_distance','jaccard_distance','euclidean_distance','norm_wmd','fuzz_WRatio','len_word_q2','len_word_q1','minkowski_distance','braycurtis_distance'], axis=1)

ValueError: labels ['question1' 'question2'] not contained in axis

In [11]:
X_test= features_test.drop(['qid1', 'id', 'qid2','question1','question2','cosine_distance','jaccard_distance','euclidean_distance','norm_wmd','fuzz_WRatio','len_word_q2','len_word_q1','minkowski_distance','braycurtis_distance'], axis=1)

In [12]:
xgb1.fit(X_train,Y_train)

NameError: name 'xgb1' is not defined

In [None]:
y_pred1 = xgb1.predict_proba(X_test1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred1.shape[0]):
        f.write(str(i)+','+str(y_pred1[i][1])+'\n')

## Cross val

#### light gb

In [None]:
from sklearn.model_selection import StratifiedKFold
NUM_FOLDS = 5
RANDOM_SEED = 2017
np.random.seed(RANDOM_SEED)
import lightgbm as lgb

In [None]:
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

In [None]:
y_test_pred = np.zeros((len(X_test1), NUM_FOLDS))
cv_scores = []

In [None]:
X_train=X_train1.values
X_test=X_test1.values
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, Y_train)):
    print('Fitting fold {fold_num + 1} of {kfold.n_splits}')
    
    print(len(ix_train))
    print(X_train.shape)
    X_fold_train = X_train[ix_train,:]
    X_fold_val = X_train[ix_val,:]

    y_fold_train = Y_train[ix_train]
    y_fold_val = Y_train[ix_val]
    
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        'num_leaves': 158,
        'lambda_l2': 50,
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 10,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }
    
    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)    
    evals_result = {}
    
    model = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_val],
        evals_result=evals_result,
        num_boost_round=lgb_params['num_boost_round'],
        early_stopping_rounds=lgb_params['early_stopping_rounds'],
        verbose_eval=False,
    )
    
    fold_train_scores = evals_result['training'][lgb_params['metric']]
    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]
    
    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
        fold_num + 1,
        len(fold_train_scores),
        fold_train_scores[-1],
        fold_val_scores[-1],
    ))
    print()
    
    cv_scores.append(fold_val_scores[-1])
    y_test_pred[:, fold_num] = model.predict(X_test).reshape(-1)

In [None]:
pd.DataFrame({
    'column': list(X_train.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance')

In [None]:
print('Final CV score:', final_cv_score)

In [None]:
y_test = np.mean(y_test_pred, axis=1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_test.shape[0]):
        f.write(str(i)+','+str(y_test[i])+'\n')

#### Xgboost

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=18)
X_test=X_test.values
d_test = xgb.DMatrix(X_test)

X=X_train
y=Y_train

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.75
params['base_score'] = 0.2

params['colsample_bytree'] = 1
params['colsample_bylevel'] = 1
params['n_jobs'] = -1

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[train_index], X.iloc[test_index]
    y_tr, y_val = y[train_index], y[test_index]
    
    d_train = xgb.DMatrix(X_tr, label=y_tr)
    d_valid = xgb.DMatrix(X_val, label=y_val)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=50, verbose_eval=200)
    val_loss = log_loss(y_val, bst.predict(d_valid))
    print(val_loss)
    
    p_test = bst.predict(d_test)