In [1]:
import pandas as pd
import numpy as np
import os
import math
from sklearn import preprocessing
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv(os.path.join('train_features.csv'))
val = pd.read_csv(os.path.join('val_features.csv'))
test = pd.read_csv(os.path.join('test_features.csv'))

In [3]:
def preprocess(data):
    data_x = data.drop(['Headline', 'articleBody', 'headline_vec', 'body_vec', 'Body ID',
                           'tf_idf_eucliden_dis', 'tf_idf_Manhattan_dis', 'Stance'], axis=1)
    data_y = pd.get_dummies(data['Stance'])
#     scaler = preprocessing.MinMaxScaler()
#     scaled_df = scaler.fit_transform(data_x)
#     data_x.loc[:,:] = scaled_df
    return data_x, data_y

In [4]:
train_x, train_y = preprocess(train.copy())
val_x, val_y = preprocess(val.copy())
test_x, test_y = preprocess(test.copy())

In [5]:
def multi_class_predict(y):
#     max_cols = y.idxmax(axis=1, skipna=True).values
    y = np.array(y)
    max_col_values = np.amax(y, axis=1)
    for i in range(len(y)):
        y[i] = y[i] >= max_col_values[i]
#     labels = y.columns
#     for i, row in y.iterrows():
#         for label in labels:
#             row[label] = 0.0
#         max_col = max_cols[i]
#         row[max_col] = 1.0 
    return y * 1

In [6]:
# Caculate the root mean square error
def rmse(preds, data):
    labels = data.get_label()
    diff = preds - labels                       
    mean_diff_squared = (diff ** 2).mean()  
    error = np.sqrt(mean_diff_squared)
    return error 

In [7]:
def kfold_xgboost(train_data, train_label, test_data, **kwargs):

    train_data = np.array(train_data)
    train_label = np.array(train_label)
    test_data = np.array(test_data)
    
    test_pred_list = []
    K = 5
    kfold = KFold(n_splits = K, random_state = 7, shuffle = True)
    val_scores = []
    for train_index, val_index in kfold.split(train_data):
        train_x, val_x = train_data[train_index], train_data[val_index]
        train_y, val_y = train_label[train_index], train_label[val_index]

        xgb_params = {'eta': 0.2, 'max_depth': 5, 'subsample': 1.0, 'colsample_bytree': 1.0,
                      'objective': 'reg:logistic', 'seed': 99, 'silent': True}

        d_train = xgb.DMatrix(train_x, train_y)
        d_val = xgb.DMatrix(val_x, val_y)
        d_test = xgb.DMatrix(test_data)
    
        watchlist = [(d_train, 'train'), (d_val, 'val')]
        
        model = xgb.train(xgb_params, d_train, 500,  watchlist,
                              maximize=False, verbose_eval=50, early_stopping_rounds=50)
                        
        test_pred = model.predict(d_test)
        test_pred_list.append(list(test_pred))
    
        val_score = rmse(model.predict(d_val, ntree_limit=model.best_ntree_limit), d_val)
        print("The validation set score is :\t{}\n".format(val_score))
        val_scores.append(val_score)
        
    print(val_scores)
        
    test_avg_pred = []
    for i in range(len(test_pred_list[0])):
        sumres = 0
        for j in range(K):
            sumres += test_pred_list[j][i]
        test_avg_pred.append(sumres / K)
    return test_avg_pred

In [8]:
def two_stage_classifer(train_data, train_label, test_data, test_y, **kwargs):
    test_pred = test_y.copy()
    
    for col in test_pred.columns:
        test_pred[col] = 0

    test_pred['unrelated'] = kfold_xgboost(train_data, train_label['unrelated'], test_data)
    
    test_pred = pd.DataFrame(test_pred).rename(columns={0: "agree",
        1: "disagree", 2: "discuss", 3: "unrelated"})
    unrelated_pred = test_pred.query('unrelated >= 0.5')
    unrelated_pred['unrelated'] = 1
    related_pred = test_pred.query('unrelated < 0.5')
    related_pred['unrelated'] = 0
    related_rows_index = list(related_pred.index)
    test_related_data = test_data.loc[related_rows_index]
    
    labels = ['agree', 'disagree', 'discuss']
    for label in labels:
        print('processing: ' + label)
        related_pred[label] = kfold_xgboost(train_data, train_label[label], test_related_data)
        
    test_pred = pd.concat([related_pred, unrelated_pred])
    test_pred = test_pred.sort_index()
    test_pred = multi_class_predict(test_pred)
    return test_pred

In [9]:
val_pred = two_stage_classifer(train_x, train_y, val_x, val_y.copy())

[0]	train-rmse:0.408778	val-rmse:0.408836
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.12311	val-rmse:0.128844
Stopping. Best iteration:
[40]	train-rmse:0.123976	val-rmse:0.128527

The validation set score is :	0.12852731347084045

[0]	train-rmse:0.408823	val-rmse:0.408621
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.124808	val-rmse:0.126904
[100]	train-rmse:0.120638	val-rmse:0.12671
Stopping. Best iteration:
[87]	train-rmse:0.121137	val-rmse:0.126583

The validation set score is :	0.12658336758613586

[0]	train-rmse:0.408823	val-rmse:0.408687
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.123664	val-rmse:0.126974
Stopping. Best iteration:
[32]	train-r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[50]	train-rmse:0.224968	val-rmse:0.231681
[100]	train-rmse:0.216648	val-rmse:0.232611
Stopping. Best iteration:
[52]	train-rmse:0.224181	val-rmse:0.231637

The validation set score is :	0.2316371500492096

[0]	train-rmse:0.424543	val-rmse:0.423984
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.225423	val-rmse:0.229688
[100]	train-rmse:0.216401	val-rmse:0.230639
Stopping. Best iteration:
[66]	train-rmse:0.222253	val-rmse:0.229418

The validation set score is :	0.2294178009033203

[0]	train-rmse:0.424433	val-rmse:0.424834
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.224599	val-rmse:0.233022
[100]	train-rmse:0.215292	val-rmse:0.233157
Stopping. Best iteration:
[66]	train-rmse:0.221836	val-rmse:0.232753

The validation set score is :	0.2327525019645691

[0]	train-rms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[50]	train-rmse:0.119925	val-rmse:0.12982
Stopping. Best iteration:
[28]	train-rmse:0.122083	val-rmse:0.129547

The validation set score is :	0.12954701483249664

[0]	train-rmse:0.408399	val-rmse:0.40771
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.123324	val-rmse:0.117131
[100]	train-rmse:0.119211	val-rmse:0.117811
Stopping. Best iteration:
[52]	train-rmse:0.123261	val-rmse:0.11708

The validation set score is :	0.11707979440689087

[0]	train-rmse:0.408167	val-rmse:0.408317
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.121652	val-rmse:0.126731
[100]	train-rmse:0.116367	val-rmse:0.126724
[150]	train-rmse:0.111968	val-rmse:0.126433
[200]	train-rmse:0.107544	val-rmse:0.126405
Stopping. Best iteration:
[168]	train-rmse:0.110167	val-rmse:0.126314

The validation set 

In [10]:
accuracy_score(val_y, val_pred)

0.89175670268107243

In [11]:
test_pred = two_stage_classifer(train_x, train_y, test_x, test_y.copy())

[0]	train-rmse:0.408778	val-rmse:0.408836
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.12311	val-rmse:0.128844
Stopping. Best iteration:
[40]	train-rmse:0.123976	val-rmse:0.128527

The validation set score is :	0.12852731347084045

[0]	train-rmse:0.408823	val-rmse:0.408621
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.124808	val-rmse:0.126904
[100]	train-rmse:0.120638	val-rmse:0.12671
Stopping. Best iteration:
[87]	train-rmse:0.121137	val-rmse:0.126583

The validation set score is :	0.12658336758613586

[0]	train-rmse:0.408823	val-rmse:0.408687
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.123664	val-rmse:0.126974
Stopping. Best iteration:
[32]	train-r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[50]	train-rmse:0.224968	val-rmse:0.231681
[100]	train-rmse:0.216648	val-rmse:0.232611
Stopping. Best iteration:
[52]	train-rmse:0.224181	val-rmse:0.231637

The validation set score is :	0.2316371500492096

[0]	train-rmse:0.424543	val-rmse:0.423984
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.225423	val-rmse:0.229688
[100]	train-rmse:0.216401	val-rmse:0.230639
Stopping. Best iteration:
[66]	train-rmse:0.222253	val-rmse:0.229418

The validation set score is :	0.2294178009033203

[0]	train-rmse:0.424433	val-rmse:0.424834
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.224599	val-rmse:0.233022
[100]	train-rmse:0.215292	val-rmse:0.233157
Stopping. Best iteration:
[66]	train-rmse:0.221836	val-rmse:0.232753

The validation set score is :	0.2327525019645691

[0]	train-rms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[50]	train-rmse:0.119925	val-rmse:0.12982
Stopping. Best iteration:
[28]	train-rmse:0.122083	val-rmse:0.129547

The validation set score is :	0.12954701483249664

[0]	train-rmse:0.408399	val-rmse:0.40771
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.123324	val-rmse:0.117131
[100]	train-rmse:0.119211	val-rmse:0.117811
Stopping. Best iteration:
[52]	train-rmse:0.123261	val-rmse:0.11708

The validation set score is :	0.11707979440689087

[0]	train-rmse:0.408167	val-rmse:0.408317
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.121652	val-rmse:0.126731
[100]	train-rmse:0.116367	val-rmse:0.126724
[150]	train-rmse:0.111968	val-rmse:0.126433
[200]	train-rmse:0.107544	val-rmse:0.126405
Stopping. Best iteration:
[168]	train-rmse:0.110167	val-rmse:0.126314

The validation set 

In [12]:
accuracy_score(test_y, test_pred)

0.86947625231180892