In [1]:
import pandas as pd
import numpy as np
import os
import math
from model import linear_regression
from model import logistic_regression
from sklearn import preprocessing

In [2]:
# Read the train / validation / test feature tables from their CSV files
# (relative paths, so they are looked up in the working directory).
train, val, test = [
    pd.read_csv(os.path.join(csv_name))
    for csv_name in ('train_features.csv', 'val_features.csv', 'test_features.csv')
]

In [3]:
def preprocess(data):
    """Split a raw table into min-max scaled features and one-hot targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns dropped below (text, vector, id and
        distance columns plus ``'Stance'``) and the feature columns to keep.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``data_x`` - the remaining columns rescaled by a ``MinMaxScaler``,
        keeping the original index and column labels.
        ``data_y`` - ``pd.get_dummies`` of the ``'Stance'`` column (one
        indicator column per distinct value).

    Notes
    -----
    The scaler is fitted on the table passed in, so every call (and hence
    every split) is scaled by its own per-column min/max.
    """
    non_feature_columns = ['Headline', 'articleBody', 'headline_vec', 'body_vec', 'Body ID',
                           'tf_idf_eucliden_dis', 'tf_idf_Manhattan_dis', 'Stance']
    data_x = data.drop(non_feature_columns, axis=1)
    data_y = pd.get_dummies(data['Stance'])

    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(data_x)

    # Build a fresh float frame instead of assigning through `.loc[:, :]`:
    # writing floats in place into integer-typed columns depends on implicit
    # upcasting, which recent pandas versions deprecate.
    data_x = pd.DataFrame(scaled_values, index=data_x.index, columns=data_x.columns)
    return data_x, data_y

In [4]:
# Run every split through preprocess() on a copy of the loaded frame,
# yielding a (features, one-hot targets) pair per split.
(train_x, train_y), (val_x, val_y), (test_x, test_y) = [
    preprocess(split.copy()) for split in (train, val, test)
]

In [5]:
# Preview the first five rows of the scaled training features.
train_x.head(n=5)

Unnamed: 0,tf_idf_cos_sim,common_words_count,KL_divergence
0,0.251449,0.148148,0.319306
1,0.119022,0.148148,0.313937
2,0.223101,0.222222,0.184358
3,0.266811,0.37037,0.21009
4,0.307222,0.222222,0.251412


In [6]:
def accuracy(y, y_pred):
    """Return the fraction of rows in which ``y_pred`` matches ``y`` exactly.

    Both arguments are treated as 2-D tables (rows = samples, columns =
    class indicators) and compared position by position; a row counts as
    correct only when every one of its columns agrees.

    Parameters
    ----------
    y, y_pred : array-like of identical 2-D shape
        Anything ``np.asarray`` accepts (nested lists, ndarrays, DataFrames).

    Returns
    -------
    float
        Number of fully matching rows divided by ``len(y)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are 2-D but contain no rows.
    """
    y_arr = np.asarray(y)
    pred_arr = np.asarray(y_pred)
    if y_arr.shape != pred_arr.shape:
        raise ValueError(
            'y and y_pred must have the same shape, got '
            + str(y_arr.shape) + ' and ' + str(pred_arr.shape)
        )

    # Requiring *all* columns to agree works for any number of classes,
    # instead of comparing the per-row match count with a fixed 4.
    exact_rows = np.count_nonzero((pred_arr == y_arr).all(axis=1))
    return exact_rows / len(y)

In [7]:
def calculate_accuracies(model, epochs, eta):
    """Fit ``model`` once per target column and score the combined predictions.

    For each column of the global ``train_y`` the model is refitted on
    ``train_x`` against that single column, and its predictions for the
    train, validation and test inputs are written into the same-named column
    of a per-split frame. Each frame then goes through
    ``multi_class_predict`` and is compared with the split's targets via
    ``accuracy``.

    Parameters
    ----------
    model : object
        Must provide ``fit(x, y, epochs, eta)`` and ``predict(x)``.
    epochs, eta
        Forwarded unchanged to ``model.fit``.

    Returns
    -------
    tuple
        ``(train_accuracy, val_accuracy, test_accuracy)``.
    """
    label_names = train_y.columns
    inputs = (train_x, val_x, test_x)
    targets = (train_y, val_y, test_y)

    # Start from copies of the targets with every training label column
    # reset to 0; the columns are overwritten with predictions below.
    score_frames = [target.copy() for target in targets]
    for frame in score_frames:
        for name in label_names:
            frame[name] = 0

    # One fit per label, then predict train, val and test in that order.
    for name in label_names:
        model.fit(train_x, train_y[name].to_frame(), epochs, eta)
        for features, frame in zip(inputs, score_frames):
            frame[name] = model.predict(features)

    decided = [multi_class_predict(frame) for frame in score_frames]
    return tuple(accuracy(pred, target) for pred, target in zip(decided, targets))

# Baseline

In [8]:
# Constant baseline: for every test row, predict the class held in the last
# (index 3) of four one-hot columns, then score it against the test targets.
base_y = np.tile([0.0, 0.0, 0.0, 1.0], (len(test_x), 1))
accuracy(test_y, base_y)

0.7220320308503522

# Linear Regression

In [9]:
def multi_class_predict(y):
    """Convert per-class scores into 0/1 flags marking each row's maximum.

    Parameters
    ----------
    y : array-like, 2-D
        One row per sample, one column per class; converted with
        ``np.array`` so the caller's object is left untouched.

    Returns
    -------
    numpy.ndarray
        Same shape as ``y``; an entry is 1 where the score is at least the
        row maximum and 0 elsewhere. Tied maxima are all flagged.
    """
    scores = np.array(y)
    row_peak = np.amax(scores, axis=1)
    # Compare each row with its own maximum in one broadcast step; casting
    # the flags back to the score dtype keeps the output dtype the same as
    # storing them in the score array would.
    flags = (scores >= row_peak[:, np.newaxis]).astype(scores.dtype)
    return flags * 1

In [10]:
# Train the linear_regression model for 3000 epochs at learning rate 0.061
# and show the (train, val, test) accuracies.
model = linear_regression()
calculate_accuracies(model, epochs=3000, eta=0.061)

(0.8703695468492907, 0.885954381752701, 0.8552709243300672)

# Logistic Regression

In [11]:
# Train the logistic_regression model for 3000 epochs at learning rate 0.36
# and show the (train, val, test) accuracies.
model = logistic_regression()
calculate_accuracies(model, epochs=3000, eta=0.36)

(0.8755058478231867, 0.8873549419767908, 0.8582615196946445)

# Investigate learning rate effects

In [12]:
import matplotlib.pyplot as plt
def plot_errors(model, epochs):
    """Plot train/validation/test error rates against the learning rate.

    The model is re-evaluated with ``calculate_accuracies`` for each learning
    rate in 0.02, 0.07, ..., 0.97, and one curve per split is drawn.

    Parameters
    ----------
    model : object
        Passed through to ``calculate_accuracies``.
    epochs
        Number of epochs used for every learning rate.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    etas = [step / 100 for step in range(2, 100, 5)]
    curves = {'train_error': [], 'val_error': [], 'test_error': []}

    for eta in etas:
        split_accuracies = calculate_accuracies(model, epochs, eta)
        # calculate_accuracies returns accuracies, while every label on this
        # figure says "error": convert so the plot shows what it claims.
        for curve, split_accuracy in zip(curves.values(), split_accuracies):
            curve.append(1 - split_accuracy)

    for label, curve in curves.items():
        plt.plot(etas, curve, label=label)
    plt.legend()
    plt.xlabel('learning rate')
    plt.ylabel('Training, val and testing errors')
    plt.title('Investigating the impacts of changing learning rate on the errors')
    plt.show()

In [13]:
# model = linear_regression()
# plot_errors(model, 100)

In [14]:
# model = logistic_regression()
# plot_errors(model, 100)

# Feature Importance

In [15]:
def calc_best_feature(model, epochs, eta):
    """Find the feature whose removal lowers validation accuracy the most.

    A reference validation accuracy is obtained from ``calculate_accuracies``
    with all features. Then, for each column of the global ``train_x``, the
    model is refitted per label without that column and the validation
    accuracy is recomputed; the drop relative to the reference is printed as
    ``"<feature> : <drop>"``.

    Parameters
    ----------
    model : object
        Must provide ``fit(x, y, epochs, eta)`` and ``predict(x)``.
    epochs, eta
        Forwarded unchanged to ``model.fit``.

    Returns
    -------
    str
        Name of the feature with the largest strictly positive accuracy drop,
        or ``''`` when removing no feature lowers the validation accuracy.
    """
    labels = train_y.columns
    features = train_x.columns

    val_accuracy = calculate_accuracies(model, epochs, eta)[1]
    max_acc_reduction, best_feature = 0, ''

    for feature in features:
        # The reduced inputs depend only on the feature, not on the label,
        # so build them once per feature rather than once per label.
        new_train_x = train_x.drop(feature, axis=1)
        new_val_x = val_x.drop(feature, axis=1)

        # Copy of the targets whose label columns are overwritten below.
        val_preds = val_y.copy()
        for label in labels:
            model.fit(new_train_x, train_y[label].to_frame(), epochs, eta)
            val_preds[label] = model.predict(new_val_x)
        val_preds = multi_class_predict(val_preds)

        acc_reduction = (val_accuracy - accuracy(val_preds, val_y))
        print(feature + ' : ' + str(acc_reduction))
        if acc_reduction > max_acc_reduction:
            max_acc_reduction = acc_reduction
            best_feature = feature

    return best_feature

In [16]:
# Feature-removal study for the linear_regression model
# (3000 epochs, learning rate 0.061).
model = linear_regression()
calc_best_feature(model, epochs=3000, eta=0.061)

tf_idf_cos_sim : 0.019807923169267605
common_words_count : 0.0006002400960383181
KL_divergence : 0.004801920768307322


'tf_idf_cos_sim'

In [17]:
# Feature-removal study for the logistic_regression model
# (3000 epochs, learning rate 0.36).
model = logistic_regression()
calc_best_feature(model, epochs=3000, eta=0.36)

tf_idf_cos_sim : 0.02200880352140866
common_words_count : 0.001400560224089742
KL_divergence : 0.27871148459383754


'KL_divergence'