In [1]:
from classifier import train_classifier
from features import extract_features, flatten_features
from utils.dataset import read_dataset
from utils.preprocessing import preprocess_dataframe, extract_labels, decipher_labels, oversample_minority_classes
from utils.scoring import print_cv_score, evaluate_submission
from utils.splits import generate_hold_out_split

In [2]:
raw_data = read_dataset('data')
data = preprocess_dataframe(raw_data, 'raw_data')
labels = extract_labels(data)

In [3]:
features = extract_features(data, raw_data)

In [4]:
features = features.drop(['Word Overlap'], axis=1)

In [5]:
flattened_features = flatten_features(features)

In [6]:
training_data, testing_data, unused_data = generate_hold_out_split(raw_data)
training_features, testing_features = flattened_features.iloc[training_data.index], flattened_features.iloc[testing_data.index]
training_labels, testing_labels = labels.iloc[training_data.index], labels.iloc[testing_data.index]

In [7]:
oversampled_training_features, oversampled_training_labels = oversample_minority_classes(training_features, training_labels)

Oversampling agree group...
Oversampling disagree group...
Oversampling discuss group...


In [8]:
classifier = train_classifier(oversampled_training_features, oversampled_training_labels)

In [9]:
predictions = testing_data.copy()
predictions['Stance'] = decipher_labels(classifier.predict(testing_features), index=testing_features.index)
evaluate_submission(testing_data, predictions)

CONFUSION MATRIX:
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    134    |     5     |    167    |    12     |
-------------------------------------------------------------
| disagree  |    46     |     3     |    37     |     4     |
-------------------------------------------------------------
|  discuss  |    162    |    11     |    672    |    46     |
-------------------------------------------------------------
| unrelated |     1     |     0     |    14     |   3506    |
-------------------------------------------------------------
ACCURACY: 0.895

MAX  - the best possible score (100% accuracy)
NULL - score as if all predicted stances were unrelated
TEST - score based on the provided predictions

||    MAX    ||    NULL   ||    TEST   ||
||  2179.25  ||  880.25   ||  1792.5   ||

82.25306871630148% score achieved


In [35]:
# print_cv_score(classifier, training_features, training_labels, cv=5)
import numpy as np
feature_idx = np.argsort(classifier.feature_importances_[1:])[::-1]
# flattened_features.columns.to_series()[feature_idx]
# flattened_features.columns[feature_idx]
flattened_features.columns[feature_idx][0:20]

Index(['articleBody LSI 299', 'articleBody LDA 99', 'tf-idf Cosine Similarity',
       'Word Mover's Distance', 'doc2vec Cosine Similarity',
       'articleBody BoW 1999', 'articleBody Mean Word Vector 2',
       'articleBody BoW 1266', 'Headline RP 4', 'articleBody LSI 0',
       'Headline LSI 108', 'articleBody LSI 207', 'Headline RP 133',
       'articleBody LSI 286', 'articleBody RP 110', 'Headline doc2vec 42',
       'articleBody tf-idf 1999', 'articleBody LSI 283', 'articleBody RP 299',
       'articleBody BoW 486'],
      dtype='object')