In [1]:
## Author: Mirjam Nanko
## Date Created: 2021-11-29
## Email: m.nanko@exeter.ac.uk

# babyCARDS classifier training and evaluation<br>
#### This notebook loads the **training, validation and testing** data sourced from various **blogs, Twitter, Facebook and newspapers** and trains a classifier that labels each text as **convinced** (0) or **contrarian** (1).

# Packages

In [2]:
from preprocess import denoise_text
from logistic import fit_logistic_classifier
import pandas as pd
from sklearn import metrics
import pickle
import warnings
# Silence UserWarnings attributed to the bs4 module so they do not flood the
# cell output (presumably raised while denoise_text strips markup -- confirm
# in preprocess.py).
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Load and preprocess the training, validation and testing data

In [3]:
%%time
# Read the three babyCARDS splits from disk (order: train, valid, test).
csv_paths = (
    '../data/babyCARDStrain.csv',
    '../data/babyCARDSvalid.csv',
    '../data/babyCARDStest.csv',
)
train, valid, test = [pd.read_csv(path, lineterminator='\n') for path in csv_paths]

# Clean the text column of every split. Values are cast to str first, so
# denoise_text always receives a string.
for split in (train, valid, test):
    split['text'] = split['text'].astype(str).apply(denoise_text)

CPU times: user 5min 4s, sys: 8.06 s, total: 5min 12s
Wall time: 5min 12s


# Train the logistic classifier

In [4]:
%%time
# Fit the classifier on the training split.
# fit_logistic_classifier is handed a plain list of [text, label] pairs,
# one pair per training row.
training_pairs = train[['text', 'label']]
data = training_pairs.values.tolist()
model = fit_logistic_classifier(data)

CPU times: user 1h 13min 9s, sys: 1min 42s, total: 1h 14min 52s
Wall time: 17min 50s


# Encode and vectorize the validation and testing data

In [5]:
%%time
# Apply the label encoder and the vectorizer held in `model` to the held-out
# splits (transform only -- nothing is re-fitted in this cell).

# Labels -> encoded targets
label_encoder = model['label_encoder']
y_valid = label_encoder.transform(valid.label)
y_test = label_encoder.transform(test.label)

# Text -> feature matrices
vectorizer = model['vectorizer']
X_valid = vectorizer.transform(valid.text)
X_test = vectorizer.transform(test.text)

CPU times: user 2min 8s, sys: 1.67 s, total: 2min 9s
Wall time: 2min 9s


# Inspect the classifier performance

In [9]:
# Print a classification report for each held-out split: first the heading,
# then the report comparing the encoded labels with the classifier's predictions.
report_inputs = (
    ('Validation data:\n', y_valid, X_valid),
    ('\n\nTesting data (unseen bloggers, twitter accounts, facebook accounts and newspaper articles):\n', y_test, X_test),
)
for heading, y_true, features in report_inputs:
    print(heading)
    y_pred = model['clf'].predict(features)
    print(metrics.classification_report(y_true, y_pred))

Validation data:

              precision    recall  f1-score   support

           0       0.87      0.90      0.88    509797
           1       0.91      0.88      0.90    597768

    accuracy                           0.89   1107565
   macro avg       0.89      0.89      0.89   1107565
weighted avg       0.89      0.89      0.89   1107565



Testing data (unseen bloggers, facebook users, twitter users and newspaper articles):

              precision    recall  f1-score   support

           0       0.71      0.90      0.80    180897
           1       0.90      0.70      0.79    221956

    accuracy                           0.79    402853
   macro avg       0.80      0.80      0.79    402853
weighted avg       0.81      0.79      0.79    402853



# Save the classifier

In [8]:
# Serialise `model` to disk with pickle so it can be reloaded later.
pkl_filename = "../classifiers/babyCARDS.pkl"
with open(pkl_filename, 'wb') as pkl_handle:
    pickle.dump(model, pkl_handle)

# To restore the classifier in a later session (only unpickle files you trust):
# with open(pkl_filename, 'rb') as pkl_handle:
#     model = pickle.load(pkl_handle)