In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-10-13
## Email: m.nanko@exeter.ac.uk

# climatenewsbabyCARDS classifier training and evaluation<br>
#### This notebook loads the climate-related **training, validation and testing** data sourced from various **blogs and newspapers**, trains a logistic classifier to label each text as **convinced** (0) or **contrarian** (1), and evaluates it on the validation and testing data.

# Packages

In [2]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn import metrics

from logistic_climatenews import fit_logistic_classifier
from logistic_climatenews import show_most_informative_features
from preprocess_climatenews import denoise_text
from preprocess_climatenews import simple_tokenise

# Load and preprocess the training, validation and testing data

In [3]:
%%time
# Read the three splits in the order train -> valid -> test.
# lineterminator='\n' makes '\n' the only row separator
# (presumably so stray '\r' characters inside the texts do not split rows -- TODO confirm).
train, valid, test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in (
        '../data/climatenewsbabyCARDStrain.csv',
        '../data/climatenewsbabyCARDSvalid.csv',
        '../data/climatenewsbabyCARDStest.csv',
    )
)
# Add a 'tokens' column to every split: cast the raw text to str,
# denoise it, then tokenise the denoised text.
for split in (train, valid, test):
    denoised = split['text'].astype(str).apply(denoise_text)
    split['tokens'] = denoised.apply(simple_tokenise)

CPU times: user 1min 53s, sys: 3.8 s, total: 1min 56s
Wall time: 1min 57s


# Train the logistic classifier

In [4]:
%%time
# Turn the training frame into a plain list of [tokens, label] pairs,
# which is the input handed to fit_logistic_classifier.
token_label_pairs = train.loc[:, ['tokens', 'label']].to_numpy()
data = token_label_pairs.tolist()
# Fit the classifier. C is forwarded unchanged to fit_logistic_classifier
# (presumably the inverse regularisation strength -- confirm in logistic_climatenews).
# The returned object is later indexed by 'label_encoder', 'vectorizer' and 'clf'.
model = fit_logistic_classifier(data, C=1000)

CPU times: user 16min 47s, sys: 1min 39s, total: 18min 26s
Wall time: 15min 1s


# Encode and vectorize the validation and testing data

In [5]:
%%time
# Encode the labels of both evaluation splits with the encoder stored in the model
y_valid, y_test = (
    model['label_encoder'].transform(split['label'])
    for split in (valid, test)
)
# Vectorize the token column of both splits with the model's vectorizer
X_valid, X_test = (
    model['vectorizer'].transform(split['tokens'])
    for split in (valid, test)
)

CPU times: user 2min 5s, sys: 1.07 s, total: 2min 6s
Wall time: 2min 6s


# Inspect the classifier performance

In [6]:
# Print one classification report per evaluation split:
# a heading first, then the report for the model's predictions on that split.
for heading, y_true, X in (
    ('Validation data:\n', y_valid, X_valid),
    ('\n\nTesting data (unseen bloggers and newspaper articles):\n', y_test, X_test),
):
    print(heading)
    y_pred = model['clf'].predict(X)
    print(metrics.classification_report(y_true, y_pred))

Validation data:

              precision    recall  f1-score   support

           0       0.92      0.91      0.91     14668
           1       0.95      0.95      0.95     25841

    accuracy                           0.94     40509
   macro avg       0.93      0.93      0.93     40509
weighted avg       0.94      0.94      0.94     40509



Testing data (unseen bloggers and newspaper articles):

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      7337
           1       0.94      0.95      0.95     11443

    accuracy                           0.94     18780
   macro avg       0.93      0.93      0.93     18780
weighted avg       0.94      0.94      0.94     18780



# Encode and vectorize the validation and testing data limited to newspaper articles

In [7]:
%%time
# Restrict both evaluation splits to rows whose `type` column equals 'newspaper'
valid_news = valid[valid['type'] == 'newspaper']
test_news = test[test['type'] == 'newspaper']
# Encode the labels of the newspaper-only subsets
y_valid_news = model['label_encoder'].transform(valid_news['label'])
y_test_news = model['label_encoder'].transform(test_news['label'])
# Vectorize the tokens of the newspaper-only subsets
X_valid_news = model['vectorizer'].transform(valid_news['tokens'])
X_test_news = model['vectorizer'].transform(test_news['tokens'])

CPU times: user 17.8 s, sys: 123 ms, total: 17.9 s
Wall time: 17.9 s


# Inspect the classifier performance limited to newspaper articles

In [8]:
# Print one classification report per newspaper-only split:
# a heading first, then the report for the model's predictions on that subset.
for heading, y_true, X in (
    ('Validation data (limited to newspaper articles):\n', y_valid_news, X_valid_news),
    ('\n\nTesting data (limited to newspaper articles):\n', y_test_news, X_test_news),
):
    print(heading)
    y_pred = model['clf'].predict(X)
    print(metrics.classification_report(y_true, y_pred))

Validation data (limited to newspaper articles):

              precision    recall  f1-score   support

           0       0.96      0.84      0.90      1153
           1       0.78      0.94      0.86       691

    accuracy                           0.88      1844
   macro avg       0.87      0.89      0.88      1844
weighted avg       0.89      0.88      0.88      1844



Testing data (limited to newspaper articles):

              precision    recall  f1-score   support

           0       0.95      0.82      0.88       805
           1       0.76      0.93      0.84       496

    accuracy                           0.86      1301
   macro avg       0.86      0.88      0.86      1301
weighted avg       0.88      0.86      0.87      1301



# Save the classifier

In [10]:
# Save the classifier
# `model` is pickled as a whole, so the 'label_encoder', 'vectorizer' and 'clf'
# entries used in the evaluation cells are stored together in one file.
pkl_filename = "../classifiers/climatenewsbabyCARDS.pkl"
# Create the output directory first: open(..., 'wb') raises FileNotFoundError
# when the directory is missing, which would throw away the minutes-long
# training run at the very last step of the notebook.
Path(pkl_filename).parent.mkdir(parents=True, exist_ok=True)
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
# Load classifier from file
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
# with open(pkl_filename, 'rb') as file:
#     model = pickle.load(file)