In [1]:
## Author: Mirjam Nanko
## Date Created: 2021-11-29
## Email: m.nanko@exeter.ac.uk

In [1]:
from preprocess import denoise_text
from logistic import fit_logistic_classifier
import pandas as pd
from sklearn import metrics
import pickle
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel
# PyTorch: enable GPU access
import torch
# If you want to select a specific GPU, set it here:
gpu = 1
# If there's a GPU available...
if torch.cuda.is_available():
    # Select the requested GPU only after confirming CUDA is usable:
    # calling set_device() on a machine without CUDA raises an error,
    # which would make the CPU fallback below unreachable.
    torch.cuda.set_device(gpu)
    # Tell PyTorch to use the GPU ("cuda" resolves to the current device).
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use GPU {}:'.format(torch.cuda.current_device()), torch.cuda.get_device_name(torch.cuda.current_device()))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 4 GPU(s) available.
We will use GPU 1: NVIDIA GeForce RTX 2080 Ti


In [2]:
%%time
# Read the three data splits (training, validation, testing) from disk
train, valid, test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ('../data/train.csv', '../data/valid.csv', '../data/test.csv')
)
# Preprocess the text of every split; the column is cast to str first so
# that denoise_text always receives a string
for split in (train, valid, test):
    split['text'] = split['text'].astype(str).apply(denoise_text)

CPU times: user 5min 6s, sys: 8.41 s, total: 5min 14s
Wall time: 5min 14s


# Train logistic classifier

In [3]:
%%time
# Read the training split and preprocess its text in one chained expression.
# NOTE(review): the first data-loading cell of this notebook already loads and
# preprocesses train.csv in the same way, so this cell repeats that work.
train = (
    pd.read_csv('../data/train.csv', lineterminator='\n')
    .assign(text=lambda frame: frame['text'].astype(str).apply(denoise_text))
)

CPU times: user 3min 29s, sys: 6.02 s, total: 3min 35s
Wall time: 3min 33s


In [4]:
%%time
# Convert the training frame into a list of [text, label] pairs ...
data = train.loc[:, ['text', 'label']].to_numpy().tolist()
# ... and fit the classifier on them. The returned object is indexed later
# with the keys 'label_encoder', 'vectorizer' and 'clf'.
model = fit_logistic_classifier(data)

CPU times: user 29min 24s, sys: 32.3 s, total: 29min 57s
Wall time: 17min 14s


In [5]:
%%time
# Read the validation and testing splits.
# NOTE(review): the first data-loading cell of this notebook already loads and
# preprocesses both files in the same way, so this repeats that work.
valid, test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ('../data/valid.csv', '../data/test.csv')
)
# Preprocess the text of both splits
for split in (valid, test):
    split['text'] = split['text'].astype(str).apply(denoise_text)
# Encode the labels with the encoder stored in the fitted model
y_valid, y_test = (
    model['label_encoder'].transform(split.label) for split in (valid, test)
)
# Vectorize the text with the vectorizer stored in the fitted model
X_valid, X_test = (
    model['vectorizer'].transform(split.text) for split in (valid, test)
)

CPU times: user 3min 33s, sys: 5.18 s, total: 3min 38s
Wall time: 3min 37s


In [6]:
# Print a classification report for the logistic classifier on each
# held-out split: (heading, true labels, feature matrix)
evaluation_sets = (
    ('Validation data:\n', y_valid, X_valid),
    ('\n\nTesting data (unseen handles/news articles):\n', y_test, X_test),
)
for heading, y_true, features in evaluation_sets:
    print(heading)
    print(metrics.classification_report(y_true, model['clf'].predict(features)))

Validation data:

              precision    recall  f1-score   support

           0       0.87      0.90      0.88    509797
           1       0.91      0.88      0.90    597768

    accuracy                           0.89   1107565
   macro avg       0.89      0.89      0.89   1107565
weighted avg       0.89      0.89      0.89   1107565



Testing data (unseen handles/news articles):

              precision    recall  f1-score   support

           0       0.71      0.90      0.80    180897
           1       0.90      0.70      0.79    221956

    accuracy                           0.79    402853
   macro avg       0.80      0.80      0.79    402853
weighted avg       0.81      0.79      0.79    402853



In [6]:
# # Save and load the classifier
# pkl_filename = "../babyCARDS.pkl"
# with open(pkl_filename, 'wb') as file:
#     pickle.dump(model, file)
# # Load from file
# with open(pkl_filename, 'rb') as file:
#     pickle_model = pickle.load(file)

# Train RoBERTa classifier

In [None]:
%%time

# Create a ClassificationModel starting from the pretrained 'roberta-base' weights
modelR = ClassificationModel('roberta', 'roberta-base',
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': True,
                                  'output_dir': '../classifier/roberta/',
                                  'best_model_dir': '../classifier/roberta/best_model/',
                                  # Labels and weights
                                  'num_labels': 2,
                                  # Hyperparameters
                                  'train_batch_size': 8,
                                  'num_train_epochs': 2,
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.8,
                                  'do_lower_case': False,
                                  # Evaluation
                                  'evaluate_during_training': True,
                                  'evaluate_during_training_verbose': True,
                                  'evaluate_during_training_steps': -1,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True},
                            # ClassificationModel defaults to use_cuda=True and raises
                            # when CUDA is missing; follow the availability check
                            # used in the GPU setup cell instead.
                            use_cuda=torch.cuda.is_available())

# simpletransformers looks the input columns up by the names 'text' and
# 'labels'; with any other header it falls back to "column 0 is the text,
# column 1 is the label". The frames here name the label column 'label', so
# hand over explicitly selected and renamed columns instead of relying on
# the column order of the CSV files.
roberta_train = train[['text', 'label']].rename(columns={'label': 'labels'})
roberta_valid = valid[['text', 'label']].rename(columns={'label': 'labels'})

# Train and evaluate the model
modelR.train_model(roberta_train, eval_df=roberta_valid)
# Optional extra evaluation metrics can be passed as keyword arguments
# (the metric functions below are not defined in this part of the notebook):
#                   f1_macro = f1_multiclass_macro,
#                   f1_micro = f1_multiclass_micro,
#                   f1_weighted = f1_multiclass_weighted,
#                   acc = accuracy_score,
#                   f1_class = f1_class)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

HBox(children=(FloatProgress(value=0.0, max=2659185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 2', max=418835.0, style=ProgressStyle(…



In [80]:
%%time
# Inspect classifier performance (checkpoint saved after epoch 1).
# y_valid / y_test are the encoded labels produced in the logistic
# classifier section, so that section has to be run first.

# Load the classifier; only request CUDA when it is actually available
modelR = ClassificationModel('roberta', '../classifier/roberta/checkpoint-418835-epoch-1',
                             use_cuda=torch.cuda.is_available())

# predict() is documented to take a plain Python list of texts; a pandas
# Series only works as long as it carries the default integer index.
# The first element of the returned tuple holds the predicted labels.
valid_predictions = modelR.predict(valid.text.tolist())
print('Validation data:\n')
print(metrics.classification_report(y_valid,valid_predictions[0]))

test_predictions = modelR.predict(test.text.tolist())
print('\n\nTesting data (unseen handles/news articles):\n')
print(metrics.classification_report(y_test,test_predictions[0]))

KeyboardInterrupt: 

In [79]:
%%time
# Inspect classifier performance (checkpoint saved after epoch 2).
# y_valid / y_test are the encoded labels produced in the logistic
# classifier section, so that section has to be run first.

# Load the classifier; only request CUDA when it is actually available
modelR2 = ClassificationModel('roberta', '../classifier/roberta/checkpoint-837670-epoch-2',
                              use_cuda=torch.cuda.is_available())

# predict() is documented to take a plain Python list of texts; a pandas
# Series only works as long as it carries the default integer index.
# The first element of the returned tuple holds the predicted labels.
valid_predictions2 = modelR2.predict(valid.text.tolist())
print('Validation data:\n')
print(metrics.classification_report(y_valid,valid_predictions2[0]))

# Store the result under the same name that the report below reads.
# Previously it was assigned to `test_predictions`, so `test_predictions2`
# was either undefined or a stale value from an earlier run.
test_predictions2 = modelR2.predict(test.text.tolist())
print('\n\nTesting data (unseen handles/news articles):\n')
print(metrics.classification_report(y_test,test_predictions2[0]))

HBox(children=(FloatProgress(value=0.0, max=886397.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=139830.0), HTML(value='')))




KeyboardInterrupt: 