In [1]:
# allow widgits to display, like 
!jupyter nbextension enable --py widgetsnbextension
!pip install transformers
# for some reason torchtext 0.7.0 is giving me issues, I know 0.4.0 works fine
!pip install torchtext==0.4.0
# install to read pandas from s3
!pip install s3fs

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data

# assorted QOL things
import random
from tqdm import tqdm
import time
import pandas as pd

# my classes
from langhelper import BERTHelper, DataFrameDataset
from classifier import *
import modelfitting

# from sagemaker get execution role so we can read/write to/from s3
from sagemaker import get_execution_role
import boto3


# SageMaker execution role and an S3 client for reading/writing project data
role = get_execution_role()
client = boto3.client('s3')

# run on the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# helper wrapping the 'bert-base-uncased' tokenizer (see langhelper.BERTHelper)
lang_helper = BERTHelper('bert-base-uncased')

In [2]:
# Field definitions for the text and label columns; they are handed to the
# torchtext dataset when it is instantiated further down.
bert_tok = lang_helper.bert_tokenizer

# torchtext's own vocabulary is disabled (use_vocab=False): tokens are converted
# to ids by the BERT tokenizer in the preprocessing step, and the special tokens
# are supplied as the tokenizer's ids.
TEXT = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=lang_helper.tokenize_and_cut,
    preprocessing=bert_tok.convert_tokens_to_ids,
    init_token=bert_tok.cls_token_id,
    eos_token=bert_tok.sep_token_id,
    pad_token=bert_tok.pad_token_id,
    unk_token=bert_tok.unk_token_id,
)

# labels are used as-is (no vocab) and stored as floats
LABEL = data.LabelField(dtype=torch.float, use_vocab=False)

In [3]:
# load the headlines dataset (one JSON record per line) directly from S3
sarcasm_df = pd.read_json('s3://mas-ml-projects/news-headlines-sarcasm-detector/data/Sarcasm_Headlines_Dataset_v2.json', lines=True)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
sarcasm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB
None


In [4]:
# map dataset column names onto the torchtext fields
fields = {'label' : LABEL, 'text' : TEXT}

# rename the raw columns to the field names used above
# TODO: let DataFrameDataset accept a column mapping so this rename is not needed
sarcasm_tt = DataFrameDataset(sarcasm_df.rename(columns={"is_sarcastic": "label", "headline": "text"}), fields)

SPLIT_SEED = 1234

# random.seed() returns None, so passing its result as random_state handed split()
# None and reproducibility relied only on the seeding side effect. Seed explicitly
# and pass the generator state instead (torchtext's Dataset.split documents
# random_state as a value returned by random.getstate(); this assumes
# DataFrameDataset inherits that split -- TODO confirm).

# 85% train / 15% test
random.seed(SPLIT_SEED)
headlines_train, headlines_test = sarcasm_tt.split(split_ratio=0.85, random_state=random.getstate())

# split the training portion again into train / validation (default split ratio)
random.seed(SPLIT_SEED)
headlines_train, headlines_valid = headlines_train.split(random_state=random.getstate())

In [5]:
# Final preprocessing step: wrap the three dataset splits in BucketIterators.
def _headline_length(example):
    """Sort key telling the BucketIterator how to group examples: length of `text`."""
    return len(example.text)


headline_splits = (headlines_train, headlines_valid, headlines_test)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    headline_splits,
    batch_size=16,
    sort_key=_headline_length,
    sort_within_batch=False,
    device=device,
)

In [6]:
# model hyperparameters
HIDDEN_DIM, OUTPUT_DIM = 256, 1
N_LAYERS, DROPOUT = 3, 0.35
L1, L2 = 100, 50

# build the classifier from the hyperparameters above, then move it to the
# selected device
model = BERTLSTMSentimentLast2(lang_helper, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)
model = model.to(device)

In [7]:
# optimisation: Adam with its default settings over the model's parameters
optimizer = optim.Adam(params=model.parameters())

# loss: binary cross-entropy with logits, moved to the same device as the model
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [None]:
# number of training epochs
N = 50

# the run name also determines the checkpoint file ('<name>.pt') loaded after training
model_name_str = f'LSTM Last 2 {DROPOUT * 100} drop {N_LAYERS} layers'

modelfitting.fit(
    n_epochs=N,
    model=model,
    train_iter=train_iterator,
    valid_iter=valid_iterator,
    optimizer=optimizer,
    criterion=criterion,
    model_name=model_name_str,
)

  0%|          | 0/50 [00:00<?, ?it/s]

validation done epoch # 1


  _warn_prf(average, modifier, msg_start, len(result))


validation done epoch # 2


  6%|▌         | 3/50 [05:50<1:31:07, 116.34s/it]

validation done epoch # 3


  8%|▊         | 4/50 [07:46<1:29:02, 116.14s/it]

validation done epoch # 4
validation done epoch # 5


 12%|█▏        | 6/50 [11:41<1:25:36, 116.75s/it]

validation done epoch # 6


 14%|█▍        | 7/50 [13:36<1:23:22, 116.33s/it]

validation done epoch # 7


 16%|█▌        | 8/50 [15:32<1:21:16, 116.11s/it]

validation done epoch # 8


 18%|█▊        | 9/50 [17:27<1:19:14, 115.95s/it]

validation done epoch # 9


 20%|██        | 10/50 [19:23<1:17:11, 115.78s/it]

validation done epoch # 10


 22%|██▏       | 11/50 [21:18<1:15:10, 115.65s/it]

validation done epoch # 11


 24%|██▍       | 12/50 [23:13<1:13:07, 115.45s/it]

validation done epoch # 12


 26%|██▌       | 13/50 [25:09<1:11:11, 115.46s/it]

validation done epoch # 13


 28%|██▊       | 14/50 [27:04<1:09:14, 115.40s/it]

validation done epoch # 14


 30%|███       | 15/50 [29:00<1:07:21, 115.46s/it]

validation done epoch # 15


In [None]:
# load best model; map_location remaps the saved tensors onto the current device,
# so a checkpoint written on a GPU instance still loads in a CPU-only session
model.load_state_dict(torch.load(model_name_str + '.pt', map_location=device))

# let's test out a few sample headlines:
# sarcastic one from The Onion, 5/10/2020 (https://www.theonion.com/experts-warn-unemployment-rate-could-soon-rise-to-ameri-1843348378)
print(single_eval(model, 'Experts Warn Unemployment Rate Could Soon Rise To America Is The Greatest Country In The World', lang_helper, device))

# real one from NPR, 5/10/2020 (https://www.npr.org/2020/05/10/852943513/the-people-flying-during-the-pandemic-and-how-airlines-are-trying-to-protect-the)
print(single_eval(model, 'The People Flying During The Pandemic And How Airlines Are Trying To Protect Them', lang_helper, device))

In [None]:
# how does it look on our test dataset?
test_loss, test_acc, test_precision, test_recall, test_f1 = modelfitting.evaluate(model, test_iterator, criterion)

# accuracy, precision, recall and F1 are all reported as percentages; precision was
# previously printed as a raw fraction next to percentage-scaled recall and F1
print(f'Test Loss: {test_loss :.3f} | Test Acc: {test_acc*100 :.2f}')
print(f'Test Precision: {test_precision*100 :.2f} | Test Recall: {test_recall*100 :.2f} | Test F1: {test_f1*100 :.2f}')

About 90% accuracy on the test set. Not too bad, considering most of the Kaggle front-page solutions report validation-set accuracies in the mid-80s!