# Data Loading and Preprocessing

In [None]:
#uploading kaggle file

from google.colab import files

uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
!pip install kaggle



In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle

!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c sentiment-analysis-of-tweets

Downloading test_samples.txt to /content
  0% 0.00/714k [00:00<?, ?B/s]
100% 714k/714k [00:00<00:00, 48.6MB/s]
Downloading train.txt.zip to /content
  0% 0.00/1.36M [00:00<?, ?B/s]
100% 1.36M/1.36M [00:00<00:00, 92.1MB/s]


In [None]:
from zipfile import ZipFile
filename = "train.txt.zip"

# Unpack the training archive into the current working directory.
# The handle is deliberately NOT named `zip`: a `with ... as zip` target stays
# bound after the block and would shadow the built-in zip() for every later cell.
with ZipFile(filename, 'r') as archive:
  archive.extractall()
  print("Done")

Done


In [None]:
import pandas as pd

train_data_path = "/content/train.txt"


tweet_data = pd.read_csv(train_data_path, sep=',')
tweet_data.head(5)

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


In [None]:
import re
from string import punctuation

def cleaning_tweets(tweets):
  """Normalise raw tweet strings before tokenisation.

  For every tweet, in this order: literal backslash-u hex escapes (a backslash,
  the letter u and four hex digits) are removed, '#' characters are dropped
  (the hashtag text is kept), @mentions are removed, all ASCII punctuation is
  stripped, tokens starting with "http" are removed, digits are removed and
  the result is lower-cased.

  Args:
    tweets: iterable of tweet strings.

  Returns:
    A list with one cleaned string per input tweet, in the same order.
  """
  # Built once: a translation table that deletes every character in string.punctuation.
  # TODO: find a way to avoid removing emoticons along with the punctuation.
  strip_punctuation = str.maketrans("", "", punctuation)
  cleaned_tweets = []

  for tweet in tweets:
    # Escapes are removed first, while the backslash is still present, so the
    # pattern can match the whole escape (hex digits included) and cannot eat
    # ordinary text such as "you2012".
    tweet = re.sub(r"\\u[0-9a-fA-F]{4}", "", tweet)
    tweet = re.sub(r"#", "", tweet)          # drop the hashtag sign, keep its text
    tweet = re.sub(r"@\S+", "", tweet)       # drop tagged usernames
    tweet = tweet.translate(strip_punctuation)
    tweet = re.sub(r"http\S*", "", tweet)    # drop hyperlinks
    tweet = re.sub(r"\d", "", tweet)         # drop digits
    cleaned_tweets.append(tweet.lower())

  return cleaned_tweets

In [None]:
tweet_data["cleaned_tweet"] = cleaning_tweets(tweet_data.loc[:,"tweet_text"])
tweet_data.head(5)

Unnamed: 0,tweet_id,sentiment,tweet_text,cleaned_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,gas by my house hit im going to chapel hill o...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott is still shit watch rafa and john...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,its not that im a gsp fan i just hate nick dia...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian general says israels iron dome cant de...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehran mon amour obama tried to establish ties...


In [None]:
from collections import Counter

label_counts = Counter(tweet_data.loc[:,"sentiment"])

In [None]:
label_counts

Counter({'negative': 3387, 'neutral': 9014, 'positive': 9064})

# Model 1: Simple RNN

In [None]:
import torch

bs = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torchtext.data import Field, LabelField
from sklearn.model_selection import train_test_split

text_field = Field(tokenize='spacy')

label_field = LabelField()

In [None]:
df = tweet_data.loc[:,['cleaned_tweet', 'sentiment']]

df.head(10)

Unnamed: 0,cleaned_tweet,sentiment
0,gas house hit going chapel hill sat happy,positive
1,theo walcott still shit watch rafa johnny deal...,negative
2,not gsp fan hate nick diaz cant wait february,negative
3,iranian general says israels iron dome cant de...,negative
4,tehran mon amour obama tried establish ties mu...,neutral
5,sat whole movie harry ron christmas ohlawd,neutral
6,davlar main rivals team poland hopefully make ...,positive
7,talking acts sats deciding want college applyi...,negative
8,why happy valentines day trending its february...,neutral
9,they may superbowl dallas dallas aint winning ...,negative


In [None]:
from torchtext.data import Dataset, Example

# Torchtext has no built-in way to load a pandas DataFrame as input data,
# so we define a subclass of its Dataset class that does.

class DataFrameDataset(Dataset):
    """Dataset whose examples are the rows of a pandas DataFrame.

    Each row's values, in column order, are handed to Example.fromlist
    together with `fields`, so `fields` must line up with the DataFrame's
    columns.
    """

    def __init__(self, df: pd.DataFrame, fields: list):
        examples = []
        for _, row in df.iterrows():
            examples.append(Example.fromlist(list(row), fields))
        super().__init__(examples, fields)

In [None]:
train_data, valid_data = DataFrameDataset(
    df=df, 
    fields=(
        ('text', text_field),
        ('label', label_field)
    )
).split(split_ratio=0.8) #creating dataset and splitting it into train and validation data

In [None]:
print("train_data example: ",vars(train_data.examples[0]))
print("valid_data example: ",vars(valid_data.examples[0]))

train_data example:  {'text': ['career', 'fair', 'tomorrow', 'murphy', 'center', 'dress', 'business', 'attire', 'bring', 'resumes'], 'label': 'neutral'}
valid_data example:  {'text': ['ill', 'share', 'care', 'clud', 'weird', 'crush', 'justin', 'sororithank', 'you', 'gin', 'real', 'life', 'comes', 'across', 'turn'], 'label': 'negative'}


In [None]:
print('No of training examples:', len(train_data))
print('No of validation examples:', len(valid_data))

No of training examples: 17172
No of validation examples: 4293


In [None]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB = 25000

# No pretrained `vectors` are passed here, so no embedding values are attached
# to the vocabulary by this call.
text_field.build_vocab(train_data, max_size = MAX_VOCAB)
label_field.build_vocab(train_data)

In [None]:
text_field.vocab.freqs.most_common(20)

[('tomorrow', 2741),
 ('may', 2284),
 ('you', 1727),
 ('day', 1402),
 ('the', 1295),
 ('going', 1229),
 ('nt', 1220),
 ('night', 1163),
 ('not', 1094),
 ('see', 990),
 ('friday', 987),
 ('like', 939),
 ('time', 936),
 ('game', 912),
 ('saturday', 867),
 ('happy', 865),
 ('get', 858),
 ('sunday', 856),
 ('new', 736),
 ('s', 719)]

In [None]:
len(text_field.vocab)

25002

In [None]:
BATCH_SIZE = 64

from torchtext.data import BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    device = device,
    sort_key= lambda x: len(x.text),
    sort_within_batch = False)

In [None]:
import torch.nn as nn

class RNN(nn.Module):
  """Embedding -> single-layer nn.RNN -> linear layer applied to the final hidden state."""

  def __init__(self, ip_dim, emb_dim, hid_dim, op_dim):
    """
    ip_dim:  number of rows in the embedding table
    emb_dim: size of each embedding vector (and of the RNN input)
    hid_dim: size of the RNN hidden state
    op_dim:  number of output scores produced per example
    """
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=ip_dim, embedding_dim=emb_dim)
    self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hid_dim)
    self.fc = nn.Linear(in_features=hid_dim, out_features=op_dim)

  def forward(self, text):
    """Return the linear layer's output for the RNN's final hidden state.

    nn.RNN is built with its default layout, so `text` is indexed as
    [seq_len, batch].
    """
    token_vectors = self.embedding(text)
    all_steps, last_hidden = self.rnn(token_vectors)
    last_hidden = last_hidden.squeeze(0)

    # Sanity check: the last time step of the output is the final hidden state.
    assert torch.equal(all_steps[-1], last_hidden)

    return self.fc(last_hidden)

In [None]:
input_dim = len(text_field.vocab)
emb_dim = 100
hidden_dim = 256
out_dim = len(label_field.vocab)

model = RNN(input_dim, emb_dim, hidden_dim, out_dim)

In [None]:
def num_params(model):
  """Return the total number of elements in the model's parameters that require gradients."""
  trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
  return sum(trainable_sizes)

print(f"The model has {num_params(model):,} trainable params")

The model has 2,592,619 trainable params


In [None]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calc_acc(preds, y):
  """Return the fraction of rows in `preds` whose arg-max along dim 1 equals `y`.

  Args:
    preds: score tensor; the predicted class is the arg-max along dim 1.
    y: tensor of target class indices, one per row of `preds`.

  Returns:
    A float tensor of shape [1] holding the batch accuracy, on `y`'s device.
  """
  correct = preds.argmax(dim=1).eq(y)
  # Build the denominator on the same device as the labels. A CPU-only
  # denominator mixes devices once the batch lives on the GPU, which PyTorch
  # can reject with a device-mismatch error.
  batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
  return correct.sum() / batch_size

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  For every batch: clear gradients, score `batch.text`, compute the loss and
  accuracy against `batch.label`, back-propagate and step the optimizer.

  Returns:
    (mean loss per batch, mean accuracy per batch) as Python floats.
  """
  model.train()

  loss_total = 0
  acc_total = 0

  for batch in iterator:
    optimizer.zero_grad()

    scores = model(batch.text).squeeze(1)
    batch_loss = criterion(scores, batch.label)
    batch_acc = calc_acc(scores, batch.label)

    batch_loss.backward()
    optimizer.step()

    loss_total += batch_loss.item()
    acc_total += batch_acc.item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion): 
  """
  Score the model on `iterator` without updating it.

  The model is switched to eval mode and gradient tracking is disabled, so no
  back-propagation happens here.

  Returns (mean loss per batch, mean accuracy per batch) as Python floats.
  """
  model.eval()

  batch_losses = []
  batch_accs = []

  with torch.no_grad():
    for batch in iterator:
      scores = model(batch.text).squeeze(1)
      batch_losses.append(criterion(scores, batch.label).item())
      batch_accs.append(calc_acc(scores, batch.label).item())

  return sum(batch_losses) / len(iterator), sum(batch_accs) / len(iterator)

In [None]:
import time

def epoch_time(start_time, end_time):
  """Split the interval end_time - start_time (in seconds) into whole (minutes, seconds)."""
  elapsed = end_time - start_time
  whole_minutes = int(elapsed / 60)
  # Whatever is left after the whole minutes, truncated to whole seconds.
  leftover_seconds = int(elapsed - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [None]:
# Number of full passes over the training iterator.
epochs = 300 

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(epochs):
  start_time = time.time()

  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  # Wall-clock time for training plus validation of this epoch.
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights with the best validation loss.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'phase-1.pt')

  # Log the first epoch and every tenth epoch after that.
  if ((epoch + 1) % 10 == 0) or (epoch == 0):
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 1.025 | Train Acc: 42.19%
	Val. Loss: 1.108 | Val. Acc: 32.83%
Epoch: 10 | Epoch Time: 0m 2s
	Train Loss: 1.018 | Train Acc: 42.03%
	Val. Loss: 1.101 | Val. Acc: 33.49%
Epoch: 20 | Epoch Time: 0m 2s
	Train Loss: 1.018 | Train Acc: 42.28%
	Val. Loss: 1.095 | Val. Acc: 35.01%
Epoch: 30 | Epoch Time: 0m 1s
	Train Loss: 1.018 | Train Acc: 42.15%
	Val. Loss: 1.090 | Val. Acc: 35.68%
Epoch: 40 | Epoch Time: 0m 2s
	Train Loss: 1.017 | Train Acc: 42.56%
	Val. Loss: 1.085 | Val. Acc: 36.93%
Epoch: 50 | Epoch Time: 0m 1s
	Train Loss: 1.018 | Train Acc: 42.99%
	Val. Loss: 1.079 | Val. Acc: 37.76%
Epoch: 60 | Epoch Time: 0m 2s
	Train Loss: 1.017 | Train Acc: 42.74%
	Val. Loss: 1.075 | Val. Acc: 38.64%
Epoch: 70 | Epoch Time: 0m 2s
	Train Loss: 1.017 | Train Acc: 42.58%
	Val. Loss: 1.072 | Val. Acc: 39.03%
Epoch: 80 | Epoch Time: 0m 2s
	Train Loss: 1.017 | Train Acc: 42.81%
	Val. Loss: 1.069 | Val. Acc: 39.78%
Epoch: 90 | Epoch Time: 0m 1s
	Train Loss: 1.0

Phase1: Validation accuracy: 46.51%

# Model 2: Pretrained word embeddings and bidirectional RNN with LSTM

In [None]:
import torch
from torchtext import data
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(strip_handles = True, reduce_len = True)

In [None]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

id = data.Field()
tweet = data.Field(tokenize = tokenizer.tokenize, include_lengths = True) #tweet will be a tuple ([tokes], len(tweet))
label = data.LabelField()

In [None]:
from torchtext.data import Dataset, Example
import pandas as pd

# Torchtext has no built-in way to load a pandas DataFrame as input data,
# so we define a subclass of its Dataset class that does.

class DataFrameDataset(Dataset):
    """Dataset whose examples are the rows of a pandas DataFrame.

    Each row's values, in column order, are handed to Example.fromlist
    together with `fields`, so `fields` must line up with the DataFrame's
    columns.
    """

    def __init__(self, df: pd.DataFrame, fields: list):
        examples = []
        for _, row in df.iterrows():
            examples.append(Example.fromlist(list(row), fields))
        super().__init__(examples, fields)

In [None]:
df = tweet_data.loc[:,[
                       #'tweet_id',
                       'cleaned_tweet', 
                       'sentiment'
                       ]]

df.head(10)

Unnamed: 0,cleaned_tweet,sentiment
0,gas by my house hit im going to chapel hill o...,positive
1,theo walcott is still shit watch rafa and john...,negative
2,its not that im a gsp fan i just hate nick dia...,negative
3,iranian general says israels iron dome cant de...,negative
4,tehran mon amour obama tried to establish ties...,neutral
5,i sat through this whole movie just for harry ...,neutral
6,with j davlar th main rivals are team poland h...,positive
7,talking about acts sats deciding where i want...,negative
8,why is happy valentines day trending its on th...,neutral
9,they may have a superbowl in dallas but dallas...,negative


In [None]:
import random

# Seed Python's RNG before capturing its state: random.getstate() on an
# unseeded generator differs between runs, so the train/validation partition
# would not be reproducible even though SEED is set for torch.
random.seed(SEED)

train_data, valid_data = DataFrameDataset(
    df=df, 
    fields=(
        # ('id', id),
        ('text', tweet),
        ('label', label)
    ),
).split(split_ratio=0.8, random_state = random.getstate()) #creating dataset and splitting it into train and validation data

In [None]:
print(vars(train_data.examples[0]))
print(vars(valid_data.examples[0]))

{'text': ['can', 'it', 'please', 'be', 'wednesday', 'and', 'potus', 'has', 'been', 're', 'elected', 'then', 'we', 'can', 'move', 'this', 'country', 'forward'], 'label': 'neutral'}
{'text': ['remembering', 'that', 'time', 'i', 'said', 'may', 'the', 'force', 'be', 'with', 'you', 'to', 'buzz', 'lightyear', 'at', 'disneyland', 'toinfinitiandbeyond'], 'label': 'positive'}


In [None]:
# max_vocab_size = 25000

#using pretrained embeddings
tweet.build_vocab(train_data,
                  # max_size = max_vocab_size,
                  vectors = "glove.twitter.27B.100d",
                  unk_init = torch.Tensor.normal_) # in case a word is not present in the pretrained embedding, it will be assigned a random value from normal distribution, instead of zero

label.build_vocab(train_data)

.vector_cache/glove.twitter.27B.zip: 1.52GB [11:44, 2.16MB/s]                            
100%|█████████▉| 1193334/1193514 [01:07<00:00, 18263.50it/s]

In [None]:
print(len(tweet.vocab))
print(len(label.vocab))

25647
3


In [None]:
print(label.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f1c80622400>, {'positive': 0, 'neutral': 1, 'negative': 2})


In [None]:
batches = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = batches,
    sort_within_batch = True,
    sort_key= lambda x: len(x.text),
    device = device)

In [None]:
import torch.nn as nn

class RNN_LSTM(nn.Module):
  """Embedding -> (optionally bidirectional, multi-layer) LSTM -> linear classifier.

  The classifier reads the final hidden state of the top LSTM layer; for a
  bidirectional LSTM the last two hidden-state slices are concatenated.
  """

  def __init__(self, vocab_size, emb_dim, hid_dim, op_dim, n_layers, bidirect, drop, pad_idx):
    """
    vocab_size: number of rows in the embedding table
    emb_dim:    size of each embedding vector (and of the LSTM input)
    hid_dim:    size of the LSTM hidden state per direction
    op_dim:     number of output scores per example
    n_layers:   number of stacked LSTM layers
    bidirect:   whether the LSTM is bidirectional
    drop:       dropout probability (embeddings, final hidden state and,
                when n_layers >= 2, between LSTM layers)
    pad_idx:    vocabulary index of the padding token
    """
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx = pad_idx)
    # Sizes come from the constructor arguments, not from notebook globals, so
    # the class works for any emb_dim / hid_dim and regardless of cell order.
    self.rnn = nn.LSTM(emb_dim, hid_dim,
                      num_layers = n_layers,
                      bidirectional = bidirect,
                      dropout  = 0 if n_layers < 2 else drop)
    self.fc = nn.Linear(hid_dim * 2 if bidirect else hid_dim, op_dim)
    self.dropout = nn.Dropout(drop)

  def forward(self, text, text_lengths):
    """Return class scores for a padded batch.

    text:         token indices; the LSTM uses its default layout, so this is
                  indexed as [seq_len, batch]
    text_lengths: true (unpadded) length of each sequence in the batch
    """
    embedded = self.dropout(self.embedding(text))

    # Packing tells the LSTM each sequence's true length so the <pad>
    # positions are not processed. pack_padded_sequence wants the lengths on
    # the CPU, so a tensor that lives on the GPU is moved there first.
    lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
    _, (hidden, _) = self.rnn(packed_embedded)

    if self.rnn.bidirectional:
      # Last two slices of the hidden state, joined along the feature dimension.
      hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim = 1))
    else:
      hidden = self.dropout(hidden[-1])

    return self.fc(hidden)

In [None]:
input_dim = len(tweet.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = len(label.vocab)
n_layers = 2
bi_dir = True
dropout = 0.4
pad_idx = tweet.vocab.stoi[tweet.pad_token]

model = RNN_LSTM(input_dim, embedding_dim, 
            hidden_dim, output_dim,
            n_layers, bi_dir,
            dropout, pad_idx)

In [None]:
def count_parameters(model):
  """Return how many scalar values the optimizer can update (parameters with requires_grad set)."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,877,154 trainable parameters


In [None]:
pretrain_embed = tweet.vocab.vectors #GloVe vectors (100d)

print(pretrain_embed.shape) # [vocab_len, 100]

torch.Size([25647, 100])


In [None]:
model.embedding.weight.data.copy_(pretrain_embed) #initialising model word embeddings with GloVe embeds

#<unk> and <pad> are not in GloVe vocab so they are intialised randomly from a normal distribution

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [ 0.0952,  0.3702,  0.5429,  ..., -0.5108,  0.4688,  0.3488],
        ...,
        [-0.2369, -0.2940,  0.0918,  ..., -0.4967, -0.1751,  0.1848],
        [ 0.1977,  0.1966,  0.3419,  ...,  0.1391,  0.3802,  0.6479],
        [-0.9683,  0.7869,  1.5518,  ..., -0.2938,  1.3459, -0.7236]])

In [None]:
unk_idx = tweet.vocab.stoi[tweet.unk_token]

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim) #setting <unk> embedding as zero to teach model that the token is irrelevant for sentiment analysis
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim) #setting <pad> embedding as zero to teach model that the token is irrelevant for sentiment analysis

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0952,  0.3702,  0.5429,  ..., -0.5108,  0.4688,  0.3488],
        ...,
        [-0.2369, -0.2940,  0.0918,  ..., -0.4967, -0.1751,  0.1848],
        [ 0.1977,  0.1966,  0.3419,  ...,  0.1391,  0.3802,  0.6479],
        [-0.9683,  0.7869,  1.5518,  ..., -0.2938,  1.3459, -0.7236]])


In [None]:
import torch.optim as optim 

optimizer = optim.Adagrad(model.parameters())

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calc_acc(preds, y):
  """Return the fraction of rows in `preds` whose arg-max along dim 1 equals `y`.

  Args:
    preds: score tensor; the predicted class is the arg-max along dim 1.
    y: tensor of target class indices, one per row of `preds`.

  Returns:
    A float tensor of shape [1] holding the batch accuracy, on `y`'s device.
  """
  correct = preds.argmax(dim=1).eq(y)
  # Build the denominator on the same device as the labels. A CPU-only
  # denominator mixes devices once the batch lives on the GPU, which PyTorch
  # can reject with a device-mismatch error.
  batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
  return correct.sum() / batch_size

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  `batch.text` is unpacked into (token tensor, lengths) and both are passed
  to the model. For every batch: clear gradients, score, compute the loss and
  accuracy against `batch.label`, back-propagate and step the optimizer.

  Returns:
    (mean loss per batch, mean accuracy per batch) as Python floats.
  """
  model.train()

  loss_total = 0
  acc_total = 0

  for batch in iterator:
    optimizer.zero_grad()

    tokens, lengths = batch.text
    scores = model(tokens, lengths).squeeze(1)
    batch_loss = criterion(scores, batch.label)
    batch_acc = calc_acc(scores, batch.label)

    batch_loss.backward()
    optimizer.step()

    loss_total += batch_loss.item()
    acc_total += batch_acc.item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
  """Score the model on `iterator` without updating it.

  The model is switched to eval mode and gradient tracking is disabled.
  `batch.text` is unpacked into (token tensor, lengths) for the model.

  Returns:
    (mean loss per batch, mean accuracy per batch) as Python floats.
  """
  model.eval()

  batch_losses = []
  batch_accs = []

  with torch.no_grad():
    for batch in iterator:
      tokens, lengths = batch.text
      scores = model(tokens, lengths).squeeze(1)
      batch_losses.append(criterion(scores, batch.label).item())
      batch_accs.append(calc_acc(scores, batch.label).item())

  return sum(batch_losses) / len(iterator), sum(batch_accs) / len(iterator)

In [None]:
import time 

def epoch_time(start_time, end_time):
  """Convert the span between two time.time() readings into whole (minutes, seconds)."""
  span = end_time - start_time
  minutes = int(span / 60)
  # Seconds remaining once the whole minutes are taken out, truncated.
  seconds = int(span - 60 * minutes)
  return minutes, seconds

In [None]:
# Number of full passes over the training iterator.
num_epochs = 15 

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(num_epochs):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

  end_time = time.time()

  # Wall-clock time for training plus validation of this epoch.
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights with the best validation loss; reloaded later for prediction.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'RNN_LSTM.pt')

  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\tVal. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 0.882 | Train Acc: 56.86%
	Val. Loss: 0.811 | Val. Acc: 61.34%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 0.768 | Train Acc: 63.93%
	Val. Loss: 0.782 | Val. Acc: 63.27%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 0.717 | Train Acc: 67.46%
	Val. Loss: 0.768 | Val. Acc: 64.67%
Epoch: 04 | Epoch Time: 0m 7s
	Train Loss: 0.684 | Train Acc: 69.29%
	Val. Loss: 0.759 | Val. Acc: 65.34%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 0.650 | Train Acc: 70.97%
	Val. Loss: 0.770 | Val. Acc: 64.86%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 0.623 | Train Acc: 72.58%
	Val. Loss: 0.773 | Val. Acc: 65.17%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 0.603 | Train Acc: 73.57%
	Val. Loss: 0.764 | Val. Acc: 64.66%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 0.579 | Train Acc: 74.92%
	Val. Loss: 0.787 | Val. Acc: 64.79%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.555 | Train Acc: 76.04%
	Val. Loss: 0.777 | Val. Acc: 65.50%
Epoch: 10 | Epoch Time: 0m 7s
	Train Loss: 0.5

In [None]:
test_data_path = "/content/test_samples.txt"

test_tweets = pd.read_csv(test_data_path, sep=',')
test_tweets.head(10)

Unnamed: 0,tweet_id,tweet_text
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,171874368908050432,"Excuse the connectivity of this live stream, f..."
5,256010056942903296,Show your LOVE for your local field & it might...
6,253809989599232000,"Milton on Bolton Wanderers 2 v 2 Leeds United,..."
7,261776619146985472,@firecore Can you tell me when an update for t...
8,264143999374356481,"@Heavensbasement The Crown, Filthy McNastys, K..."
9,223052929131757571,Uncover the Eternal City! Return flights to Ro...


In [None]:
test_tweets["cleaned_test_tweet"] = cleaning_tweets(test_tweets.loc[:,"tweet_text"])
test_tweets.head(5)

Unnamed: 0,tweet_id,tweet_text,cleaned_test_tweet
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio fm fri oct labour analyst shawn hat...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...


In [None]:
test_df = test_tweets.loc[:,["tweet_id","cleaned_test_tweet"]]

test_data = DataFrameDataset(
    df=test_df, 
    fields=(
        ('id', id),
        ('text', tweet)
    )
)

print(len(test_data))
print(vars(test_data.examples[0]))

5398
{'id': 264238274963451904, 'text': ['down', 'in', 'the', 'atlantic', 'city', 'ventnor', 'margate', 'ocean', 'city', 'area', 'im', 'just', 'waiting', 'for', 'the', 'coordinator', 'to', 'hopefully', 'call', 'me', 'tomorrow']}


In [None]:
# Restore the best-validation-loss weights. map_location remaps the saved
# tensors onto the current device, so a checkpoint written on a GPU runtime
# still loads when the notebook is re-run on a CPU-only runtime.
model.load_state_dict(torch.load('RNN_LSTM.pt', map_location=device))

<All keys matched successfully>

In [None]:
def predict_class(model, tokens):
    """Return the predicted class index for a single tokenised tweet.

    Args:
        model: a model called as model(token_tensor, length_tensor).
        tokens: list of token strings; they are mapped to indices through the
            notebook-level `tweet` field's vocabulary.

    Returns:
        The arg-max class index as a Python int.
    """
    model.eval()
    # A tweet can be empty after cleaning (e.g. it was only a mention or a
    # link). The LSTM model in this notebook packs sequences by length, and
    # packing rejects zero-length sequences, so score a single <unk> instead.
    if not tokens:
        tokens = [tweet.unk_token]
    indexed = [tweet.vocab.stoi[t] for t in tokens]
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)  # one column: a batch of one
    length_tensor = torch.LongTensor([len(indexed)])
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        preds = model(tensor, length_tensor)
    return preds.argmax(dim = 1).item()

In [None]:
pred_class = predict_class(model, "the movie is about a tiger".split())
print(pred_class)

1


In [None]:
# Plain-dict copy of the label vocabulary (label string -> class index).
labels = dict(label.vocab.stoi)

# Inverse mapping (class index -> label string), used to decode predictions.
labels_idx = {index: name for name, index in labels.items()}
print(labels_idx)

{0: 'positive', 1: 'neutral', 2: 'negative'}


In [None]:
# Predict a class index for every test example, in dataset order.
test_tweets["pred_label"] = [
    predict_class(model, example.text) for example in test_data.examples
]

test_tweets.head(10)

Unnamed: 0,tweet_id,tweet_text,cleaned_test_tweet,pred_label
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...,1
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...,0
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio fm fri oct labour analyst shawn hat...,1
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...,1
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...,1
5,256010056942903296,Show your LOVE for your local field & it might...,show your love for your local field it might ...,0
6,253809989599232000,"Milton on Bolton Wanderers 2 v 2 Leeds United,...",milton on bolton wanderers v leeds united sa...,1
7,261776619146985472,@firecore Can you tell me when an update for t...,can you tell me when an update for the apple ...,1
8,264143999374356481,"@Heavensbasement The Crown, Filthy McNastys, K...",the crown filthy mcnastys katy dalys or the d...,0
9,223052929131757571,Uncover the Eternal City! Return flights to Ro...,uncover the eternal city return flights to rom...,1


In [None]:
test_tweets["sentiment"] = [labels_idx[idx] for idx in test_tweets.loc[:,"pred_label"]]

test_tweets.head(10)

Unnamed: 0,tweet_id,tweet_text,cleaned_test_tweet,pred_label,sentiment
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...,1,neutral
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...,0,positive
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio fm fri oct labour analyst shawn hat...,1,neutral
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...,1,neutral
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...,1,neutral
5,256010056942903296,Show your LOVE for your local field & it might...,show your love for your local field it might ...,0,positive
6,253809989599232000,"Milton on Bolton Wanderers 2 v 2 Leeds United,...",milton on bolton wanderers v leeds united sa...,1,neutral
7,261776619146985472,@firecore Can you tell me when an update for t...,can you tell me when an update for the apple ...,1,neutral
8,264143999374356481,"@Heavensbasement The Crown, Filthy McNastys, K...",the crown filthy mcnastys katy dalys or the d...,0,positive
9,223052929131757571,Uncover the Eternal City! Return flights to Ro...,uncover the eternal city return flights to rom...,1,neutral


In [None]:
final_df = test_tweets.loc[:,["tweet_id", "sentiment"]]

final_df.head(10)

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,positive
2,258965201766998017,neutral
3,262926411352903682,neutral
4,171874368908050432,neutral
5,256010056942903296,positive
6,253809989599232000,neutral
7,261776619146985472,neutral
8,264143999374356481,positive
9,223052929131757571,neutral


In [None]:
final_df.to_csv("HrishitaC_IITK_NLP.csv", index=False)

# Model 3: CNN

In [None]:
df = tweet_data.loc[:,['cleaned_tweet', 'sentiment']]
df.head(10)

Unnamed: 0,cleaned_tweet,sentiment
0,gas house hit going chapel hill sat happy,positive
1,theo walcott still shit watch rafa johnny deal...,negative
2,not gsp fan hate nick diaz cant wait february,negative
3,iranian general says israels iron dome cant de...,negative
4,tehran mon amour obama tried establish ties mu...,neutral
5,sat whole movie harry ron christmas ohlawd,neutral
6,davlar main rivals team poland hopefully make ...,positive
7,talking acts sats deciding want college applyi...,negative
8,why happy valentines day trending its february...,neutral
9,they may superbowl dallas dallas aint winning ...,negative


In [None]:
import torch
from torchtext import data
import numpy as np

In [None]:
tweet = data.Field(tokenize = 'spacy')
label = data.LabelField()

In [None]:
from torchtext.data import Dataset, Example
import pandas as pd

# Torchtext has no built-in way to load a pandas DataFrame as input data,
# so we define a subclass of its Dataset class that does.

class DataFrameDataset(Dataset):
    """Dataset whose examples are the rows of a pandas DataFrame.

    Each row's values, in column order, are handed to Example.fromlist
    together with `fields`, so `fields` must line up with the DataFrame's
    columns.
    """

    def __init__(self, df: pd.DataFrame, fields: list):
        examples = []
        for _, row in df.iterrows():
            examples.append(Example.fromlist(list(row), fields))
        super().__init__(examples, fields)

In [None]:
train_data, valid_data = DataFrameDataset(
    df=df, 
    fields=(
        ('text', tweet),
        ('label', label)
    )
).split(split_ratio=0.8)

In [None]:
print(vars(train_data[0]))
print(vars(valid_data[0]))

{'text': ['career', 'fair', 'tomorrow', 'murphy', 'center', 'dress', 'business', 'attire', 'bring', 'resumes'], 'label': 'neutral'}
{'text': ['ill', 'share', 'care', 'clud', 'weird', 'crush', 'justin', 'sororithank', 'you', 'gin', 'real', 'life', 'comes', 'across', 'turn'], 'label': 'negative'}


In [None]:
max_vocab_size = 25000

tweet.build_vocab(train_data,
                  max_size = max_vocab_size,
                  vectors = "glove.twitter.27B.100d",
                  unk_init = torch.Tensor.normal_)

label.build_vocab(train_data)

In [None]:
print(len(tweet.vocab))
print(len(label.vocab))

25002
3


In [None]:
print(label.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fd91c97fea0>, {'positive': 0, 'neutral': 1, 'negative': 2})


In [None]:
batches = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = batches, 
    sort_within_batch = True,
    sort_key= lambda x: len(x.text),
    device = device)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
  def __init__(self, vocab_size, emb_dim, n_filters, filter_sizes,
               op_dim, drop, 
              #  pad,
               pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx = pad_idx)
    # self.pad = nn.ZeroPad2d((0,0,pad,pad)) #(left, right, top, bottom)
    self.convs = nn.ModuleList([
                                nn.Conv2d(in_channels = 1,
                                          out_channels = n_filters,
                                          kernel_size = (fs, emb_dim))
                                for fs in filter_sizes
                                ]) 
    self.fc = nn.Linear(len(filter_sizes)*n_filters, op_dim)
    self.dropout = nn.Dropout(drop)

  def forward(self, text):
    text = text.permute(1, 0)
    embedded = self.embedding(text)
    embedded = embedded.unsqueeze(1)
    # if pad > 0:
    #   embedded = self.pad(embedded)
    conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
    pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
    cat = self.dropout(torch.cat(pooled, dim=1))

    return self.fc(cat)

In [None]:
input_dim = len(tweet.vocab)
embedding_dim = 100
n_filters = 100
filter_sizes = [2, 3, 4]
op_dim = len(label.vocab)
dropout = 0.4
# padding = 0
pad_idx = tweet.vocab.stoi[tweet.pad_token]

model = CNN(input_dim, embedding_dim, n_filters, filter_sizes, op_dim, dropout, 
            # padding, 
            pad_idx)

In [None]:
def count_params(model):
  """Return the total number of elements across parameters that require gradients."""
  trainable = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable += param.numel()
  return trainable

print(f'The model has {count_params(model):,} trainable parameters')

The model has 2,591,403 trainable parameters


In [None]:
# Vectors stored on the text vocabulary (presumably attached when the vocab
# was built with pretrained embeddings — see the earlier build_vocab cell).
pretrained_emb = tweet.vocab.vectors

# Overwrite the embedding layer's weights in place with those vectors; the
# shapes must match exactly for copy_ to succeed.
model.embedding.weight.data.copy_(pretrained_emb)

tensor([[-0.2501,  0.4024,  0.9963,  ...,  1.2980,  0.1232,  0.4092],
        [-1.7404,  0.3036,  0.0273,  ..., -1.2587,  0.2789,  0.5907],
        [-0.5093,  0.2515,  0.1390,  ...,  0.7849, -0.3699, -0.4106],
        ...,
        [-1.1484,  1.1011, -0.5901,  ..., -0.0772,  0.0206,  1.3303],
        [-0.4908,  1.1556, -0.1981,  ...,  0.6285, -0.5695, -0.0437],
        [ 0.8661,  0.2620,  0.7279,  ..., -0.7392,  0.1248,  0.5263]])

In [None]:
# Index of the unknown-word token in the text vocabulary.
unk_idx = tweet.vocab.stoi[tweet.unk_token]

# Reset the <unk> and <pad> embedding rows to all zeros. This runs after the
# pretrained vectors were copied in, which overwrote whatever those rows held.
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

In [None]:
import torch.optim as optim

# Adam with its default hyperparameters over every model parameter.
optimizer = optim.Adam(model.parameters())

# Multi-class loss; it is fed the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

# Move the model and the loss module to the device chosen earlier.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calc_acc(preds, y):
  """Return the fraction of rows whose arg-max prediction equals the label.

  Args:
    preds: tensor of per-class scores, one row per example ([batch, n_classes]).
    y: tensor of integer class ids, one per example ([batch]).

  Returns:
    A float tensor of shape (1,) on the same device as the inputs; callers
    read it with `.item()`.
  """
  predicted = preds.argmax(dim=1)
  correct = predicted.eq(y)
  # Build the denominator on the same device as the comparison result. A bare
  # torch.FloatTensor always lives on the CPU, which mixes devices when the
  # batch is on the GPU.
  n_examples = torch.tensor([y.shape[0]], dtype=torch.float, device=correct.device)
  # Cast the count explicitly so the division is a float division.
  return correct.sum().float() / n_examples

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  Each batch is read through its `.text` and `.label` attributes. Returns a
  tuple (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.train()

  loss_total, acc_total = 0, 0

  for batch in iterator:
    optimizer.zero_grad()
    logits = model(batch.text)
    batch_loss = criterion(logits, batch.label)
    batch_acc = calc_acc(logits, batch.label)
    batch_loss.backward()
    optimizer.step()

    loss_total += batch_loss.item()
    acc_total += batch_acc.item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
  """Score `model` on `iterator` without updating any weights.

  The model is switched to eval mode and gradients are disabled. Returns a
  tuple (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.eval()

  loss_total, acc_total = 0, 0

  with torch.no_grad():
    for batch in iterator:
      logits = model(batch.text)
      loss_total += criterion(logits, batch.label).item()
      acc_total += calc_acc(logits, batch.label).item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
  """Split the interval between two timestamps (seconds) into (minutes, seconds).

  Both parts are truncated to integers; the seconds part is what remains after
  removing the whole minutes.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - whole_minutes * 60)
  return whole_minutes, leftover_seconds

In [None]:
# Train the CNN for a fixed number of epochs, checkpointing on validation loss.
epochs = 20
best_valid_loss = float('inf')  # lowest validation loss seen so far

for epoch in range(epochs):
  start_time = time.time()
  # One pass over the training set (weights updated), then one over validation.
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights from the epoch with the best validation loss; later
  # epochs that do worse do not overwrite the file.
  # NOTE(review): the checkpoint is not reloaded in this cell, so `model`
  # still holds the last-epoch weights when the loop finishes.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'phase-4.pt')

  print(f'Epoch {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\tVal. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

Epoch 01 | Epoch Time: 0m 2s
	Train Loss: 0.885 | Train Acc: 56.12%
	Val. Loss: 0.784 | Val. Acc: 64.14%
Epoch 02 | Epoch Time: 0m 2s
	Train Loss: 0.716 | Train Acc: 67.77%
	Val. Loss: 0.746 | Val. Acc: 66.46%
Epoch 03 | Epoch Time: 0m 2s
	Train Loss: 0.584 | Train Acc: 75.77%
	Val. Loss: 0.787 | Val. Acc: 65.40%
Epoch 04 | Epoch Time: 0m 1s
	Train Loss: 0.457 | Train Acc: 81.62%
	Val. Loss: 0.832 | Val. Acc: 64.94%
Epoch 05 | Epoch Time: 0m 1s
	Train Loss: 0.330 | Train Acc: 87.39%
	Val. Loss: 0.949 | Val. Acc: 63.73%
Epoch 06 | Epoch Time: 0m 1s
	Train Loss: 0.232 | Train Acc: 91.69%
	Val. Loss: 1.070 | Val. Acc: 62.85%
Epoch 07 | Epoch Time: 0m 1s
	Train Loss: 0.170 | Train Acc: 94.19%
	Val. Loss: 1.202 | Val. Acc: 62.00%
Epoch 08 | Epoch Time: 0m 1s
	Train Loss: 0.114 | Train Acc: 96.39%
	Val. Loss: 1.381 | Val. Acc: 61.89%
Epoch 09 | Epoch Time: 0m 2s
	Train Loss: 0.084 | Train Acc: 97.46%
	Val. Loss: 1.486 | Val. Acc: 61.73%
Epoch 10 | Epoch Time: 0m 1s
	Train Loss: 0.068 | Train

Phase4: Val Accuracy: 64.27%

# Model 4: Transformers

In [None]:
df = tweet_data.loc[:,['cleaned_tweet', 'sentiment']]
df.head(10)

Unnamed: 0,cleaned_tweet,sentiment
0,gas house hit going chapel hill sat happy,positive
1,theo walcott still shit watch rafa johnny deal...,negative
2,not gsp fan hate nick diaz cant wait february,negative
3,iranian general says israels iron dome cant de...,negative
4,tehran mon amour obama tried establish ties mu...,neutral
5,sat whole movie harry ron christmas ohlawd,neutral
6,davlar main rivals team poland hopefully make ...,positive
7,talking acts sats deciding want college applyi...,negative
8,why happy valentines day trending its february...,neutral
9,they may superbowl dallas dallas aint winning ...,negative


In [None]:
import torch
from torchtext import data
import numpy as np

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 7.5MB/s eta 0:00:01[K     |▉                               | 20kB 4.7MB/s eta 0:00:01[K     |█▎                              | 30kB 6.0MB/s eta 0:00:01[K     |█▊                              | 40kB 6.2MB/s eta 0:00:01[K     |██▏                             | 51kB 5.2MB/s eta 0:00:01[K     |██▋                             | 61kB 5.8MB/s eta 0:00:01[K     |███                             | 71kB 6.3MB/s eta 0:00:01[K     |███▍                            | 81kB 6.7MB/s eta 0:00:01[K     |███▉                            | 92kB 6.6MB/s eta 0:00:01[K     |████▎                           | 102kB 6.6MB/s eta 0:00:01[K     |████▊                           | 112kB 6.6MB/s eta 0:00:01[K     |█████▏                          | 122kB 6.6MB

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
len(tokenizer.vocab)

30522

In [None]:
# Special-token strings as defined by the BERT tokenizer.
init_token = tokenizer.cls_token # beginning of a sentence token
eos_token = tokenizer.sep_token # end of sentence token
pad_token = tokenizer.pad_token # padding token
unk_token = tokenizer.unk_token # token used for out-of-vocabulary input

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# Integer ids of the same special tokens in the tokenizer's vocabulary. The
# Field below takes ids rather than strings because it is built with
# use_vocab = False.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [None]:
def tokenize_and_cut(sentence):
  """Tokenize `sentence` with the module-level tokenizer and truncate it.

  At most `max_input_length - 2` tokens are kept, leaving two positions free
  for the [CLS] and [SEP] tokens that are added afterwards.
  """
  token_budget = max_input_length - 2
  return tokenizer.tokenize(sentence)[:token_budget]

In [None]:
# Text field backed by the BERT tokenizer instead of a torchtext vocabulary:
#   - use_vocab = False: examples already hold integer ids, so no vocab is built;
#   - tokenize: word-piece tokenization with truncation (tokenize_and_cut);
#   - preprocessing: maps the resulting tokens to their BERT vocabulary ids;
#   - init/eos/pad/unk tokens are given as ids for the same reason;
#   - batch_first = True: batches come out as [batch, seq_len].
tweet = data.Field(batch_first = True,
                   use_vocab = False,
                   tokenize = tokenize_and_cut,
                   preprocessing = tokenizer.convert_tokens_to_ids,
                   init_token = init_token_idx,
                   eos_token = eos_token_idx,
                   pad_token = pad_token_idx,
                   unk_token = unk_token_idx)

# Label field; its vocabulary (label string -> class id) is built further down.
label = data.LabelField()

In [None]:
from torchtext.data import Dataset, Example
import pandas as pd

# Torchtext does not have any built-in method to deal with dataframes
# as input data, hence we create a child class of the Dataset class

class DataFrameDataset(Dataset):
    """torchtext Dataset built directly from a pandas DataFrame.

    Every row becomes one Example. Column values are matched to `fields` by
    position, so the DataFrame's column order must follow the order of the
    (name, Field) pairs.

    Args:
        df: source frame, one example per row.
        fields: sequence of (attribute_name, Field) pairs, one per column.
    """

    def __init__(self, df: pd.DataFrame, fields: list):
        # itertuples yields plain tuples without building a Series per row,
        # which is faster than iterrows and keeps each column's own dtype.
        examples = [
            Example.fromlist(list(row), fields)
            for row in df.itertuples(index=False, name=None)
        ]
        super(DataFrameDataset, self).__init__(examples, fields)

In [None]:
# Wrap the DataFrame as a torchtext dataset (columns map positionally to the
# 'text' and 'label' fields) and split it 80/20 into train and validation.
# NOTE(review): no random_state is passed to split(), so the partition
# presumably changes from run to run — confirm whether a fixed seed is wanted.
train_data, valid_data = DataFrameDataset(
    df=df, 
    fields=(
        ('text', tweet),
        ('label', label)
    )
).split(split_ratio=0.8)

In [None]:
print(vars(train_data[0]))
print(vars(valid_data[0]))

{'text': [2388, 3336, 2938, 26220, 22012, 15854, 8248, 4476], 'label': 'positive'}
{'text': [2215, 3422, 2108, 2529, 1998, 2438, 2769, 4965, 3482, 13462, 2237, 4826, 2004, 20952, 5603, 15992, 2140], 'label': 'neutral'}


In [None]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data[0])['text'])
print(tokens)

['mother', 'baby', 'sat', 'ric', 'flair', 'woo', 'brush', 'fame']


In [None]:
# No text vocabulary is built for this model: the tweet field uses the BERT
# tokenizer's ids directly (use_vocab = False). The GloVe-based vocabulary
# setup is kept below, commented out, for reference.
# max_vocab_size = 25000

# tweet.build_vocab(train_data,
#                   max_size = max_vocab_size,
#                   vectors = "glove.twitter.27B.100d",
#                   unk_init = torch.Tensor.normal_)

# Build the label -> class-id mapping from the training split only.
label.build_vocab(train_data)

In [None]:
# print(len(tweet.vocab))
print(len(label.vocab))

3


In [None]:
print(label.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f6f53a0b268>, {'neutral': 0, 'positive': 1, 'negative': 2})


In [None]:
# Mini-batch size shared by the training and validation iterators.
batches = 128

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split. Examples are ordered by token count (sort_key) and
# each batch is kept sorted by that key (sort_within_batch); batch tensors are
# created on `device`.
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = batches, 
    sort_within_batch = True,
    sort_key= lambda x: len(x.text),
    device = device)

In [None]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
  """Sentiment classifier: BERT token features -> (bi)GRU -> linear layer.

  BERT is used purely as a feature extractor (its forward pass runs under
  torch.no_grad()); only the GRU and the output layer receive gradients.

  Args:
    bert: a BERT-style module whose call returns a tuple with the per-token
      hidden states first, and whose `config.to_dict()` exposes 'hidden_size'.
    hidden_dim: GRU hidden size per direction.
    op_dim: number of output classes.
    n_layers: number of stacked GRU layers.
    bidirectional: whether the GRU runs in both directions.
    dropout: dropout probability, applied between GRU layers (only when
      n_layers >= 2) and on the final hidden state.
  """

  def __init__ (self, bert, hidden_dim, op_dim, n_layers, bidirectional, dropout):
    super().__init__()

    self.bert = bert # instead of creating and training the word embeddings we use the pretrained BERT embeddings
    bert_config = bert.config.to_dict()
    embedding_dim = bert_config['hidden_size']
    # Id of the padding token, used to tell real tokens from padding. Falls
    # back to 0 (the [PAD] id printed for this tokenizer) when the config
    # does not carry one.
    self.pad_idx = bert_config.get('pad_token_id') or 0
    self.rnn = nn.GRU(embedding_dim, hidden_dim,
                      num_layers = n_layers,
                      bidirectional = bidirectional,
                      batch_first = True,
                      dropout  = 0 if n_layers < 2 else dropout)
    self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, op_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Return class logits for a batch of token-id sequences.

    Args:
      text: LongTensor of BERT token ids, [batch, seq_len], right-padded with
        the padding id.

    Returns:
      Tensor of shape [batch, op_dim] with unnormalised class scores. The
      result for an example does not depend on how much padding follows it.
    """
    token_mask = text != self.pad_idx  # True on real tokens, False on padding

    with torch.no_grad():
      # no training to be done in the embedding layer; the mask stops BERT
      # from attending to padding positions.
      # NOTE(review): model.train() also puts self.bert in training mode, so
      # its dropout stays active even though its weights are frozen — confirm
      # this is intended.
      embedded = self.bert(text, attention_mask = token_mask.long())[0]

    # Pack by true length so the GRU stops at each sequence's last real token
    # instead of running on over the padding. Lengths are clamped to >= 1
    # because packing rejects empty sequences, and moved to the CPU as
    # pack_padded_sequence requires.
    lengths = token_mask.sum(dim = 1).clamp(min = 1).cpu()
    packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                               batch_first = True,
                                               enforce_sorted = False)
    _, hidden = self.rnn(packed)  # hidden: [n_layers * n_directions, batch, hidden_dim]

    if self.rnn.bidirectional:
      # Last layer's forward and backward final states, side by side.
      hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
    else:
      hidden = self.dropout(hidden[-1,:,:])

    output = self.out(hidden)

    return output

In [None]:
# Hyperparameters for the GRU head that sits on top of BERT.
hidden_dim = 256                 # GRU hidden size per direction
output_dim = len(label.vocab)    # one logit per sentiment label
n_layers = 2                     # stacked GRU layers (>= 2 enables inter-layer dropout)
bidirectional = True
dropout = 0.25

model = BERTGRUSentiment(bert, hidden_dim, output_dim, n_layers,
                         bidirectional, dropout)


In [None]:
def count_params(model):
  """Count the scalar weights in `model` that are still marked as trainable."""
  sizes = [weights.numel() for weights in model.parameters() if weights.requires_grad]
  return sum(sizes)

print(f'The unfrozen model has {count_params(model):,} trainable parameters')

The unfrozen model has 112,242,435 trainable parameters


In [None]:
# Parameters registered under the `bert` attribute have names starting with
# 'bert'; turning off requires_grad excludes them from gradient computation,
# leaving only the GRU and output layer trainable.
for name, param in model.named_parameters():
  if name.startswith('bert'):
    param.requires_grad = False # freezing all the BERT transformers params

In [None]:
def count_params(model):
  """Return how many parameter elements of `model` have requires_grad set."""
  n_trainable = 0
  for tensor in model.parameters():
    if not tensor.requires_grad:
      continue  # frozen parameters are not counted
    n_trainable += tensor.numel()
  return n_trainable

print(f'The frozen model has {count_params(model):,} trainable parameters')

The frozen model has 2,760,195 trainable parameters


In [None]:
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [None]:
import torch.optim as optim

# Adam with its default hyperparameters. All parameters are handed over,
# including the frozen BERT ones; those never receive gradients (requires_grad
# is False), so only the GRU and output layer are actually updated.
optimizer = optim.Adam(model.parameters())

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calc_acc(preds, y):
  """Return the fraction of rows whose arg-max prediction equals the label.

  Args:
    preds: tensor of per-class scores, one row per example ([batch, n_classes]).
    y: tensor of integer class ids, one per example ([batch]).

  Returns:
    A float tensor of shape (1,) on the same device as the inputs; callers
    read it with `.item()`.
  """
  hits = preds.argmax(dim=1).eq(y)
  # Keep the denominator on the inputs' device: a bare torch.FloatTensor is
  # always a CPU tensor and would mix devices when training on the GPU.
  total = torch.tensor([y.shape[0]], dtype=torch.float, device=hits.device)
  # Explicit float cast so the division never runs on integer operands.
  return hits.sum().float() / total

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  Each batch is read through its `.text` and `.label` attributes. Returns a
  tuple (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.train()

  running_loss, running_acc = 0, 0

  for batch in iterator:
    optimizer.zero_grad()
    logits = model(batch.text).squeeze(1)
    step_loss = criterion(logits, batch.label)
    step_acc = calc_acc(logits, batch.label)
    step_loss.backward()
    optimizer.step()

    running_loss += step_loss.item()
    running_acc += step_acc.item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
  """Score `model` on `iterator` without updating any weights.

  The model is switched to eval mode and gradients are disabled. Returns a
  tuple (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.eval()

  running_loss, running_acc = 0, 0

  with torch.no_grad():
    for batch in iterator:
      logits = model(batch.text).squeeze(1)
      running_loss += criterion(logits, batch.label).item()
      running_acc += calc_acc(logits, batch.label).item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
  """Convert a start/end timestamp pair (seconds) to whole (minutes, seconds)."""
  span = end_time - start_time
  minutes = int(span / 60)
  seconds = int(span - minutes * 60)  # remainder after the whole minutes
  return minutes, seconds

In [None]:
# Train the BERT-GRU model for a fixed number of epochs, checkpointing on
# validation loss.
epochs = 5

best_valid_loss = float('inf')  # lowest validation loss seen so far

for epoch in range(epochs):
  start_time = time.time()
  # One pass over the training set (weights updated), then one over validation.
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights from the epoch with the best validation loss; later
  # epochs that do worse do not overwrite the file.
  # NOTE(review): the checkpoint is not reloaded in this cell, so `model`
  # still holds the last-epoch weights when the loop finishes.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'phase-5.pt')

  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\tVal. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 57s
	Train Loss: 0.694 | Train Acc: 68.98%
	Val. Loss: 0.815 | Val. Acc: 63.12%
Epoch: 02 | Epoch Time: 0m 56s
	Train Loss: 0.634 | Train Acc: 71.91%
	Val. Loss: 0.845 | Val. Acc: 62.55%
Epoch: 03 | Epoch Time: 0m 56s
	Train Loss: 0.588 | Train Acc: 74.46%
	Val. Loss: 0.887 | Val. Acc: 62.34%
Epoch: 04 | Epoch Time: 0m 56s
	Train Loss: 0.526 | Train Acc: 77.79%
	Val. Loss: 1.006 | Val. Acc: 60.69%
Epoch: 05 | Epoch Time: 0m 57s
	Train Loss: 0.502 | Train Acc: 79.58%
	Val. Loss: 0.965 | Val. Acc: 61.95%
