In [16]:
import numpy as np # linear algebra
import pandas as pd
import torch
from torch import nn,optim
import torchtext
from torchtext.legacy import data

In [2]:
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print("Device =",device)
input_data_path = '/content/data'

Device = cuda


In [3]:
df = pd.read_csv(input_data_path +'/Train.csv')
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [6]:
#Define a custom tokenizer
my_tokenizer = lambda x:str(x).split()
#Define fields for our input dataset
TEXT = data.Field(sequential=True, lower= True,tokenize = my_tokenizer,use_vocab=True)
LABEL = data.Field(sequential = False,use_vocab = False)
#Define inut fields as a list of tuples of fields
trainval_fields = [("text",TEXT),("label",LABEL)]
#Contruct dataset
train_data, val_data = data.TabularDataset.splits(path = input_data_path, train = "Train.csv", validation = "Valid.csv", 
                                                  format = "csv", skip_header = True, fields = trainval_fields)
#Build vocabulary
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
#Define iterators for train and validation
train_iterator = data.BucketIterator(train_data,
                                     device = device,
                                     batch_size = 32,
                                     sort_key = lambda x:len(x.text),
                                     sort_within_batch = False,
                                     repeat = False)
val_iterator = data.BucketIterator(val_data,
                                   device = device,
                                   batch_size= 32,
                                   sort_key = lambda x:len(x.text),
                                   sort_within_batch = False,
                                   repeat = False)
print(TEXT.vocab.freqs.most_common()[:10])

[('the', 511112), ('a', 253702), ('and', 251397), ('of', 229381), ('to', 211883), ('is', 164005), ('in', 143530), ('i', 113576), ('this', 110892), ('that', 104153)]


In [8]:
class RNNModel(nn.Module):
  def __init__(self, embedding_dim, input_dim, hidden_dim, output_dim):
    super().__init__()
    self.Embedding = nn.Embedding(input_dim,embedding_dim)
    self.rnn = nn.RNN(embedding_dim,hidden_dim)
    self.fc = nn.Linear(hidden_dim,output_dim)


  def forward(self,x):
    x = self.Embedding(x)
    output, hidden = self.rnn(x)
    out = self.fc(hidden.squeeze(0))
    return(out)


In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
#Create model instance
model = RNNModel(EMBEDDING_DIM, INPUT_DIM,HIDDEN_DIM,OUTPUT_DIM)

In [11]:

#Define training step
def train(model, data_iterator,optimizer,loss_function):
    epoch_loss,epoch_acc,epoch_denom = 0,0,0

    model.train()    #Explicitly set model to train mode

    for i, batch in enumerate(data_iterator):

        optimizer.zero_grad()
        predictions = model(batch.text)

        loss = loss_function(predictions.reshape(-1,1), batch.label.float().reshape(-1,1))
        acc = accuracy(predictions.reshape(-1,1), batch.label.reshape(-1,1))

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_denom += len(batch)

    return epoch_loss/epoch_denom,epoch_acc, epoch_denom

#Define evaluation step
def evaluate(model, data_iterator,loss_function):
    epoch_loss,epoch_acc,epoch_denom = 0,0,0
    
    model.eval()     #Explcitly set model to eval mode

    for i, batch in enumerate(data_iterator):
        with torch.no_grad():
            predictions = model(batch.text)

            loss = loss_function(predictions.reshape(-1,1), batch.label.float().reshape(-1,1))
            acc = accuracy(predictions.reshape(-1,1), batch.label.reshape(-1,1))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_denom += len(batch)
            
    return epoch_loss/epoch_denom, epoch_acc, epoch_denom

In [12]:
#Compute binary accuracy
def accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))

    #Count the number of correctly predicted outcomes	
    correct = (rounded_preds == y).float()
    acc = correct.sum()

    return acc

#Define optimizer, loss function 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

#Transfer components to GPU, if avaiable.
model = model.to(device)
criterion = criterion.to(device)

In [14]:
n_epochs = 5

for epoch in range(n_epochs):
    #Train and evaluate     
    train_loss, train_acc,train_num = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc,val_num = evaluate(model, val_iterator,criterion)

    print("Epoch-",epoch)    

    print(f'\tTrain  Loss: {train_loss: .3f} | Train Predicted Correct : {train_acc}| Train Denom: {train_num} | PercAccuracy: {train_acc/train_num}')
    print(f'\tValid  Loss: {valid_loss: .3f} | Valid Predicted Correct: {valid_acc}| Val Denom: {val_num}| PercAccuracy: {train_acc/train_num}')

Epoch- 0
	Train  Loss:  0.022 | Train Predicted Correct : 19851.0| Train Denom: 40000 | PercAccuracy: 0.496275
	Valid  Loss:  0.022 | Valid Predicted Correct: 2496.0| Val Denom: 5000| PercAccuracy: 0.496275
Epoch- 1
	Train  Loss:  0.022 | Train Predicted Correct : 20055.0| Train Denom: 40000 | PercAccuracy: 0.501375
	Valid  Loss:  0.022 | Valid Predicted Correct: 2542.0| Val Denom: 5000| PercAccuracy: 0.501375
Epoch- 2
	Train  Loss:  0.022 | Train Predicted Correct : 19938.0| Train Denom: 40000 | PercAccuracy: 0.49845
	Valid  Loss:  0.022 | Valid Predicted Correct: 2513.0| Val Denom: 5000| PercAccuracy: 0.49845
Epoch- 3
	Train  Loss:  0.022 | Train Predicted Correct : 20098.0| Train Denom: 40000 | PercAccuracy: 0.50245
	Valid  Loss:  0.022 | Valid Predicted Correct: 2461.0| Val Denom: 5000| PercAccuracy: 0.50245
Epoch- 4
	Train  Loss:  0.022 | Train Predicted Correct : 20078.0| Train Denom: 40000 | PercAccuracy: 0.50195
	Valid  Loss:  0.022 | Valid Predicted Correct: 2512.0| Val Denom:

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch,torchtext
from torch import nn, optim
from torch.optim import Adam


if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device =",device)

input_data_path = " /input/imdb-dataset-sentiment-analysis-in-csv-format/"

#Define fields for our input dataset
TEXT = data.Field(sequential=True, lower= True,tokenize = 'spacy', include_lengths = True)
LABEL  = data.Field(sequential = False,use_vocab = False)


#Define a list of tuples of fields
trainval_fields = [("text",TEXT),("label",LABEL)]

#Contruct dataset
train_data, val_data = data.TabularDataset.splits(path = input_data_path
, train = "Train.csv", validation = "Valid.csv", format = "csv"
, skip_header = True, fields = trainval_fields)

#Build Vocab using pretrained
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE,   vectors = 'fasttext.simple.300d')
BATCH_SIZE = 64

train_iterator, val_iterator =  data.BucketIterator.splits(
                                                            (train_data, val_data), 
                                                            batch_size = BATCH_SIZE,
                                                            sort_key  = lambda x:len(x.text),
                                                            sort_within_batch = True,
                                                            device = device)

In [None]:
class ImprovedRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()     
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
                
        embedded = self.dropout(self.embedding(text))
                
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))

        return self.fc(hidden)

In [None]:
#Define model input parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

#Create model instance
model = ImprovedRNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

#Copy pretrained vector weights
model.embedding.weight.data.copy_(pretrained_embeddings)

#Initialize the embedding with 0 for pad as well as unknown tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

In [None]:
#Define train step
def train(model, iterator, optimizer, criterion):
    
    epoch_loss,epoch_acc,epoch_denom = 0,0,0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()        
        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)        
        loss = criterion(predictions.reshape(-1,1), batch.label.float().reshape(-1,1))        
        acc = accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_denom += len(batch)
        
    return epoch_loss/epoch_denom, epoch_acc, epoch_denom

#Define evaluate step
def evaluate(model, iterator, criterion):
    
    epoch_loss,epoch_acc,epoch_denom = 0,0,0    
    model.eval()
    
    with torch.no_grad():    
        for batch in iterator:
            text, text_lengths = batch.text            
            predictions = model(text, text_lengths).squeeze(1)            
            loss = criterion(predictions, batch.label.float())         
            acc = accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_denom += len(batch)           
        
    return epoch_loss/epoch_denom, epoch_acc, epoch_denom

#Define optimizer, loss funciton and load to GPU
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

#similar to previous exercise, we deifne our accuracy function 
def accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))

    correct = (rounded_preds == y).float()
    acc = correct.sum()

    return acc

#Finally lets train our model for 5 epochs
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc,train_num = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc,val_num = evaluate(model, val_iterator, criterion)
    print("Epoch-",epoch)
    print(f'\tTrain  Loss: {train_loss: .3f} | Train Predicted Correct : {train_acc} | Train Denom: {train_num} | PercAccuracy: {train_acc/train_num}')
    print(f'\tValid  Loss: {valid_loss: .3f} | Valid Predicted Correct: {valid_acc} | Val Denom: {val_num}| PercAccuracy: {train_acc/train_num}')