In [1]:
import nltk
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Imports - our files
import utils
import models

# Global definitions - data
DATA_FN = 'data/crowdflower_data.csv'
LABEL_NAMES = ["happiness", "worry", "neutral", "sadness"]

# Global definitions - architecture
EMBEDDING_DIM = 100  # We will use pretrained 100-dimensional GloVe
BATCH_SIZE = 128
NUM_CLASSES = 4
USE_CUDA = torch.cuda.is_available()  # CUDA will be available if you are using the GPU image for this homework

# Global definitions - saving and loading data
FRESH_START = False  # set this to false after running once with True to just load your preprocessed data from file
#                     (good for debugging)
TEMP_FILE = "temporary_data.pkl"  # if you set FRESH_START to false, the program will look here for your data, etc.

In [2]:
# load the data and embeddings from file
try:
    with open(TEMP_FILE, "rb") as f:
        print("Loading DataLoaders and embeddings from file....")
        train_generator, dev_generator, test_generator, embeddings, train_data = pickle.load(f)
except FileNotFoundError:
    raise FileNotFoundError("You need to have saved your data with FRESH_START=True once in order to load it!")

Loading DataLoaders and embeddings from file....


In [44]:
class RNNModel(nn.Module):
    def __init__(self, sentence_len, output_dim,hidden_dim, weight):
        super(RNNModel, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        self.sentence_len = sentence_len
        self.embed_dim = weight.size(1)       
        # Define embeddings
        self.embedding = nn.Embedding.from_pretrained(weight)
        # RNN
        self.rnn = nn.RNN(self.embed_dim,hidden_dim,num_layers=2,batch_first=True)     
        #self.drop_layer = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def get_len(self, x):
        x_len = []
        for ix in x:
            if (ix==0).nonzero()[0]:
                x_len.append((ix==0).nonzero()[0])
            else:
                x_len.append(len(ix))
    
    def forward(self, x):
        batch_size = x.size(0)
        x_lengths = self.get_len(x)
        x = self.embedding(x).float()
        out, _ = self.rnn(x)
        #print(np.shape(out))
        selected = torch.zeros(batch_size, self.hidden_dim, dtype=torch.float)
        for i, l in enumerate(x_lengths):
            selected[i,:] = out[i,l,:]
        #print(np.shape(selected))
        return self.fc(selected)

In [45]:
HIDDEN_DIM = 64
SENTENCE_LEN = 91
model3 = RNNModel(SENTENCE_LEN, NUM_CLASSES, HIDDEN_DIM, embeddings)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters())
  
EPOCHS = 20
losses = []
for iepoch in range(EPOCHS): 
    i = 0
    for train_batch, train_label in train_generator:
#         if i>10:
#             break
#         i += 1
        # Compute and print loss
        loss = criterion(model3(train_batch),train_label)
        #print(loss.item()) 

        # Zero the gradients
        model3.zero_grad()

        # perform a backward pass (backpropagation)
        loss.backward()

        # Update the parameters
        optimizer.step()
    print(loss.item()) 
#     total_loss = 0
#     for ibatch, ilabel in dev_generator:
#         dev_loss = criterion(model3(ibatch), ilabel)
#         total_loss += dev_loss
#     print(iepoch, total_loss)
#     losses.append(total_loss)

TypeError: 'NoneType' object is not iterable

In [13]:
np.shape(embeddings)

torch.Size([17635, 100])