In [3]:
import numpy as np
import os
import torch
import pickle as pkl
import matplotlib.pyplot as plt

In [4]:
data_dir = "./aclImdb/"
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
TRAIN_SIZE = 20000
VALIDATION_SIZE = 5000
TEST_SIZE = 25000

def read_file(file_name):
    with open(file_name, "r") as f:
        content = f.read()
        content = content.lower().replace("<br />", "")
    return content

def load_dataset(dataset_dir, dataset_size, initial=0):
    pos_dir = os.path.join(dataset_dir, "pos")
    neg_dir = os.path.join(dataset_dir, "neg")
    single_label_size = int(dataset_size / 2)
    dataset = []
    target = []
    all_pos = os.listdir(pos_dir)
    all_neg = os.listdir(neg_dir)
    for i in range(initial, initial+single_label_size):
        dataset.append(read_file(os.path.join(pos_dir, all_pos[i])))
        target.append(1)
        dataset.append(read_file(os.path.join(neg_dir, all_neg[i])))
        target.append(0)
    return dataset, target

train_data = load_dataset(train_dir, TRAIN_SIZE)[0]
train_targets = load_dataset(train_dir, TRAIN_SIZE)[1]
validation_data = load_dataset(train_dir, VALIDATION_SIZE, initial=int(TRAIN_SIZE/2))[0]
validation_targets = load_dataset(train_dir, VALIDATION_SIZE, initial=int(TRAIN_SIZE/2))[1]
test_data = load_dataset(test_dir, TEST_SIZE)[0]
test_targets = load_dataset(test_dir, TEST_SIZE)[1]

In [9]:
import string

punctuations = string.punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 
def tokenize(sent):
#     token_list = []
    tokens = word_tokenize(sent)
    for token in tokens:
        token_list = [w for w in tokens if not w in stop_words]
    return [token.lower() for token in token_list if (token not in punctuations)]

import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/JaneYY/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def find_ngrams(input_list, n): #input_list is a list of single tokens
    result = []
    result += list(zip(*[input_list[j:] for j in range(n)]))
    return result

def tokenize_dataset(dataset, n):
    token_dataset = []
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample)
        ngrams = find_ngrams(tokens,n)
        new_tokens = [" ".join(list(i)) for i in ngrams]
        token_dataset.append(new_tokens)
        all_tokens += new_tokens

    return token_dataset, all_tokens

n = 1

print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(train_data,n)
pkl.dump(all_train_tokens, open("stop_all_train_tokens_" + str(n) + ".p", "wb"))
pkl.dump(train_data_tokens, open("stop_train_data_tokens_" + str(n) + ".p", "wb"))

print ("Tokenizing validation data")
validation_data_tokens, _ = tokenize_dataset(validation_data,n)
pkl.dump(validation_data_tokens, open("stop_validation_data_tokens_" + str(n) + ".p", "wb"))

print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_data,n)
pkl.dump(test_data_tokens, open("stop_test_data_tokens_" + str(n) + ".p", "wb"))

Tokenizing train data
Tokenizing validation data
Tokenizing test data


In [18]:
BATCH_SIZE = 32
emb_dim = 100 # dimension for n-gram embedding
learning_rate = 0.01
num_epochs = 3 # number of epochs to train
max_vocab_size = 20000

In [12]:
from collections import Counter
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens)

In [13]:
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_data_indices = token2index_dataset(train_data_tokens)
validation_data_indices = token2index_dataset(validation_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Validation dataset size is {}".format(len(validation_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Validation dataset size is 5000
Test dataset size is 25000


In [14]:
import numpy as np
import torch
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of IMDB tokens 
        @param target_list: list of IMDB targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

    
MAX_SENTENCE_LENGTH = 250   
    
def IMDB_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

train_dataset = IMDBDataset(train_data_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMDB_collate_func,
                                           shuffle=True)

val_dataset = IMDBDataset(validation_data_indices, validation_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMDB_collate_func,
                                           shuffle=True)

test_dataset = IMDBDataset(test_data_indices, test_targets)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMDB_collate_func,
                                           shuffle=False)

In [115]:
import torch.nn as nn
import torch.nn.functional as F

class BagOfNGram(nn.Module):
    """
    BagOfNGram classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNGram, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

model = BagOfNGram(len(id2token), emb_dim)

In [116]:
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def early_stop(val_acc_history, t=2, required_progress=0.005):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    # TODO: add your code here
    if len(val_acc_history) >=t+1:
        if val_acc_history[-1] - val_acc_history[-1-t] <= required_progress:
            return True


validation_acc_history = []
stop_training = False

for epoch in range(num_epochs):
    scheduler.step()
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            validation_acc_history.append(val_acc)
            stop_training = early_stop(validation_acc_history)
            if stop_training:
                print("early stop triggered")
                break
    if stop_training:
        break

Epoch: [1/10], Step: [101/625], Validation Acc: 84.04
Epoch: [1/10], Step: [201/625], Validation Acc: 86.48
Epoch: [1/10], Step: [301/625], Validation Acc: 86.78
Epoch: [1/10], Step: [401/625], Validation Acc: 87.66
Epoch: [1/10], Step: [501/625], Validation Acc: 88.42
Epoch: [1/10], Step: [601/625], Validation Acc: 88.68
Epoch: [2/10], Step: [101/625], Validation Acc: 88.78
Epoch: [2/10], Step: [201/625], Validation Acc: 88.88
Epoch: [2/10], Step: [301/625], Validation Acc: 89.14
Epoch: [2/10], Step: [401/625], Validation Acc: 89.04
Epoch: [2/10], Step: [501/625], Validation Acc: 89.06
early stop triggered


In [118]:
print ("Test Acc {}".format(test_model(test_loader, model)))

Test Acc 87.624


In [122]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    i = 0
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        print('predicted:{}'.format(predicted))
        print('label:{}'.format(labels.view_as(predicted)))
        i +=1
        if i >=3:
            break
    return (100 * correct / total)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMDB_collate_func,
                                           shuffle=False)

val_acc = test_model(val_loader, model)

predicted:tensor([[ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 0],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0]])
label:tensor([[ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
        [ 0]])
predicted:tensor([[ 1],
        [ 0],
        [ 1],
        [ 0],
        [ 1],
      

In [131]:
print('Example 1: ' + validation_data[0])
print('Example 2: ' + validation_data[1])
print('Example 3: ' + validation_data[2])

Example 1: there are enough sad stories about women and their oppression by religious, political and societal means. not to diminish the films and stories about genital mutilation and reproductive rights, as well as wage inequality, and marginalization in society, all in the name of allah or god or some other ridiculous justification, but sometimes it is helpful to just take another approach and shed some light on the subject.the setting is the 2006 match between iran and bahrain to qualify for the world cup. passions are high and several women try to disguise themselves as men to get into the match.the women who were caught (played by sima mobarak-shahi, shayesteh irani, ayda sadeqi, golnaz farmani, and mahnaz zabihi) and detained for prosecution provided a funny and illuminating glimpse into the customs of this country and, most likely, all muslim countries. their interaction with the iranian soldiers who were guarding and transporting them, both city and villagers, and the father wh

In [132]:
print('Example 1: ' + validation_data[8])
print('Example 2: ' + validation_data[51])
print('Example 3: ' + validation_data[75])

Example 1: twisted desire (1996) was a tv movie starring melissa joan hart. melissa's character, jennifer stanton, a seventeen-year-old seduces her current boyfriend nick ryan into murdering her two parents. the movie is based on the 1990 murders of the parents of 14 year old jessica wiseman. jessica had her 17 year old boyfriend douglas christopher thomas shoot and kill her parents! thomas was executed in 2000! jessica was released from prison when she turned 21 years old. evidence now suggests that it was jessica who fired the fatal shot that killed her mother. jessica is known to now be residing somewhere in the state of virginia.
Example 2: this movie's one of my favorites. it's not really any good, but it's great to laugh at. the dialogue can become incredibly ludicrous and poorly acted (eg, "manji, can we ask you a few questions?" "sure." "we think you can help us with the answers.") any fighting is more or less surrealistic. make sure to watch for brock, the oafy white guy who a