In [1]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize 

In [3]:
from collections import Counter
from collections import defaultdict

Create a bag of word model for a text classification problem. Note that this is not the same as the continous bag of word problem that we solved here but you can reuse the tokenization part.

###  Download data

In [8]:
def get_data():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [9]:
get_data()
! ls data

--2020-05-20 21:55:05--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz’


2020-05-20 21:55:05 (2.29 MB/s) - ‘rotten_imdb.tar.gz’ saved [519599/519599]

x quote.tok.gt9.5000
x plot.tok.gt9.5000
x subjdata.README.1.0
plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


### Split data

In [10]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
sub_content = read_file("data/quote.tok.gt9.5000")
obj_content = read_file("data/plot.tok.gt9.5000")
sub_content = np.array([line.strip().lower() for line in sub_content])
obj_content = np.array([line.strip().lower() for line in obj_content])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [13]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
x_train.shape, y_train.shape

((8000,), (8000,))

In [15]:
x_train[0]

"both lead performances are oscar-size . quaid is utterly fearless as the tortured husband living a painful lie , and moore wonderfully underplays the long-suffering heroine with an unflappable '50s dignity somewhere between jane wyman and june cleaver ."

### Compute a vocabulary
* Split your sentences in tokens by spliting on spaces.
* Compute the frequency of every word.
* Pick top frequency words (4000 or so) to be part of your vocabulary.
* Create a map from each word to an index. Keep 0 for out of the vocabulary workds (<UNK>).

In [16]:
x_train_token = [s.strip().lower().split(" ")for s in x_train]
x_val_token = [s.strip().lower().split(" ")for s in x_val]

In [17]:
total_token = []
for s in x_train_token:
    total_token += s

In [18]:
c = Counter(total_token)

In [19]:
most_frequent_4000 = list(list(zip(*c.most_common(4000)))[0])

In [20]:
vocab2index = dict(zip(['UNK']+most_frequent_4000,range(4001))) 

In [21]:
words = dict(zip(range(4001),zip(['UNK']+most_frequent_4000)))

### Bag of word representation

* Given a piece of text compute the following features $x$.
$x_i = 1$ if word with index $i$ appears in the text. Otherwise $x_i = 0$. Note that length $x$ is the size of the vocabulary.  

In [22]:
def bow_representation(sentence):
    idx = np.zeros(4001)
    for w in sentence:
        try:
            tmp = vocab2index[w]
            idx[tmp]=1
        except KeyError:
            continue 
    return idx

In [23]:
idx_train = [np.array(bow_representation(s)) for s in x_train_token]
idx_val = [np.array(bow_representation(s)) for s in x_val_token]

###  Dataset and dataloaders
Write a dataset for this problem

In [24]:
class BOW(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.x[idx],self.y[idx]

In [25]:
train = BOW(idx_train,y_train)
val = BOW(idx_val,y_val)

In [26]:
train_dl = DataLoader(train,batch_size=400,shuffle=True)
valid_dl = DataLoader(val,batch_size=400,shuffle=True)

In [27]:
x,y=next(iter(train_dl))

In [39]:
x,y

(tensor([[0., 1., 1.,  ..., 0., 0., 0.],
         [0., 1., 1.,  ..., 0., 0., 0.],
         [0., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 1., 1.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 1.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
         1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1.,
         0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0.,
         1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0.,
         1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0.,
         0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
         0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
         1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0.,
         1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 

### Model

Define a two layer neural network.

In [28]:
class BOWModel(nn.Module):
    def __init__(self,vocab_size):
        super(BOWModel,self).__init__()
        self.linear1 = nn.Linear(vocab_size,40)
        self.linear2 = nn.Linear(40,1)
    def forward(self,x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x

## Training and valid functions

In [29]:
def val_metrics(model):
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0
    for x, y in valid_dl:
        y_hat = model(x.float())
        loss = F.binary_cross_entropy_with_logits(y_hat, y.unsqueeze(1).float())
        y_pred = y_hat > 0
        correct += (y_pred.float() == y.unsqueeze(1).float()).float().sum()
        total += x.size(0)
        loss_sum += loss.item()*x.size(0)
    accuracy = correct.item()/total
    return loss_sum/total, accuracy

###  Training loop

In [30]:
def train_epocs(model, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        loss_sum = 0
        for x, y in train_dl:
            y_hat = model(x.float())
            loss = F.binary_cross_entropy_with_logits(y_hat, y.unsqueeze(1).float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += x.size(0)
            loss_sum += loss.item()*x.size(0)
        val_loss, val_acc = val_metrics(model)
        print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss_sum/total, val_loss, val_acc))

In [31]:
model = BOWModel(vocab_size=len(vocab2index.keys()))
train_epocs(model, 10, 0.0009)

train loss 0.659 val loss 0.610 and accuracy 0.891
train loss 0.547 val loss 0.498 and accuracy 0.899
train loss 0.429 val loss 0.399 and accuracy 0.898
train loss 0.335 val loss 0.331 and accuracy 0.902
train loss 0.272 val loss 0.290 and accuracy 0.904
train loss 0.230 val loss 0.264 and accuracy 0.903
train loss 0.200 val loss 0.248 and accuracy 0.905
train loss 0.178 val loss 0.238 and accuracy 0.908
train loss 0.160 val loss 0.231 and accuracy 0.911
train loss 0.145 val loss 0.227 and accuracy 0.908


### Word importance
To get the words that affect the most the positive label we find the words with higher weights. Similarly to get the words that affect the most the 0 label we find the words with lower weights.

In [32]:
parms = [p for p in model.parameters()]
parms

[Parameter containing:
 tensor([[ 0.0129,  0.0951,  0.0503,  ..., -0.0044, -0.0179, -0.0038],
         [-0.0029,  0.0752,  0.0356,  ..., -0.0339, -0.0180, -0.0075],
         [ 0.0031,  0.0676,  0.0920,  ...,  0.0369,  0.0267,  0.0283],
         ...,
         [-0.0057,  0.0660,  0.0389,  ..., -0.0053, -0.0258, -0.0053],
         [-0.0153,  0.0677,  0.0809,  ...,  0.0186,  0.0128,  0.0175],
         [ 0.0109,  0.0935,  0.0578,  ..., -0.0321, -0.0090, -0.0102]],
        requires_grad=True),
 Parameter containing:
 tensor([0.0883, 0.0754, 0.0776, 0.0916, 0.0894, 0.0784, 0.0749, 0.1157, 0.0759,
         0.0874, 0.0791, 0.0885, 0.0761, 0.1153, 0.0837, 0.0785, 0.0945, 0.0624,
         0.0935, 0.0750, 0.1017, 0.0824, 0.1026, 0.0797, 0.0641, 0.1132, 0.0816,
         0.0993, 0.0606, 0.0807, 0.0874, 0.1056, 0.0762, 0.0950, 0.0803, 0.0737,
         0.0569, 0.0703, 0.0736, 0.0964], requires_grad=True),
 Parameter containing:
 tensor([[-0.1883, -0.2634,  0.2803, -0.1743,  0.3046,  0.1939, -0.2491,  

In [33]:
weights = parms[0].detach().numpy()
weights

array([[ 0.01290899,  0.09514178,  0.05026132, ..., -0.00436886,
        -0.01786191, -0.00381802],
       [-0.00292029,  0.07519574,  0.03555987, ..., -0.03394768,
        -0.01798485, -0.00745508],
       [ 0.00309361,  0.06760807,  0.09202307, ...,  0.03694866,
         0.02668696,  0.02834515],
       ...,
       [-0.00571659,  0.06595954,  0.03887386, ..., -0.00534975,
        -0.02584901, -0.00534588],
       [-0.01533698,  0.06765414,  0.08089469, ...,  0.01859844,
         0.01279914,  0.01754384],
       [ 0.01094467,  0.09346816,  0.05777293, ..., -0.03209402,
        -0.00898605, -0.01020165]], dtype=float32)

In [34]:
weights[0].shape

(4001,)

In [35]:
sorted_indeces = np.argsort(weights[0])

In [36]:
weights[0, sorted_indeces[0]], weights[0, sorted_indeces[-1]],

(-0.1477776, 0.148554)

In [37]:
[words[i] for i in sorted_indeces[:10]]

[('they',),
 ('he',),
 ('discovers',),
 ('friends',),
 ('-',),
 ('kill',),
 ('follows',),
 ('her',),
 ('she',),
 ('struggles',)]

In [38]:
[words[i] for i in sorted_indeces[3990:]]

[('screen',),
 ('here',),
 ('actors',),
 ('good',),
 ('me',),
 ('likely',),
 ('performance',),
 ('interesting',),
 ('my',),
 ('--',),
 ('material',)]