# Training CBOW

In [3]:
# import libriries
import os 
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import ConcatDataset
import torch.optim as optim
import torch.nn.functional as F
from torchtext.legacy import data, datasets
from torchtext.data.utils import get_tokenizer
import random
from torchtext.vocab import GloVe

In [4]:
SEED = 12345
torch.manual_seed(SEED)
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

In [5]:
# download imdb daata
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [6]:
# download 6 Billion GloVe data
glove_embedding = GloVe(name='6B', dim=100)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 398683/400000 [00:14<00:00, 27895.11it/s]

In [7]:
# aggregate all imdb train & test data to make cbow data
all_data = []
all_label = []
for data in train_data:
    all_data.append(data.text)
    all_label.append(data.label)
for data in test_data:
    all_data.append(data.text)
    all_label.append(data.label)  

In [8]:
# make my vocab for words (to reduce computation cost 3000 is enough)
MAX_VOCAB_SIZE = 3000
TEXT.build_vocab(all_data, max_size = MAX_VOCAB_SIZE)

In [9]:
# slide over the text and make data for cbow (window = 5)
def make_train_data_for_cbow(train_data,window=5):
    # put around words in x_data_cbow & context word in y_data_cbow 
    x_data_cbow = []
    y_data_cbow = []
    # do it for train data
    for data in train_data:
        text_len = len(data)
        text = data
        for i in range(int(window // 2),text_len - int(window // 2)):
            x_data_cbow.append([text[i - 2],text[i - 1],text[i + 1],text[i + 2]])
            y_data_cbow.append(text[i])
            
    return (x_data_cbow,y_data_cbow)

In [10]:
x_train, y_train = make_train_data_for_cbow(all_data)

100%|█████████▉| 398683/400000 [00:30<00:00, 27895.11it/s]

In [11]:
# my cbow architecture
class CBOW(nn.Module):

  def __init__(self,vocab_size,embedding_dim):
      super(CBOW,self).__init__()
      # defining layers
      self.embedding = nn.Embedding(vocab_size,embedding_dim)
      self.linear = nn.Linear(embedding_dim,vocab_size)
      self.log_softmax = nn.LogSoftmax(dim = -1)
  
  def forward(self,input):
      x = torch.sum(self.embedding(input),dim=0).view(1,-1)
      x = self.linear(x)
      out = self.log_softmax(x)
      return out

  def word_embedding(self,word):
      return self.embedding(torch.tensor([TEXT.vocab.stoi[word]]).to(device))

In [12]:
# here i reduce the data by factor of step to reduce computation cost
step = 20
x_train = x_train[::step]
y_train = y_train[::step]

### train my CBOW network (embedding dim = 100) in 5 epochs with SGD optimizer (lr = 0.005) and NLLLoss, total loss i printed every chunk i iterated (each chunk contains 10000 train data by i did not used batches)

In [13]:
epochs = 5
device = 'cuda:0'
data_len = np.array(y_train).shape[0]
vocab_size = MAX_VOCAB_SIZE + 2
embedding_dim = 100
chunk_size = 10000
chunk_num = int(data_len // chunk_size)
cbow = CBOW(vocab_size,embedding_dim).to(device)
criterion = nn.NLLLoss()
optimizer = optim.SGD(cbow.parameters(),lr=5e-3)

for epoch in range(epochs):
    print(f'################################# Epoch {epoch + 1} #################################')
    epoch_loss = 0
    chunk_loss = 0
    chunk_id = 1

    for i in range(data_len):
        optimizer.zero_grad()
        data_x = []
        for x in x_train[i]:
            data_x.append(TEXT.vocab.stoi[x])
        data_y = [TEXT.vocab.stoi[y_train[i]]]

        # convert data to tensor and run network
        data_x = torch.Tensor(data_x).to(torch.int64).to(device)
        data_y = torch.tensor(data_y).to(torch.int64).to(device)
        output = cbow(data_x)
        loss = criterion(output,data_y)
        # add current loss to chunk loss
        chunk_loss += loss.item()
        # run optimizer
        loss.backward()
        optimizer.step()
        # add loss to epoch loss
        epoch_loss += loss.item()
        if (i + 1) % chunk_size == 0:
            print(f'chunk [{chunk_id}/{chunk_num}] loss : {np.around(chunk_loss,decimals=2)}')
            chunk_loss = 0
            chunk_id += 1

################################# Epoch 1 #################################
chunk [1/66] loss : 67161.17
chunk [2/66] loss : 61526.7
chunk [3/66] loss : 59582.13
chunk [4/66] loss : 58613.71
chunk [5/66] loss : 57619.3
chunk [6/66] loss : 57072.63
chunk [7/66] loss : 56702.73
chunk [8/66] loss : 56087.37
chunk [9/66] loss : 55566.68
chunk [10/66] loss : 54957.07
chunk [11/66] loss : 54510.34
chunk [12/66] loss : 54760.64
chunk [13/66] loss : 53702.69
chunk [14/66] loss : 53916.78
chunk [15/66] loss : 53780.51
chunk [16/66] loss : 53665.43
chunk [17/66] loss : 53949.91
chunk [18/66] loss : 54340.25
chunk [19/66] loss : 54151.17
chunk [20/66] loss : 54104.34
chunk [21/66] loss : 53949.28
chunk [22/66] loss : 53081.69
chunk [23/66] loss : 53656.98
chunk [24/66] loss : 53889.75
chunk [25/66] loss : 52843.44
chunk [26/66] loss : 52910.4
chunk [27/66] loss : 52897.23
chunk [28/66] loss : 53158.04
chunk [29/66] loss : 52807.36
chunk [30/66] loss : 53374.98
chunk [31/66] loss : 52380.19
chunk 

# Problem 5 : Part 1

### Here i preprocess the imdb data
#### 1.crop text with bigger length than 50
#### 2. pad text with length less than 50


In [15]:
threshold = 50
data_x = []
data_y = all_label

# pad all data to extent which that they all be the same size (these data are for imdb classifier)
for data in all_data:
    if len(data) < threshold:
        temp = []
        for word in data:
            temp.append(word)
        for j in range(threshold - len(data)):
            temp.append('<pad>')
        data_x.append(temp)
    else:
        data_x.append(data[:threshold])

### here i shuffle data and also divide data to train & validation & test 

In [17]:
train_data_percent = 0.7
val_data_percent = .1
test_data_percent = .2
data_len = len(data_y)

# shuffle data to random order
temp_data = list(zip(data_x, data_y))
random.shuffle(temp_data)
data_x, data_y = zip(*temp_data)

# make x data
train_data_x = data_x[:int(train_data_percent * len(data_x))]
val_data_x = data_x[int(train_data_percent * len(data_x)):int((train_data_percent + val_data_percent) * len(data_x))]
test_data_x = data_x[int((train_data_percent + val_data_percent) * len(data_x)):]
# make y data
train_data_y = data_y[:int(train_data_percent * len(data_x))]
val_data_y = data_y[int(train_data_percent * len(data_x)):int((train_data_percent + val_data_percent) * len(data_x))]
test_data_y = data_y[int((train_data_percent + val_data_percent) * len(data_x)):]

### here i define a get_batch which every time i need a batch i'll pass batch & batch size & network phase(train or validation or test) to it and it returns the batch of data in numbers (it is used for the first part of question)

In [25]:
def get_batch(batch,batch_size,data_type='train'):
    if data_type == 'train':
        word_matrix = train_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = train_data_y[batch * batch_size:(batch + 1) * batch_size]
    elif data_type == 'val':
        word_matrix = val_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = val_data_y[batch * batch_size:(batch + 1) * batch_size]
    elif data_type == 'test':
        word_matrix = test_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = test_data_y[batch * batch_size:(batch + 1) * batch_size]



    number_matrix = np.zeros((batch_size,threshold,100))
    target_matrix = np.zeros((batch_size,1))

    for i in range(batch_size):
        if label_matrix[i] == 'pos':
            target_matrix[i] = 1
        else:
            target_matrix[i] = 0

        for j in range(threshold):
            temp = cbow.word_embedding(word_matrix[i][j])
            number_matrix[i,j] = temp.cpu().detach().numpy()


    return (number_matrix,target_matrix)

### heres my network for the first part of question (i use word embeddings of last part vectors)

In [26]:
# here is my imdb classifier using my own vectors in the last problem (Problem 4)
class IMDB_Classifier_normal(nn.Module):

  def __init__(self,input_dim,hidden_dim,output_dim,num_layers,device):
      super(IMDB_Classifier_normal,self).__init__()
      # defining layers
      self.device = device
      self.num_layers = num_layers
      self.hidden_dim = hidden_dim
      self.lstm = nn.LSTM(input_dim,hidden_dim,num_layers,batch_first=True)
      self.linear = nn.Linear(hidden_dim,output_dim)
  
  def forward(self,input):
      h0 = torch.zeros(self.num_layers,input.shape[0],self.hidden_dim).to(self.device)
      c0 = torch.zeros(self.num_layers,input.shape[0],self.hidden_dim).to(self.device)
      lstm_out, _ = self.lstm(input,(h0,c0))
      out = self.linear(lstm_out[:,-1,:])
      
      return out

### here is my first part training i trained 10 epochs on cuda and i wrote every batch loss and at the end of each epoch accuracy on validation data is measured

In [31]:
input_dim = 100
hidden_dim = 50
output_dim = 1
num_layers = 2
device = 'cuda:0'
imdb_classifier = IMDB_Classifier_normal(input_dim,hidden_dim,output_dim,num_layers,device).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(imdb_classifier.parameters(),lr=1e-2,weight_decay=0)
epochs = 10
batch_size = 64
batch_num = int(len(train_data_x) // batch_size)

for epoch in range(epochs):
      print(f'########################### EPOCH {epoch + 1} ###########################')
      epoch_loss = 0
      for i in range(batch_num):
          optimizer.zero_grad()
          batch_x, batch_y = get_batch(i,batch_size,'train')
          # resize batches and bring them to gpu
          batch_x = torch.tensor(batch_x).float().view(batch_size,threshold,input_dim).to(device)
          batch_y = torch.tensor(batch_y).float().view(batch_size,1).to(device)

          output = imdb_classifier(batch_x)
          loss = criterion(output,batch_y).to(device)
          epoch_loss += loss.item()
          # optimizer update
          loss.backward()
          optimizer.step()
          print(f'Batch {i + 1}/{batch_num} | Loss {np.around(loss.item(),decimals=5)}')

      # apply validation
      val_batch_num = int(len(val_data_y) // batch_size)
      with torch.no_grad():
          accuracy = 0
          for i in range(val_batch_num):
              batch_x, batch_y = get_batch(i,batch_size,'val')
              # resize batches and bring them to gpu
              batch_x = torch.tensor(batch_x).float().view(batch_size,threshold,input_dim).to(device)
              batch_y = torch.tensor(batch_y).int().view(batch_size,1).to(device)
              output = torch.sigmoid(imdb_classifier(batch_x))
              output = torch.round(output).int()
              accuracy += (output == batch_y).sum()
          accuracy = accuracy / (batch_size * val_batch_num)
      print(f'Accuracy On Validation : {np.around(accuracy.cpu() * 100,decimals=2)} %')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch 480/546 | Loss 0.69602
Batch 481/546 | Loss 0.64391
Batch 482/546 | Loss 0.63295
Batch 483/546 | Loss 0.689
Batch 484/546 | Loss 0.67261
Batch 485/546 | Loss 0.6267
Batch 486/546 | Loss 0.65155
Batch 487/546 | Loss 0.67659
Batch 488/546 | Loss 0.68027
Batch 489/546 | Loss 0.61735
Batch 490/546 | Loss 0.6496
Batch 491/546 | Loss 0.65599
Batch 492/546 | Loss 0.63703
Batch 493/546 | Loss 0.63838
Batch 494/546 | Loss 0.66482
Batch 495/546 | Loss 0.62697
Batch 496/546 | Loss 0.63013
Batch 497/546 | Loss 0.67219
Batch 498/546 | Loss 0.6084
Batch 499/546 | Loss 0.63298
Batch 500/546 | Loss 0.66527
Batch 501/546 | Loss 0.57848
Batch 502/546 | Loss 0.67791
Batch 503/546 | Loss 0.61499
Batch 504/546 | Loss 0.58178
Batch 505/546 | Loss 0.63508
Batch 506/546 | Loss 0.61426
Batch 507/546 | Loss 0.62324
Batch 508/546 | Loss 0.58159
Batch 509/546 | Loss 0.64262
Batch 510/546 | Loss 0.54493
Batch 511/546 | Loss 0.62638
Batch 512/54

In [33]:
test_batch_num = int(len(test_data_y) // batch_size)
with torch.no_grad():
    accuracy = 0
    for i in range(test_batch_num):
        batch_x, batch_y = get_batch(i,batch_size,'test')
        # resize batches and bring them to gpu
        batch_x = torch.tensor(batch_x).float().view(batch_size,threshold,input_dim).to(device)
        batch_y = torch.tensor(batch_y).int().view(batch_size,1).to(device)
        output = torch.sigmoid(imdb_classifier(batch_x))
        output = torch.round(output).int()
        accuracy += (output == batch_y).sum()
    accuracy = accuracy / (batch_size * test_batch_num)
print(f'Accuracy On Test : {np.around(accuracy.cpu() * 100,decimals=2)} %')

Accuracy On Test : 71.75 %


# Problem 5: IMD

### here i make weight matrix for my words using pretrained GloVe words

In [35]:
weights_matrix = np.zeros((MAX_VOCAB_SIZE + 2, 100))
for word, value in TEXT.vocab.stoi.items():
    try: 
        weights_matrix[value] = glove_embedding[word]
    except KeyError:
        weights_matrix[value] = np.random.normal(scale=0.6, size=(emb_dim,))

### here i define a get_batch_glove which every time i need a batch i'll pass batch & batch size & network phase(train or validation or test) to it and it returns the batch of data in numbers (it is used for the second part of question)

In [40]:
def get_batch_glove(batch,batch_size,data_type='train'):
    # get data according to training phase
    if data_type == 'train':
        word_matrix = train_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = train_data_y[batch * batch_size:(batch + 1) * batch_size]
    elif data_type == 'val':
        word_matrix = val_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = val_data_y[batch * batch_size:(batch + 1) * batch_size]
    elif data_type == 'test':
        word_matrix = test_data_x[batch * batch_size:(batch + 1) * batch_size]
        label_matrix = test_data_y[batch * batch_size:(batch + 1) * batch_size]

    number_matrix = np.zeros((batch_size,threshold))
    target_matrix = np.zeros((batch_size,1))

    # conver 'pos' and 'neg' strings to 1 and 0
    for i in range(batch_size):
        if label_matrix[i] == 'pos':
            target_matrix[i] = 1
        else:
            target_matrix[i] = 0
        # get id of word
        for j in range(threshold):
            number_matrix[i,j] = TEXT.vocab.stoi[word_matrix[i][j]]


    return (number_matrix,target_matrix)

### my imdb classifier using glove words my embedding layer is initialized by glove weights and the layer is freezed at start by setting its required_grad to False

In [41]:
# here is my imdb classifier using my own vectors in the last problem (Problem 4)
class IMDB_Classifier_glove(nn.Module):

  def __init__(self,input_dim,embedding_dim,hidden_dim,output_dim,num_layers,device):
      super(IMDB_Classifier_glove,self).__init__()
      # defining layers
      self.device = device
      self.num_layers = num_layers
      self.hidden_dim = hidden_dim
      self.embedding = nn.Embedding(input_dim,embedding_dim)
      self.embedding.load_state_dict({'weight': torch.tensor(weights_matrix)})
      self.embedding.weight.requires_grad = False
      self.lstm = nn.LSTM(embedding_dim,hidden_dim,num_layers,batch_first=True)
      self.linear = nn.Linear(hidden_dim,output_dim)
  
  def forward(self,input):
      embedded = self.embedding(input)
      h0 = torch.zeros(self.num_layers,embedded.shape[0],self.hidden_dim).to(self.device)
      c0 = torch.zeros(self.num_layers,embedded.shape[0],self.hidden_dim).to(self.device)
      lstm_out, _ = self.lstm(embedded,(h0,c0))
      out = self.linear(lstm_out[:,-1,:])
      
      return out

### i run network for 10 epochs i switch to unfreezed embedding layer when i get to epoch number 6 . each batch loss is printed and at the end of each epoch validation accuracy is printed 

In [43]:
input_dim = MAX_VOCAB_SIZE + 2
hidden_dim = 50
embedding_dim = 100
output_dim = 1
num_layers = 2
device = 'cuda:0'
imdb_classifier = IMDB_Classifier_glove(input_dim,embedding_dim,hidden_dim,output_dim,num_layers,device).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(imdb_classifier.parameters(),lr=1e-2)
epochs = 10
batch_size = 64
batch_num = int(len(train_data_x) // batch_size)

for epoch in range(epochs):
    print(f'########################### EPOCH {epoch + 1} ###########################')
    if int(epoch) == int(epochs // 2):
        imdb_classifier.embedding.weight.requires_grad = True
    epoch_loss = 0
    for i in range(batch_num):
        optimizer.zero_grad()
        batch_x, batch_y = get_batch_glove(i,batch_size,'train')
        # resize batches and bring them to gpu
        batch_x = torch.tensor(batch_x).int().view(batch_size,hidden_dim).to(device)
        batch_y = torch.tensor(batch_y).view(batch_size,1).to(device)
        output = imdb_classifier(batch_x)
        loss = criterion(output,batch_y).to(device)
        epoch_loss += loss.item()
        # optimizer update
        loss.backward()
        optimizer.step()
    
        print(f'Batch {i + 1}/{batch_num} | Loss {np.around(loss.item(),decimals=6)}')
    # run my network on validation to get the validation accuracy
    val_batch_num = int(len(val_data_y) // batch_size)
    with torch.no_grad():
        accuracy = 0
        for i in range(val_batch_num):
            batch_x, batch_y = get_batch_glove(i,batch_size,'val')
            # resize batches and bring them to gpu
            batch_x = torch.tensor(batch_x).int().view(batch_size,hidden_dim).to(device)
            batch_y = torch.tensor(batch_y).view(batch_size,1).to(device)
            output = torch.sigmoid(imdb_classifier(batch_x))
            output = torch.round(output).int()
            accuracy += (output == batch_y).sum()
    accuracy = accuracy / (batch_size * val_batch_num)
    print(f'Accuracy On Validation : {np.around(accuracy.cpu() * 100,decimals=2)}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch 480/546 | Loss 0.576049
Batch 481/546 | Loss 0.605312
Batch 482/546 | Loss 0.501254
Batch 483/546 | Loss 0.703542
Batch 484/546 | Loss 0.573294
Batch 485/546 | Loss 0.554152
Batch 486/546 | Loss 0.63374
Batch 487/546 | Loss 0.620012
Batch 488/546 | Loss 0.55252
Batch 489/546 | Loss 0.576852
Batch 490/546 | Loss 0.495697
Batch 491/546 | Loss 0.483402
Batch 492/546 | Loss 0.502313
Batch 493/546 | Loss 0.616382
Batch 494/546 | Loss 0.62386
Batch 495/546 | Loss 0.571819
Batch 496/546 | Loss 0.4708
Batch 497/546 | Loss 0.538115
Batch 498/546 | Loss 0.501396
Batch 499/546 | Loss 0.531751
Batch 500/546 | Loss 0.653873
Batch 501/546 | Loss 0.407446
Batch 502/546 | Loss 0.581847
Batch 503/546 | Loss 0.446676
Batch 504/546 | Loss 0.457732
Batch 505/546 | Loss 0.434553
Batch 506/546 | Loss 0.585861
Batch 507/546 | Loss 0.599373
Batch 508/546 | Loss 0.552903
Batch 509/546 | Loss 0.535701
Batch 510/546 | Loss 0.395296
Batch 511/

In [44]:
test_batch_num = int(len(val_data_y) // batch_size)
with torch.no_grad():
    accuracy = 0
    for i in range(test_batch_num):
        batch_x, batch_y = get_batch_glove(i,batch_size,'test')
        # resize batches and bring them to gpu
        batch_x = torch.tensor(batch_x).int().view(batch_size,hidden_dim).to(device)
        batch_y = torch.tensor(batch_y).view(batch_size,1).to(device)
        output = torch.sigmoid(imdb_classifier(batch_x))
        output = torch.round(output).int()
        accuracy += (output == batch_y).sum()
accuracy = accuracy / (batch_size * test_batch_num)
print(f'Accuracy On Test : {np.around(accuracy.cpu() * 100,decimals=2)}')

Accuracy On Test : 75.08000183105469
