In [29]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from random import randint 
from tqdm.notebook import trange, tqdm
from sklearn.feature_extraction.text import TfidfVectorizer


import sys
# the mock-0.3.1 dir contains testcase.py, testutils.py & mock.py
sys.path.append('../modules/')
from get_data import get_data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
print(f"Device used = {device}")

Device used = cuda


In [30]:
X,Y = get_data()
X=X["reviewText"]
X = list(X)
Y = list(Y)

In [31]:
vectorizer = TfidfVectorizer(min_df = 25, max_df = 0.8)
vectorizer.fit(X)
vocab = vectorizer.vocabulary_
vocab = {key:idx+1 for idx,key in enumerate(vocab)}
vocab["<PAD>"] = 0

In [32]:
#abtracting from actual words to just numbers
#cutoff length for sentences
sentence_length = 32

def normalize(data):
    word2idx = []
    for line in data:
        line = line.split()
        ans = [vocab.get(line[index], vocab["<PAD>"]) for index in range(min(sentence_length, len(line)))]
        for i in range(sentence_length - len(ans)):
            ans.append(vocab["<PAD>"])
        word2idx.append(ans)
    return word2idx

In [33]:
X = normalize(X)

In [34]:
lstm_dim = 50
embed_dim = 100

class LangID(nn.Module):
    def __init__(self, embed_dim, lstm_dim, vocab_dim):
        super(LangID, self).__init__()
        self.embedding = nn.Embedding(vocab_dim, embed_dim) #id, 100
        self.lstm = nn.LSTM(embed_dim,lstm_dim,batch_first = True, bidirectional = True)
        self.hidden2tag = nn.Linear(2*lstm_dim, 2)
        self.dropoutlayer = nn.Dropout(0.2)
    
    def forward(self, inputs):

        embeds = self.embedding(inputs)
        #print("embeds",embeds.shape)

        lstm_out, _ = self.lstm(self.dropoutlayer(embeds))
        #print("lstm_out",lstm_out.shape)
      
        tag_space = self.hidden2tag(self.dropoutlayer(lstm_out))[:,-1,:]
        #print("tag_space", tag_space.shape)
        return tag_space

In [40]:
source = torch.tensor(X)
target = torch.tensor(Y)

tmp_feats = source
tmp_labels = target
batch_size = 512
num_batches = int(len(tmp_labels)/batch_size)

tmp_feats_batches = tmp_feats[:batch_size*num_batches].view(num_batches,batch_size, sentence_length)
tmp_labels_batches = tmp_labels[:batch_size*num_batches].view(num_batches, batch_size)
tmp_feats_batches = tmp_feats_batches.to(device)
tmp_labels_batches = tmp_labels_batches.to(device)
#creating the model
model = LangID(embed_dim, lstm_dim, len(vocab))
model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.002)
t = trange(20, desc='Started Training', leave=True, position=0)

for epoch in t:
    totalloss = 0
    
    for i in tqdm(range(len(tmp_feats_batches)), desc=f'Epoch {epoch+1} progress', leave=False, position=0):
    
        feats_batch = tmp_feats_batches[i]
        labels_batch = tmp_labels_batches[i]
        #print(feats_batch.shape, labels_batch.shape)
        # Here you can call forward/calculate the loss etc.
        model.zero_grad()
        tag_scores = model.forward(feats_batch)

        #print(tag_scores.shape)
        loss = loss_function(tag_scores, labels_batch)
        totalloss += loss.item()
        loss.backward()
        optimizer.step()
#         t2.set_description(f"Epoch {epoch+1} batch:{i}")
#         t2.refresh()

    t.set_description(f"Epoch {epoch+1} loss:{totalloss}")
    t.refresh()



Started Training:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 4 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 5 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 6 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 7 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 8 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 9 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 10 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 11 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 12 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 13 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 14 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 15 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 16 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 17 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 18 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 19 progress:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 20 progress:   0%|          | 0/195 [00:00<?, ?it/s]

In [36]:
Xt, Yt = get_data(type="dev")
Xt = Xt["reviewText"]
Xt = list(Xt)
Yt = list(Yt)
Xt = normalize(Xt)
Xt = torch.tensor(Xt)
Yt = torch.tensor(Yt)
Xt = Xt.to(device)
Yt = Yt.to(device)

In [37]:
model.eval()
preds = torch.argmax(model.forward(Xt), dim = 1)

In [38]:
sum(preds == Yt)/len(Yt)

tensor(0.8381, device='cuda:0')

In [57]:
Yt

tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')