In [1]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from random import randint 
from tqdm.notebook import trange, tqdm

In [22]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device used = {device}")

# Read the corpus (one entry per line) and keep only the first 10000 lines.
with open("data.txt", encoding="utf-8") as corpus_file:
    data = corpus_file.readlines()[:10000]

Device used = cuda


In [5]:
# Vocabulary: abstracts from actual words to integer ids.
# Id 0 is reserved for "<unknown>" (the token hideWord substitutes for the hidden
# word) and id 1 for "<PAD>" (used by normalize to fill short sentences).
vocab = {"<unknown>":0,"<PAD>": 1}
# Next free id to hand out to a new word.
# NOTE(review): this name shadows the builtin `id`.
id = 2

# Used for finding the word associated with a number (id -> word).
vocab_reverse = {}

# Cutoff length for sentences, counted in tokens.
sentence_length = 20

def normalize(data):
    """Encode tokenised sentences as fixed-length lists of vocabulary ids.

    Every token found within the first `sentence_length` positions of a
    sentence is registered in the global `vocab` (advancing the global `id`
    counter) if it is not there yet. Each sentence is then truncated to
    `sentence_length` tokens, mapped to ids, and right-padded with the
    "<PAD>" id up to exactly `sentence_length` entries.

    Args:
        data: sequence of sentences, each a sequence of tokens.

    Returns:
        A list with one list of `sentence_length` ids per sentence.
    """
    global id, vocab

    # First pass: hand out a fresh id to every token not seen before.
    for tokens in data:
        for token in tokens[:sentence_length]:
            if token in vocab:
                continue
            vocab[token] = id
            id += 1

    # Second pass: translate tokens to ids and pad to the fixed length.
    pad_id = vocab["<PAD>"]
    encoded = []
    for tokens in data:
        ids = [vocab.get(token, pad_id) for token in tokens[:sentence_length]]
        ids.extend([pad_id] * (sentence_length - len(ids)))
        encoded.append(ids)
    return encoded

# Invert the vocabulary into an id -> word lookup.
# NOTE(review): this runs in the same cell that creates `vocab`, so at this point
# `vocab` holds only the two special tokens; words added to `vocab` afterwards
# (e.g. by normalize) are not reflected here unless this line is executed again.
vocab_reverse = {y:x for x,y in vocab.items()}

def fixData(data):
    """Turn raw sentences into (masked sentence, hidden word) training pairs.

    Each entry is passed to hideWord; entries for which hideWord returns a
    falsy sentence (too few words) are skipped.

    Args:
        data: iterable of sentence strings.

    Returns:
        A tuple (x, y): x is the list of masked token lists, y the list of
        the corresponding hidden words, in the same order.
    """
    masked_sentences, hidden_words = [], []
    for sentence in data:
        masked, hidden = hideWord(sentence)
        if not masked:
            # Sentence was rejected by hideWord; leave it out of both lists.
            continue
        masked_sentences.append(masked)
        hidden_words.append(hidden)
    return masked_sentences, hidden_words

def hideWord(sentence):
    """Replace one randomly chosen word of a sentence with "<unknown>".

    Args:
        sentence: the sentence as a single string; it is split on whitespace.

    Returns:
        (tokens, hidden): the token list with one position replaced by
        "<unknown>", and the word that used to be at that position.
        (False, False) when the sentence has fewer than 5 words.
    """
    tokens = sentence.split()
    if len(tokens) >= 5:
        # randint is inclusive on both ends, hence the -1.
        position = randint(0, len(tokens) - 1)
        hidden = tokens[position]
        tokens[position] = "<unknown>"
        return tokens, hidden
    return False, False

In [6]:
# Mask one word per line: `x` receives the token lists with the hidden word
# replaced by "<unknown>", `y` the hidden words themselves. Lines with fewer
# than 5 words are dropped by fixData/hideWord.
x,y = fixData(data)

In [7]:
# Encode the masked sentences as fixed-length id lists; this also grows `vocab`
# with every token found in the (truncated) inputs.
x = normalize(x)

# A hidden word does not necessarily occur in any truncated input sentence,
# so give the remaining labels an id as well before encoding them.
for word in y:
    if word not in vocab:
        vocab[word] = id
        id += 1
y = [vocab[label] for label in y]

# The vocabulary is complete only at this point, so (re)build the id -> word
# lookup here. Building it before the vocabulary is populated leaves it with
# nothing but the "<unknown>" and "<PAD>" entries.
vocab_reverse = {index: token for token, index in vocab.items()}

In [8]:
# Convert the encoded inputs and labels to tensors.
# `x` is a list of id lists that normalize pads/truncates to sentence_length
# entries each; `y` is a flat list with one label id per sentence.
source = torch.tensor(x)
target = torch.tensor(y)

In [9]:
# Hidden size of each LSTM direction (passed to LangID as `lstm_dim`).
lstm_dim = 50
# Size of each token embedding vector (passed to LangID as `embed_dim`).
embed_dim = 100

class LangID(nn.Module):
    """Bidirectional-LSTM classifier that scores every vocabulary id for a token sequence.

    Args:
        embed_dim: size of each token embedding vector.
        lstm_dim: hidden size of each LSTM direction.
        vocab_dim: number of distinct token ids; used both as the size of the
            embedding table and as the number of output classes.
    """

    def __init__(self, embed_dim, lstm_dim, vocab_dim):
        super(LangID, self).__init__()
        self.embedding = nn.Embedding(vocab_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_dim, batch_first=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence 2 * lstm_dim inputs.
        self.hidden2tag = nn.Linear(2 * lstm_dim, vocab_dim)
        self.dropoutlayer = nn.Dropout(0.2)

    def forward(self, inputs):
        """Score every vocabulary id for each sequence in the batch.

        Args:
            inputs: integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_dim) holding unnormalised scores
            computed from the LSTM output at the last time step.
        """
        embeds = self.embedding(inputs)
        lstm_out, _ = self.lstm(self.dropoutlayer(embeds))

        # Select the last time step BEFORE the projection: projecting all
        # seq_len steps to vocab_dim and then discarding all but one wastes
        # seq_len times the compute and memory for the same result.
        # NOTE(review): with a bidirectional LSTM the backward half of this
        # vector has presumably processed only the final token (often padding);
        # consider building the summary from the final hidden states instead.
        last_step = lstm_out[:, -1, :]
        return self.hidden2tag(self.dropoutlayer(last_step))

In [11]:
# Work on aliases of the full dataset; only whole batches are used below.
tmp_feats = source
tmp_labels = target
torch.cuda.empty_cache()
batch_size = 32
num_epochs = 1
# Integer division: a trailing partial batch is dropped.
num_batches = len(tmp_labels) // batch_size

# Reshape to (num_batches, batch_size, ...) so indexing the first axis yields one batch.
tmp_feats_batches = tmp_feats[:batch_size*num_batches].view(num_batches, batch_size, sentence_length)
tmp_labels_batches = tmp_labels[:batch_size*num_batches].view(num_batches, batch_size)
# The whole batched dataset is moved to the device once, up front.
tmp_feats_batches = tmp_feats_batches.to(device)
tmp_labels_batches = tmp_labels_batches.to(device)

# Create the model. `id` is the next free vocabulary id, i.e. the vocabulary size.
model = LangID(embed_dim, lstm_dim, id)
model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

t = trange(num_epochs, desc='Started Training', leave=True)

# Explicitly select training mode so dropout is active even if the model was
# switched to eval mode by an earlier run of another cell.
model.train()
for epoch in t:
    # Sum of the per-batch losses over this epoch (not an average).
    totalloss = 0

    for i in tqdm(range(len(tmp_feats_batches)), desc='Epoch progress'):
        feats_batch = tmp_feats_batches[i]
        labels_batch = tmp_labels_batches[i]

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        tag_scores = model(feats_batch)

        loss = loss_function(tag_scores, labels_batch)
        totalloss += loss.item()
        loss.backward()
        optimizer.step()

    t.set_description(f"Epoch {epoch+1} loss:{totalloss} ")
    t.refresh()



Started Training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch progress:   0%|          | 0/287 [00:00<?, ?it/s]

In [16]:
# Persist the trained model to the file "model".
# NOTE(review): this hands the whole module object to torch.save rather than
# model.state_dict(); presumably loading it back requires the LangID class to be
# defined in the loading session — confirm against the PyTorch serialization notes.
torch.save(model,"model")

In [17]:
# Reload the model from the file "model".
# map_location remaps the stored tensors onto the currently selected device, so
# a model saved from a GPU session can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Recent PyTorch releases reportedly default to weights_only=True, which may
# reject a fully pickled module — confirm for the installed version.
model2 = torch.load("model", map_location=device)