In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt

# read csv file
data = pd.read_csv('consolidated-statements.csv', sep=',')
filteredData = data.filter(['Source', 'Type', 'Narration']);

filteredData

Unnamed: 0,Source,Type,Narration
0,HDFC_1668,CREDIT,CREDIT INTEREST CAPITALISED
1,HDFC_1668,CREDIT,NEFT CR-KKBK0000958-JAYESH MANILAL PRAJAPATI-J...
2,HDFC_1668,CREDIT,IMPS-914921830338-JAYESH MANILAL PRAJA-HDFC-XX...
3,HDFC_1668,CREDIT,ACH C- INDIABULLS HOUSING-4462246
4,HDFC_1668,CREDIT,IMPS-911912741894-JAYESH MANILAL PRAJA-HDFC-XX...
...,...,...,...
5587,PAYTM_9820875260,DEBIT,CCD Order #01130010145425040720181354
5588,PAYTM_9820875260,DEBIT,Zomato media pvt ltd Order #ZTD1868FD8E3D2287434
5589,PAYTM_9820875260,DEBIT,Reliance Jio Order #BR0001DJGFEC
5590,PAYTM_9820875260,DEBIT,Zomato media pvt ltd Order #ZTD184D51958C15B8461


In [2]:
import re
MIN_WORD_SIZE=3
test_text = ""
filteredValues = filteredData.values
for i in range(len(filteredValues)):
    text = filteredValues[i][0] + ' ' + filteredValues[i][1] + ' ' + filteredValues[i][2]
    preprocessed_text = re.sub('(N+)', ' NNN ', re.sub('[0-9]', 'N', text.lower()))
    test_text = test_text + ' ' + preprocessed_text
    
# split test text
test_text_split = list(filter(lambda word: len(word) >= MIN_WORD_SIZE, re.split(r'(,|_|-|/|\\|\s)\s*', test_text)))

# create trigrams
trigrams = [([test_text_split[i], test_text_split[i + 1]], test_text_split[i + 2])
            for i in range(len(test_text_split) - 2)]

# create vocab from word split of test text
vocab = set(test_text_split)

# create bag of words from vocab
word_to_ix = {word: i for i, word in enumerate(vocab)}

print('Bag of words length - ' + str(len(word_to_ix)))
len(trigrams)

Bag of words length - 1679


49185

In [4]:
CONTEXT_SIZE=2
EMBEDDING_DIM=10

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs



losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

print('Starting Epoch iterations ...')
for epoch in range(10):
    total_loss = 0
    print('Epoch iteration no. ' + str(epoch) + ' ...')
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Loss in this iteration - ' + str(total_loss))
    losses.append(total_loss)
print(losses)

Starting Epoch iterations ...
Epoch iteration no. 0 ...
Loss in this iteration - 141871.59705452778
Epoch iteration no. 1 ...
Loss in this iteration - 105563.39315087834
Epoch iteration no. 2 ...
Loss in this iteration - 92070.79505306079
Epoch iteration no. 3 ...
Loss in this iteration - 83846.38441041586
Epoch iteration no. 4 ...
Loss in this iteration - 78254.70624964967
Epoch iteration no. 5 ...
Loss in this iteration - 74073.14088842714
Epoch iteration no. 6 ...
Loss in this iteration - 70710.24334811456
Epoch iteration no. 7 ...
Loss in this iteration - 67925.85389956462
Epoch iteration no. 8 ...
Loss in this iteration - 65577.07865532045
Epoch iteration no. 9 ...
Loss in this iteration - 63556.69553248256
[141871.59705452778, 105563.39315087834, 92070.79505306079, 83846.38441041586, 78254.70624964967, 74073.14088842714, 70710.24334811456, 67925.85389956462, 65577.07865532045, 63556.69553248256]


In [5]:
print('Starting Epoch iterations ...')
for epoch in range(10):
    total_loss = 0
    print('Epoch iteration no. ' + str(epoch) + ' ...')
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Loss in this iteration - ' + str(total_loss))
    losses.append(total_loss)
print(losses)

Starting Epoch iterations ...
Epoch iteration no. 0 ...
Loss in this iteration - 61801.78295147545
Epoch iteration no. 1 ...
Loss in this iteration - 60259.510072769845
Epoch iteration no. 2 ...
Loss in this iteration - 58874.91372604875
Epoch iteration no. 3 ...
Loss in this iteration - 57617.516171606556
Epoch iteration no. 4 ...
Loss in this iteration - 56473.19627365093
Epoch iteration no. 5 ...
Loss in this iteration - 55419.91648447178
Epoch iteration no. 6 ...
Loss in this iteration - 54457.05880519524
Epoch iteration no. 7 ...
Loss in this iteration - 53556.07865188635
Epoch iteration no. 8 ...
Loss in this iteration - 52745.25296808808
Epoch iteration no. 9 ...
Loss in this iteration - 51979.1788498352
[141871.59705452778, 105563.39315087834, 92070.79505306079, 83846.38441041586, 78254.70624964967, 74073.14088842714, 70710.24334811456, 67925.85389956462, 65577.07865532045, 63556.69553248256, 61801.78295147545, 60259.510072769845, 58874.91372604875, 57617.516171606556, 56473.19

In [22]:
print('Saving trained model v0.1 ...')
torch.save(model, 'model_v0.1.pt')


Saving trained model v0.1 ...


In [16]:
print('Loading trained model v0.1 ...')
model2 = torch.load('model_v0.1.pt')
losses = []
optimizer = optim.SGD(model2.parameters(), lr=0.001)
print('Starting Epoch iterations on loaded model v0.1 ...')
for epoch in range(10):
    total_loss = 0
    print('Epoch iteration no. ' + str(epoch) + ' ...')
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model2.zero_grad()
        log_probs = model2(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Loss in this iteration - ' + str(total_loss))
    losses.append(total_loss)
print(losses)

Loading trained model v0.1 ...
Starting Epoch iterations on loaded model v0.1 ...
Epoch iteration no. 0 ...
Loss in this iteration - 51257.546708449154
Epoch iteration no. 1 ...
Loss in this iteration - 50576.92935818025
Epoch iteration no. 2 ...
Loss in this iteration - 49926.59133454486
Epoch iteration no. 3 ...
Loss in this iteration - 49321.7889211978
Epoch iteration no. 4 ...
Loss in this iteration - 48759.12520646511
Epoch iteration no. 5 ...
Loss in this iteration - 48238.24063310995
Epoch iteration no. 6 ...
Loss in this iteration - 47727.70138587325
Epoch iteration no. 7 ...
Loss in this iteration - 47254.770242834275
Epoch iteration no. 8 ...
Loss in this iteration - 46807.11957786247
Epoch iteration no. 9 ...
Loss in this iteration - 46383.540935100966
[51257.546708449154, 50576.92935818025, 49926.59133454486, 49321.7889211978, 48759.12520646511, 48238.24063310995, 47727.70138587325, 47254.770242834275, 46807.11957786247, 46383.540935100966]


In [21]:
print('Saving trained model2 v0.2 ...')
torch.save(model2, 'model_v0.2.pt')


Saving trained model2 v0.2 ...


In [18]:
result = model.forward(torch.tensor([word_to_ix['jayesh'], word_to_ix['manilal']], dtype=torch.long))
result

tensor([[-11.4456, -10.7538,  -9.6564,  ..., -12.3577, -12.6709,  -5.9774]],
       grad_fn=<LogSoftmaxBackward>)

In [19]:
result = model2.forward(torch.tensor([word_to_ix['jayesh'], word_to_ix['manilal']], dtype=torch.long))
result

tensor([[-12.0818, -11.5486, -10.6210,  ..., -14.0178, -14.4057,  -6.6023]],
       grad_fn=<LogSoftmaxBackward>)

In [27]:
result = model(torch.tensor([word_to_ix['ccd'], word_to_ix['order']], dtype=torch.long)).tolist()
print(str(len(result[0])) + ' ' + str(min(result[0])) + ' ' + str(max(result[0])))

1679 -20.701175689697266 -0.032097600400447845


In [29]:
word_to_ix['ccd'] + (-0.032097600400447845)

1082.9679023995996

In [34]:
for word, index in word_to_ix.items():
    if index == 1062:
        print('Min Word: ' + word)
    if index == 1063:
        print('Min Word: ' + word)
    if index == 1084:
        print('Max Word: ' + word)
    if index == 1082:
        print('Max Word: ' + word)

Min Word: caafb
Min Word: eccdbe
Max Word: ketan
Max Word: mysore
