In [1]:
#Import packages

import numpy as np
import csv
import pandas as pd
import fasttext
import fasttext.util

import torchvision
import matplotlib.pyplot as plt
import random
%matplotlib inline

import torch
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init

from torch.utils.data import Dataset

In [2]:
#Load in dataset
FileName = "../ProcessedData/training_data_clean2.csv"

df = pd.read_csv(FileName, encoding='latin-1')

In [140]:
# See the data
print(df["query"][0:50000])

0                                   0
1                             0  0 01
2                           0  good y
3                       0 001 218 172
4               0 001 218 172 starter
                     ...             
49995                     ampd mobile
49996    ampd mobile dealer locations
49997         ampd mobile lauren hill
49998              ampd mobile phones
49999                       ampdmobil
Name: query, Length: 50000, dtype: object


In [3]:
# Create a dictionary with all characters
symbol_dict = {}
count = 1
for word in df["query"]:
    if type(word) == str:
        for letter in word:
            if (symbol_dict.get(letter) == None):
                symbol_dict[letter] = count
                count += 1

vocabSize = len(symbol_dict)

In [67]:
print(symbol_dict)

{'0': 1, ' ': 2, '1': 3, 'g': 4, 'o': 5, 'd': 6, 'y': 7, '2': 8, '8': 9, '7': 10, 's': 11, 't': 12, 'a': 13, 'r': 14, 'e': 15, '9': 16, 'n': 17, 'p': 18, 'c': 19, 'i': 20, 'b': 21, 'l': 22, 'f': 23, 'w': 24, 'q': 25, 'u': 26, 'm': 27, 'h': 28, 'v': 29, 'x': 30, 'z': 31, '4': 32, '3': 33, '6': 34, '5': 35, 'j': 36, 'k': 37}


In [241]:
#Create function for one hot encoding for data loader
def oneHotEncodeList(wordList, freqList, vocabSize, fixed_size = None):
    oneHotVector = []
    target_vector = []
    for index in range(len(wordList)):
        word = wordList[index]
        if type(word) == str:
            number_list = []
            count = 0
            for letter in word:
                count += 1
                if (symbol_dict.get(letter) != None and (fixed_size == None or count <= fixed_size)):
                    number_list.append(symbol_dict.get(letter))
            if (fixed_size != None):
                while(len(number_list) < fixed_size):
                        number_list.append(0)
            oneHotWord = torch.nn.functional.one_hot(torch.tensor(number_list),vocabSize +1)
            
            #add more samples based on the frequency of the item.
            for i in range(freqList[index]):
                oneHotVector.append(oneHotWord)
                target_vector.append(torch.tensor(freqList[index]).float())
    return oneHotVector, target_vector

In [325]:
#Create dataloader
#Dataloader with one hot encoding
class MyDataset(Dataset):
    def __init__(self,file_name, vocabSize, dataSize = None, fixed_size = None):
        if (dataSize != None):
            self.df = pd.read_csv(file_name, encoding='latin-1')
            self.wordTensor, self.target = oneHotEncodeList(self.df["query"][0:dataSize], self.df["frequency"][0:dataSize], vocabSize, fixed_size =  fixed_size)
        else:
            self.df = pd.read_csv(file_name, encoding='latin-1')
            self.wordTensor, self.target = oneHotEncodeList(self.df["query"], self.df["frequency"], vocabSize, fixed_size =  fixed_size)

    def __len__(self):
        return len(self.target)
    
    def __getitem__(self,idx):
        return (self.wordTensor[idx], self.target[idx])

In [326]:
dataset = MyDataset(FileName, vocabSize, dataSize = 150000)

In [327]:
# code to check the average frequncy in the data set
total_sum = 0
for i in range(dataset.__len__()):
    total_sum += dataset.__getitem__(i)[1]
print(total_sum/dataset.__len__())

tensor(79.6529)


In [328]:
batch_size = 1
trainloader = torch.utils.data.DataLoader(dataset, 
                                          batch_size = batch_size, # Set batch size to one beacuse different length of sequences. 
                                          shuffle=True, 
                                          num_workers=0,
                                          pin_memory=True,
                                          drop_last=True)
dataiter = iter(trainloader)

In [329]:
#This is our neural network class. every Neural Network in pytorch extends nn.Module
class MyLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers = 1):
        super(MyLSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.LSTM = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        self.linear1 = nn.Linear(self.hidden_dim, 1)
        self.linear2 = nn.Linear(25, 10)
        self.linear3 = nn.Linear(10, 1)
        
    #Input must be 3 dimensional (Sequence len, batch, input dimensions)
    #hc is a tuple which contains the vectors h (hidden/feedback) and c (cell state vector)
    def forward(self,inp, hc = None):
        if hc == None:
            hc = self.initHC()
        seq_len = inp.size()[0]
        #this gives outut for each input and also (hidden and cell state vector)
        output, hidden = self.LSTM(inp,hc)
        #Use fully connect layer to get a single output
        output = torch.relu(self.linear1(output[seq_len-1]))
        #output = torch.relu(self.linear2(output))
        #output = torch.relu(self.linear3(output))
        return torch.squeeze(output), hidden
    
    def initHC(self, batch_size = batch_size):
        #initalise hidden state and cell state
        h = torch.zeros((self.num_layers, batch_size , self.hidden_dim))
        c = torch.zeros((self.num_layers ,batch_size , self.hidden_dim))
        return (h,c)

In [343]:
# Create network:
hidden_dim = 100
vocab_size = vocabSize + 1
lstm_layers = 1
network = MyLSTM(vocab_size, hidden_dim, lstm_layers)
criterion = nn.MSELoss()


learning_rate = 0.005
optimizer = optim.Adam(network.parameters(), lr=learning_rate)

In [344]:
#Creating hidden state and cell state.
h = torch.zeros(hidden_dim*lstm_layers*batch_size).view(lstm_layers, batch_size, hidden_dim)
c = torch.zeros(hidden_dim*lstm_layers*batch_size).view(lstm_layers, batch_size, hidden_dim)

hc = (h,c)

In [345]:
# Function to run thorugh a single query:
def train(network, data_pair, hidden_state):
    #Create optimizer
    network.train()
    optimizer.zero_grad()

    #Create input vector from word
    input_vector = data_pair[0].view(data_pair[0].size()[1], batch_size, vocab_size).float()

    
    # Calculate the prediction
    output, hc = network.forward(input_vector, None)
    # Calculate loss and backpropagate the eroor
    loss = criterion(output, data_pair[1])
    loss.backward()
    optimizer.step()
    if (np.random.randint(10000) == 0):
        print(output)
    return loss.item()
    
    
# Function to run through multiple queries:
def iterTrain(epochs = 2):
    loss_list = []
    h = torch.zeros(hidden_dim*lstm_layers*batch_size).view(lstm_layers, batch_size, hidden_dim)
    c = torch.zeros(hidden_dim*lstm_layers*batch_size).view(lstm_layers, batch_size, hidden_dim)

    hc = (h,c)
    running_loss = 0
    item_count = 0
    for epoch in range(epochs):
        for data_pair in trainloader:
            item_count += 1
            loss = train(network, data_pair, hc)
            loss_list.append(loss)
            running_loss += loss
            if (item_count % 1000 == 0):
                print("Epoch {}, after {} items is the average loss for last part is {}, the overall average loss is {}".format(epoch, item_count, sum(loss_list)/1000, running_loss/item_count))
                loss_list = []

In [346]:
#Run the training
iterTrain(epochs = 4)


Epoch 0, after 1000 items is the average loss for last part is 83328.00945023245, the overall average loss is 83328.00945023245
Epoch 0, after 2000 items is the average loss for last part is 103417.8817155272, the overall average loss is 93372.94558287981
Epoch 0, after 3000 items is the average loss for last part is 77322.49415506446, the overall average loss is 88022.79510694137
Epoch 0, after 4000 items is the average loss for last part is 83514.55815464113, the overall average loss is 86895.73586886631
Epoch 0, after 5000 items is the average loss for last part is 106818.74286954907, the overall average loss is 90880.33726900286
Epoch 0, after 6000 items is the average loss for last part is 80645.30187581349, the overall average loss is 89174.49803680464
Epoch 0, after 7000 items is the average loss for last part is 100198.91490952163, the overall average loss is 90749.41473290705
Epoch 0, after 8000 items is the average loss for last part is 58589.05387277694, the overall average 

Epoch 0, after 64000 items is the average loss for last part is 75431.92250464998, the overall average loss is 84655.1400296551
Epoch 0, after 65000 items is the average loss for last part is 114753.47689888273, the overall average loss is 85118.19136610474
Epoch 0, after 66000 items is the average loss for last part is 57702.9402113646, the overall average loss is 84702.80877285107
Epoch 0, after 67000 items is the average loss for last part is 84706.68549416908, the overall average loss is 84702.86663436328
Epoch 0, after 68000 items is the average loss for last part is 58178.67373241992, the overall average loss is 84312.80497404055
Epoch 0, after 69000 items is the average loss for last part is 67609.03572085542, the overall average loss is 84070.72136167558
Epoch 0, after 70000 items is the average loss for last part is 65850.33786660098, the overall average loss is 83810.4301688888
Epoch 0, after 71000 items is the average loss for last part is 89402.53453613537, the overall aver

Epoch 0, after 126000 items is the average loss for last part is 24700.303203642965, the overall average loss is 71470.33531520904
Epoch 0, after 127000 items is the average loss for last part is 36374.28650795515, the overall average loss is 71193.98847420701
Epoch 0, after 128000 items is the average loss for last part is 24953.643613322976, the overall average loss is 70832.73577998133
Epoch 0, after 129000 items is the average loss for last part is 28096.69578021939, the overall average loss is 70501.44864820025
Epoch 0, after 130000 items is the average loss for last part is 36282.601769107554, the overall average loss is 70238.22674913028
Epoch 0, after 131000 items is the average loss for last part is 43671.12903793612, the overall average loss is 70035.42447652573
Epoch 0, after 132000 items is the average loss for last part is 49655.04037615587, the overall average loss is 69881.02762728055
Epoch 0, after 133000 items is the average loss for last part is 25435.776112714455, th

Epoch 0, after 188000 items is the average loss for last part is 93504.29526791534, the overall average loss is 72013.37605037198
Epoch 0, after 189000 items is the average loss for last part is 57824.811182713864, the overall average loss is 71938.30427858545
Epoch 0, after 190000 items is the average loss for last part is 57816.81662392062, the overall average loss is 71863.98065935037
Epoch 0, after 191000 items is the average loss for last part is 49041.32126584823, the overall average loss is 71744.4902960336
Epoch 0, after 192000 items is the average loss for last part is 63018.735888728515, the overall average loss is 71699.04365849558
Epoch 0, after 193000 items is the average loss for last part is 41980.017640319275, the overall average loss is 71545.05906772781
Epoch 0, after 194000 items is the average loss for last part is 54818.670286631444, the overall average loss is 71458.8405688562
Epoch 0, after 195000 items is the average loss for last part is 36873.397209903625, the

Epoch 0, after 250000 items is the average loss for last part is 60728.530372529596, the overall average loss is 67882.4210163509
Epoch 0, after 251000 items is the average loss for last part is 75413.51666956634, the overall average loss is 67912.42538150314
Epoch 0, after 252000 items is the average loss for last part is 101514.83751063535, the overall average loss is 68045.76828677747
Epoch 0, after 253000 items is the average loss for last part is 90492.39826244114, the overall average loss is 68134.4901443888
Epoch 0, after 254000 items is the average loss for last part is 125885.44192421001, the overall average loss is 68361.85609627786
Epoch 0, after 255000 items is the average loss for last part is 73729.77099305725, the overall average loss is 68382.9067429319
Epoch 0, after 256000 items is the average loss for last part is 99505.71833650112, the overall average loss is 68504.48022571928
Epoch 0, after 257000 items is the average loss for last part is 81395.70034473378, the ov

Epoch 0, after 311000 items is the average loss for last part is 104489.7122880559, the overall average loss is 72767.6418590144
Epoch 0, after 312000 items is the average loss for last part is 83999.84104670334, the overall average loss is 72803.64249743649
tensor(0., grad_fn=<SqueezeBackward0>)
Epoch 0, after 313000 items is the average loss for last part is 120251.70065546573, the overall average loss is 72955.23373755798
Epoch 0, after 314000 items is the average loss for last part is 72348.40252553558, the overall average loss is 72953.3011540802
Epoch 0, after 315000 items is the average loss for last part is 85122.03167073292, the overall average loss is 72991.93204460925
Epoch 0, after 316000 items is the average loss for last part is 85459.89929359818, the overall average loss is 73031.38763716936
Epoch 0, after 317000 items is the average loss for last part is 88065.11466032847, the overall average loss is 73078.81264355157
Epoch 0, after 318000 items is the average loss for 

Epoch 0, after 376000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 377000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 378000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 379000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 380000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 381000 items is the average loss for last part is nan, the overall average loss is nan
tensor(nan, grad_fn=<SqueezeBackward0>)
Epoch 0, after 382000 items is the average loss for last part is nan, the overall average loss is nan
Epoch 0, after 383000 items is the average loss for last part is nan, the overall average loss is nan
tensor(nan, grad_fn=<SqueezeBackward0>)
Epoch 0, after 384000 items is the average loss for last part is nan, the overall average loss is nan
Ep

KeyboardInterrupt: 

In [349]:
def testWord(word, test_network , vocabSize = 37, fixed_size = 50):
    number_list = []
    count = 0
    for letter in word:
        count += 1
        if (symbol_dict.get(letter) != None and (fixed_size == None or count <= fixed_size)):
            number_list.append(symbol_dict.get(letter))
        if (fixed_size != None):
            while(len(number_list) < fixed_size):
                    number_list.append(0)
                    
    h = torch.zeros((1,1,100))
    c = torch.zeros((1,1,100))

    hc = (h,c)
    encoded_vector = torch.nn.functional.one_hot(torch.tensor(number_list),vocabSize +1)
    #print(encoded_vector.size())
    return(test_network(encoded_vector.view(encoded_vector.size()[0], 1, vocabSize +1).float(),hc)[0].item())

In [350]:
testWord("google", network)


nan