In [None]:
import os
import re
import string

import inflect
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from torch import nn
from torch.utils.data import Dataset

nltk.download("stopwords")
nltk.download("punkt")

Obtain text from a user's request.

In [2]:
#Sample request text used to walk through the preprocessing steps below.
#NOTE(review): presumably replaced by the user's actual request when this runs inside the recommender model - confirm.
sample_text = "How do I find the determinant of a 3x3 matrix?"

Preprocess the given text.

In [3]:
#First we standardize the sentence by lowercasing all words and collapsing every run of whitespace into a single space.
#(Punctuation is NOT touched here; it is stripped in the following step.)

s_words = sample_text.lower().split()
strd_text = " ".join(s_words)
print(strd_text)

how do i find the determinant of a 3x3 matrix?


In [4]:
#Next, we remove all punctuation: the translation table deletes every character listed in string.punctuation
#(ASCII punctuation only, so e.g. typographic quotes would survive).
strd_text = strd_text.translate(str.maketrans('','', string.punctuation))
print(strd_text)

how do i find the determinant of a 3x3 matrix


Optional: We can convert numbers to words and weight them so that they nudge the sentence slightly towards being interpreted as math-related.

In [5]:
#Spell out every purely numeric token (ex: 3 -> three); any other token passes through untouched.
p = inflect.engine()

#tokens before and after the number-to-word substitution
old_string = strd_text.split()
new_string = [p.number_to_words(word) if word.isdigit() else word for word in old_string]

strd_text = " ".join(new_string)

print(strd_text)

how do i find the determinant of a 3x3 matrix


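Clearly this did not work for the dimensions of the matrix: `3x3` is a single token that mixes digits with a letter, so `isdigit()` returns `False` and the token is left unchanged. This will need to be fixed in the future. Arguably, we could have treated that token as a meaningless word and removed it.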

Next, we want to remove any filler words that do not add much meaning to a sentence beyond their grammatical role. These words are known as stop words, for example "a" or "the". We will likely not need question-related words such as "how", "why", or "where" either, since the request itself should already imply a question.

In [6]:
#Remove stop words from the sentence. We need a list of stop words, but we can simply obtain the English list from nltk.
#NOTE: after this cell strd_text is a list of tokens, no longer a single string.
#NOTE(review): word_tokenize is given strd_text while it is still a string; re-running this cell on its own would pass it a list - confirm that is acceptable.
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(strd_text)
strd_text = [word for word in word_tokens if word not in stop_words]
print(strd_text)

['find', 'determinant', '3x3', 'matrix']


Optional: 
Next, we will use stemming to obtain the root form of each "meaningful" word. Stemming may not produce actual words, but it is much faster and more computationally efficient than lemmatizing, which does guarantee actual words. It is also not mandatory, although it does reduce the size of the vocabulary we need to handle.

In [7]:
#Reduce every remaining token to its Porter stem.
#The stems are stored separately in `stems`; strd_text itself is left unchanged.
stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in strd_text]
print(stems)

['find', 'determin', '3x3', 'matrix']


Now that we have preprocessed the sentence, we must vectorize it so that machine learning can be done on the sentence. For this process we will use word embeddings. The word2vec word embedding method will be used.

For the word2vec method, we first need to choose a window size; for now, let this be win_size = 5. This means that for each target word we take up to two words from the left and up to two words from the right as its context (fewer near the ends of the sentence). For sequential words not separated by spaces we could use offsets instead, but this is not necessary since we have already tokenized the sentence.

In [None]:
#Find all context words for each target in a sentence.
#For the sake of scalability, we use indices to represent words and keep a mapping of words to indices and the reverse.
#Normally while training, we are given a large amount of text to train on, so this process may take up to hours.

win_size = 5

#first count the frequency of each word in the sentence
word_freq = {}

for word in strd_text:
    word_freq[word] = word_freq.get(word, 0) + 1

text_len = len(strd_text)

#then map each word to an index and each index to a word
#(a dict keeps insertion order, so indices follow the order of first appearance)
ind_to_word = list(word_freq)
word_to_ind = {word: ind for ind, word in enumerate(ind_to_word)}

half_win = win_size // 2

#context[ind] collects the indices of the words found within half_win positions
#of every occurrence of the word whose index is ind
context = [[] for _ in ind_to_word]

for i, targ_word in enumerate(strd_text):
    #treat each word as a target; context is indexed by word index, not by the word itself
    targ_ind = word_to_ind[targ_word]

    #clamp the window so that we do not go out of bounds at either end of the sentence
    win_start = max(i - half_win, 0)
    win_end = min(i + half_win + 1, text_len)

    for j in range(win_start, win_end):
        #the target is not part of its own context
        if j != i:
            context[targ_ind].append(word_to_ind[strd_text[j]])

Define a custom collate function that pads with zeroes, since our words may have different amounts of context and therefore different numbers of pairs.
A collate function combines the `__getitem__` results of the items in a batch. We provide our own to the dataloader because the default collate function cannot stack tensors of different lengths, which is very common with sentences.

In [None]:
def collate_func(batch):
    """Zero-pad the per-item index tensors of a batch and stack them row-wise.

    Each item is indexed as ``item[0]`` .. ``item[3]``.
    NOTE(review): the dataset's ``__getitem__`` is not defined yet, so the layout
    is inferred from how the fields are used here - presumably
    ``(targets, words, contexts, count)``; confirm once it exists.

    Args:
        batch: non-empty sequence of items. ``item[0]`` and ``item[1]`` are
            written into the first ``item[2].size(0)`` slots of a row, so they
            are assumed to describe the same number of pairs as ``item[2]``.

    Returns:
        A tuple ``(t_o, w_o, c_o, d)`` where ``t_o`` and ``w_o`` are long tensors
        of shape ``(len(batch), maxlen)`` holding the zero-padded ``item[0]`` /
        ``item[1]`` rows, ``c_o`` is ``batch[0][2]`` cast to long, and ``d`` is
        the sum of every ``item[3]``.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_func received an empty batch")

    #every row is padded up to the longest item[1] in the batch
    maxlen = max(len(item[1]) for item in batch)
    d = np.sum([item[3] for item in batch])

    #NOTE(review): only the first item's contexts are returned, exactly as before;
    #stacking them for every item was commented out in the original - confirm intent.
    c_o = batch[0][2]

    t_rows = []
    w_rows = []
    for item in batch:
        c_len = item[2].size(dim=0)
        #allocate as long directly: a float buffer would lose precision on large indices
        t_out = torch.zeros(maxlen, dtype=torch.long)
        w_out = torch.zeros(maxlen, dtype=torch.long)
        t_out[:c_len] = item[0]
        w_out[:c_len] = item[1]
        t_rows.append(t_out)
        w_rows.append(w_out)

    #one padded row per batch item -> shape (len(batch), maxlen)
    t_o = torch.stack(t_rows)
    w_o = torch.stack(w_rows)

    return t_o, w_o, c_o.long(), d

Define our word2vec model

In [None]:
class word2vec(nn.Module):
    """Word2vec-style model with separate embedding tables for targets and contexts.

    A (target, context) index pair is scored by the dot product of the target's
    row in ``Target`` and the context's row in ``Context``.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        #define the two layers
        self.Target = nn.Embedding(vocab_size, embedding_dim)
        self.Context = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, targets, contexts, len1=None):
        """Return the dot product of each corresponding target/context embedding pair.

        Args:
            targets: integer index tensor of target words.
            contexts: integer index tensor of context words, same shape as ``targets``.
            len1: unused; accepted only so existing call sites keep working.

        Returns:
            Tensor with the same shape as ``targets`` holding one score per pair.
        """
        #look up embeddings for the given targets and contexts
        target_w = self.Target(targets)
        context_w = self.Context(contexts)

        #find the dot product between corresponding pairs; reducing over the last
        #(embedding) axis keeps this correct for batched 2-D index input as well
        dp = torch.sum(torch.mul(target_w, context_w), dim=-1)

        return dp

Define our training parameters, the batch_size, the learning rate, the number of epochs, and the embedding dimensions. For now, we will let embedding dim be 2 and batch size be 1 since our dataset is tiny.

In [None]:
#training parameters
epochs = 3  #number of epochs
lr = 0.1  #learning rate
batch_size = 1  #kept at 1 for now because the dataset is tiny
embedding_dim = 2  #embedding dimensions; kept at 2 for now

Define our dataset class.

In [None]:
class sentence_Dataset(Dataset):
    """Dataset wrapper around the vocabulary mappings and per-word context lists.

    Args:
        w_2in: NOTE(review): presumably the word -> index mapping - confirm.
        in2_w: NOTE(review): presumably the index -> word list; its length is
            what ``__len__`` reports - confirm.
        context: NOTE(review): presumably the per-word context index lists - confirm.
        prob_dist: stored as-is; its meaning is not visible here.
        q: stored as-is; its meaning is not visible here.
    """

    def __init__(self, w_2in, in2_w, context, prob_dist, q):
        super().__init__()

        self.w_2in = w_2in
        #the parameter is spelled in2_w; the attribute keeps the in_2w spelling
        self.in_2w = in2_w
        self.context = context
        self.prob_dist = prob_dist
        self.q = q

    def __len__(self):
        """Return the number of entries in ``in_2w``."""
        return len(self.in_2w)

    def __getitem__(self, idx):
        """Placeholder: item construction is still to be defined; returns None."""
        #to be defined

        return