# Data processing

In [None]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import random
import re 
from torch.utils.data import DataLoader, TensorDataset
from 
import pandas as pd
n_model=3

In [22]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    dataset=[]
    with open(filename) as f:
        line = f.readline()
        cpt=1
        skip=0
        while line :
            cleanline = clean_str(f.readline()).split()

            
            if cleanline: 
                dataset.append(cleanline)
            else: 
                line = f.readline()
                skip+=1
                continue
            if limit > 0 and cpt >= limit: 
                break
            line = f.readline()
            cpt+=1        

        print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
        
    for i in range(len(dataset)):
        # remove the last two tokens BUT ONLY if they are actually label + ID
        if dataset[i][-2].isdigit() and dataset[i][-1].isalnum():
            dataset[i] = dataset[i][:-2]

        dataset[i] = [w for w in dataset[i] if w != ","] #remove single commas
    

    return dataset



def make_loader(data, labels, batch_size=32):
    X = th.stack(data)      # (N, L)
    y = labels              # (N,)
    dataset = TensorDataset(X, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [23]:
train_data_text = loadTexts("train.tsv")
test_data_text = loadTexts("test.tsv")
dev_data_text = loadTexts("dev.tsv")

for i in range(len(dev_data_text)):
    dev_data_text[i] = dev_data_text[i][:-2] # remove the "," and other index at the end of each sentence


Load  21706  lines from  train.tsv  /  0  lines discarded
Load  2714  lines from  test.tsv  /  1  lines discarded
Load  2714  lines from  dev.tsv  /  0  lines discarded


In [None]:
print(dev_data_text[0:20])

[['you', 'know', 'the', 'answer', 'man', 'you', 'are', 'programmed', 'to', 'capture', 'those', 'codes', 'they', 'send', 'you', 'don', 't', 'avoid', 'them'], ['the', 'economy', 'is', 'heavily', 'controlled', 'and', 'subsidized', 'by', 'the', 'government', 'in', 'any', 'case', 'i', 'was', 'poking', 'at', 'the', 'lack', 'of', 'nuance', 'in', 'us', 'politics'], ['thank', 'you', 'for', 'your', 'vote', 'of', 'confidence', 'but', 'we', 'statistically', 'ca', "n't", 'get', 'to'], ['there', 'it'], ['good', 'we', 'do', "n't", 'want', 'more', 'thrash', 'liberal', 'offspring', 'in'], ['i', 'went', 'to', 'a', 'destination', 'wedding', 'being', 'the', 'only', 'single', 'person', 'promised', 'to', 'never', 'put', 'myself', 'in', 'that', 'situation'], ['like', 'this', 'just', 'cuz', 'of', 'the', 'name', 'rhymes', 'background', 'raps', 'but', 'dude', 'your', 'name', 'is', 'sick'], ['as', 'an', 'anesthesia', 'resident', 'this', 'made', 'me', 'blow', 'air', 'out', 'my', 'nose', 'at', 'an', 'accelerated',

### Word dictionnary

In [None]:
class WordDict:
    # constructor, words must be a set containing all words
    def __init__(self, words):
        assert type(words) == set
        # TODO
        self.words=sorted(list(words))
        self.word_dict_={w: i+3 for i, w in enumerate(self.words)}
        self.id_dict_={i+3: w for i, w in enumerate(self.words)}
        
        # return the integer associated with a word
    def word_to_id(self, word):
        assert type(word) == str
        return self.word_dict_[word]
        # TODO

        # return the word associated with an integer
    def id_to_word(self, idx):
        assert type(idx) == int
        # TODO
        return self.id_dict_[idx]
    
        # number of word in the dictionnary
    def __len__(self):
        # TODO
        return len(self.word_dict_)


        
train_words = set()
train_words.update(["<bos>", "<eos>","<unk>"])

for sentence in train_data_text:
    train_words.update(sentence)

word_dict = WordDict(train_words)

vocab_size=len(word_dict)


18657

### Tensoring the data

In [None]:
def tensorize_sentence(sentence,n=n_model):
    """
    Tensorizing function
    Args: sentence: list of words as text
    Returns: output_tensor: Tensors of type long, encoding the words via index of the word dictionnary
    """
    start_index=word_dict.word_dict_["<bos>"]
    end_index=word_dict.word_dict_["<eos>"]
    output_tensor=[word_dict.word_dict_.get(word, word_dict.word_dict_["<unk>"]) for word in sentence]
    output_tensor=[start_index]*(n-1)+output_tensor+[end_index] # pad accoring to n to ensure correct first n-gram
    return output_tensor

train_data = [tensorize_sentence(sentence) for sentence in train_data_text]
test_data = [tensorize_sentence(sentence) for sentence in test_data_text]
dev_data = [tensorize_sentence(sentence) for sentence in dev_data_text]


### Generate training pairs

In [None]:
def generate_pairs(training_data,n=n_model):
    """
    Generate context,target pairs to train the MLP on
    Args: 
        traning_data: list of words as indexes, 
        n size of the Ngram  model
    
    Returns:         
        X: list of context vectors of shape (n-1,)
        y: list of target word IDs
    """
    X=[]
    y=[]
    for sentence in training_data:
        for index,word in enumerate(sentence) :
            if index>= n-1:
                training_words=sentence[index-(n-1):index]
                target_word=word
                X.append(training_words)
                y.append(target_word)

    return X,y

X_train,y_train=generate_pairs(train_data)
X_train=th.tensor(X_train, dtype=th.long)
y_train=th.tensor(y_train, dtype=th.long)


# Neural N Gram model

In [None]:
class NeuralNgram(nn.Module):
    def __init__(self,n=n_model,embedding_dim=100,hidden_dim=16):
        super().__init__()
        self.n=n
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim=embedding_dim
        
        # Small MLP classifier 
        self.mlp = nn.Sequential(
            nn.Linear((self.n-1)*embedding_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, vocab_size)
        )

        #Logging objects
        self.loss_history=[]




    def forward(self,inputs):
        """
        Forward pass for Neural Ngram.

        Args:
            inputs: LongTensor of shape (batch_size, n-1)
                    where each entry is a token index.

        Returns:
            logits: Tensor of shape (batch_size,vocab_size)
        """
        # Convert tokens to embeddings Shape (batch_size,n,embedding size) â†’ (batch, seq_len, embedding_dim)
        embeds = self.emb(inputs)
        embeds = embeds.view(inputs.size(0), self.embedding_dim * (self.n-1))         # flatten (batch_size, 2*emb_dim) to correct mlp size
        logits=self.mlp(embeds)
        return logits
    
    
    def fit(self,X_train,y_train,epochs,lr=0.001,batch_size=32):
        self.train()
        optimizer = th.optim.Adam(self.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        for _ in range(epochs):
            
            #Shuffle data to maximise randomness
            perm = th.randperm(len(X_train))
            X_train = X_train[perm]
            y_train = y_train[perm]

            total_loss=0.0

            for start in range(0, len(X_train), batch_size):
                optimizer.zero_grad()
                xb = X_train[start:start + batch_size]
                yb = y_train[start:start + batch_size]

                logits=self.forward(xb)
                loss=criterion(logits,yb)
                
                loss.backward()
                optimizer.step()
                total_loss+= loss.item()*xb.size(0)

            avg_loss = total_loss/len(X_train)
            self.loss_history.append(avg_loss)


implement unknown word embedding

# LSTM model