In [15]:
import pandas as pd
import torch
from torch import nn
from gensim.models import Word2Vec
import functools
import numpy as np
import re

In [16]:
MAX_LENGTH = 20
BATCH_SIZE = 500
TRAIN_DATA_PATH = '..\\data\\train.csv'
NUM_SPLITS = 7
LEARNING_RATE = 0.001
MODEL_DIR = '.\\saves'
letter_vectors = Word2Vec.load('..\\preprocessing\\word2vec.model') #load word2vec model

In [17]:
fill_out = lambda word, max_length: word + ' ' * (max_length - len(word)) #add spaces until a word is 25 characters long

#remove forbidden characters from a string
def clean_string(string, forbidden_chars):
    for char in forbidden_chars:
        string = string.replace(char, '')
    return string

#convert word to matrix
@functools.cache
def convert_to_matrix(word):
    return [letter_vectors.wv[letter] for letter in word]

#convert the raw text data into cleaned pytorch tensor word2vecs
def convert_data(df):
    df = df.astype(str)
    #remove characters not in the word2vec model
    text = ''.join(list(df['Clue'])) + ''.join(list(df['Answer']))
    chars = list(set(text))
    forbidden_chars = [char for char in chars if char not in letter_vectors.wv]
    df['Clue'] = df['Clue'].apply(lambda x: clean_string(x, forbidden_chars))
    df['Answer'] = df['Answer'].apply(lambda x: clean_string(x, forbidden_chars))
    df['Obscured'] = df['Obscured'].apply(lambda x: clean_string(x, forbidden_chars))
    
    #fill clues and answers to max length
    df['Clue'] = df['Clue'].apply(lambda x: fill_out(x, MAX_LENGTH))
    df['Answer'] = df['Answer'].apply(lambda x: fill_out(x, MAX_LENGTH))
    df['Obscured'] = df['Obscured'].apply(lambda x: fill_out(x, MAX_LENGTH))
    
    rows = [row for (_, row) in df.iterrows()]
    
    source = [convert_to_matrix(row['Clue']) + convert_to_matrix(row['Obscured']) for row in rows]
    target = [convert_to_matrix(row['Answer']) for row in rows]
    
    #convert data to pytorch tensors
    source = torch.as_tensor(np.array(source))
    target = torch.as_tensor(np.array(target))
    
    return source, target

#convert a pytorch tensor of word2vec letters into a string
def tensor_to_word(input_tensor):
    word = ''
    input_list = input_tensor.tolist()
    for word2vec in input_list:
        letter_vector = np.array(word2vec)
        letter = letter_vectors.wv.most_similar(positive=[letter_vector], topn=1)[0]
        word += letter[0]
    word = re.sub('"+', '', word)
    return word

In [18]:
def evaluate_model(split, number_splits, df):
    checkpoint = torch.load(f'{MODEL_DIR}\\{split}.pth',
                           map_location=torch.device('cpu'))
    
    model = nn.Transformer(d_model=10, nhead=5) #initialize transformer model
    criterion = nn.MSELoss() 
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE) 
    
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    model.eval()
    
    #select only the data this model was not trained on
    jump = len(df) // number_splits
    begin = jump*split
    end = jump*(split+1)
    df = df.iloc[begin:end]
    df = df.reset_index()
    df = df.iloc[:2500] #only get the first 2,500, just to get an idea
    
    source, target = convert_data(df)
    length = len(source)
    
    avg_loss = 0
    for i, (sequence, answer) in enumerate(zip(source, target)):
        output = model.forward(sequence, answer)
        loss = criterion(output, answer)
        avg_loss += loss
        
        if (i+1) % BATCH_SIZE == 0:
            avg_loss /= BATCH_SIZE
            word = tensor_to_word(output)
            print(f'Iteration number {i+1} of {length}. Average loss: {avg_loss}.')
            print(f'Sample row:')
            print(df.iloc[i])
            print(f'Given answer: {word}.\n')

In [19]:
def main():
    df = pd.read_csv(TRAIN_DATA_PATH) #initially cross-validate on unseen train data
    with torch.no_grad():
        for split in range(NUM_SPLITS):
            print(f'\n\n SPLIT {split}\n\n')
            try:
                evaluate_model(split, NUM_SPLITS, df)
            except Exception: #in case the model isn't trained yet, just skip it
                continue

In [20]:
if __name__ == '__main__':
    main()



 SPLIT 0


Iteration number 500 of 2500. Average loss: 1.7745702266693115.
Sample row:
index                        499
Clue        work incorporated in
Answer                  odetojoy
Obscured                ________
Name: 499, dtype: object
Given answer: eketejey.

Iteration number 1000 of 2500. Average loss: 1.7736705541610718.
Sample row:
index                   999
Clue        buttoned weapon
Answer                 epee
Obscured               epee
Name: 999, dtype: object
Given answer: ekee.

Iteration number 1500 of 2500. Average loss: 1.76595938205719.
Sample row:
index                       1499
Clue        cultural center esta
Answer                      moma
Obscured                    moma
Name: 1499, dtype: object
Given answer: keke.

Iteration number 2000 of 2500. Average loss: 1.76057767868042.
Sample row:
index                     1999
Clue        self-righteous one
Answer                pharisee
Obscured              ________
Name: 1999, dtype: object
Given answer: k