This version uses an LSTM with a single hidden layer and softmax activation to classify real versus corrupted sentences.

In [1]:
# Load the corpora

In [41]:
# Standard pytorch imports 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

#spacy model
import spacy
from spacy.tokenizer import Tokenizer
#load Spacy spanish model to handle tokenization of toy data
nlp = spacy.load('es_core_news_sm')
tokenizer = Tokenizer(nlp.vocab)

#other utilities
import numpy as np
from numpy.random import choice
import random
from collections import defaultdict
import math
from string import capwords
import re
import string

In [86]:

corpus_name = "euro.toy"

test_file_base = "Data/{cn}.{kind}.".format(cn=corpus_name,kind = "test")
train_file_base = "Data/{cn}.{kind}.".format(cn = corpus_name, kind = "train")

print("Loading Data")
# Method to load corrupted data
def load_corrupt(base_fn):
    corrupt_types = ["prepRM", "verbRM", "verbInfl", "adjInfl"]
    corrupt_sentences = {"prepRM":[],
                         "verbRM":[],
                         "verbInfl":[],
                         "adjInfl":[]}
    for corr_type in corrupt_types:
        filename = "{base}2corrupt.corrupted_by.{typ}"\
        .format(
            base=base_fn,
            typ = corr_type)
        file = open(filename,"r")
        for line in file.readlines():
            corrupt_sentences[corr_type].append(line.strip())
    file.close()
    return(corrupt_sentences)

# Load corrupted train data
corrupt_sentences_train = load_corrupt(train_file_base)
    
# Load uncorrupted train data
uncorrupted_sentences_train = []
uncorrupted_train_fn = train_file_base + "2keep"
uncorrupted_train_file = open(uncorrupted_train_fn, "r")
for line in uncorrupted_train_file.readlines():
    uncorrupted_sentences_train.append(line.rstrip())


# Preprocess
#Count to get rid of rare words
counts = defaultdict(int)
for sentence in uncorrupted_sentences_train:
    tokens = sentence.split()
    for token in sentence:
        token = token.rstrip(",!;:)*)»}").lstrip("(«{").lower()
        counts[token] +=1
        
for corr_type in corrupt_sentences_train:
    for sentence in corrupt_sentences_train[corr_type]:
        tokens = sentence.split()
        for token in sentence:
            token = token.rstrip(",!;:)*)»}").lstrip("(«{").lower()
            counts[token] +=1
            
            
# In case anyone else is reading this, I do know a "hapax legomenoi
# is a word that only apears once in a corpus but I want this
# to be generalizable and realy like the variable name so here you
# get to define the threshold

hapax_threshold = 1            
hapaxes = []
counts["#unk"]=0
for key in counts:
    if counts[key] <= hapax_threshold:
        hapaxes.append(key)

for hapax in hapaxes:
    counts.pop(hapax)
    
# I know keys is kept as a set but I want it to remain a set
vocabulary = set(counts.keys())
vocabulary = vocabulary.union({",",".","(",")",'''"''',";",":","#num",
                              })


def bulk_token_replacement(sentences, hapaxes):
    # Takes a list of sentences that have gone through the spacy pipeline
    # (sentences have Doc type)
    # Returns a list of sentences, each of which is a list of words (str)
    # Words specified in hapaxes are replaced by UNK
    # TODO: implement a version that replaces words by their tag instead
    processed = [token_replacement(sentence,hapaxes) 
                 for sentence in sentences]
    return processed

def token_replacement(sentence,hapaxes):
    number_regex = re.compile("\d[.,0-9]+")
    parsed = nlp(sentence)
    this_sentence = []
    for token in parsed:
        if number_regex.match(token.text):
            this_sentence.append("#num")
        elif token.text.lower() in hapaxes:
            pos = "#"+token.pos_.lower()
            vocabulary.add(pos)
            this_sentence.append(pos)
        else:
            this_sentence.append(token.text)
    return this_sentence

# Preprocess

print("Preprocessing data")

uncorrupted_sentences_train = bulk_token_replacement(
    uncorrupted_sentences_train, hapaxes)
print("Uncorrupted data ready")

for corr_type in corrupt_sentences_train:
    corrupt_sentences_train[corr_type] = bulk_token_replacement(
        corrupt_sentences_train[corr_type],
        hapaxes)
print("Corrupted data ready")

# Assign Labels and flags
labeled_sentences_train = []
for sentence in uncorrupted_sentences_train:
    labeled_sentences_train.append((sentence,1,0))
code = 0
for corr_type in corrupt_sentences_train:
    code += 1
    for sentence in corrupt_sentences_train[corr_type]:
        labeled_sentences_train.append((sentence,0,code))
        
#Shuffle

random.shuffle(labeled_sentences_train)
    
# Save the corrupted corpora with labels
training_corpus_fn = train_file_base + "labeled.trainingV2"

def save_corpus(data,filename):
    out_file = open(filename,"w")
    for instance in data:
        words = " ".join(instance[0])
        label = str(instance[1])
        code = str(instance[2])
        out = words + "|" + label + "|" + code + "\n"
        out_file.write(out)
    out_file.close()
    
save_corpus(labeled_sentences_train,training_corpus_fn)


# Print summary
summary = """Training Corpus has {full} labeled sentences.
With {uncorrupted} real ones and {corrupted} ungrammatical ones""".format(
            full = len(labeled_sentences_train),
            uncorrupted =len(uncorrupted_sentences_train),
            corrupted= len(labeled_sentences_train)-len(uncorrupted_sentences_train)
            )

print(summary)

# Build the identifiers for all words in the training vocabulary
word_to_ix = {}
for word in vocabulary:
    word_to_ix[word]= len(word_to_ix)



Loading Data
Preprocessing data
Uncorrupted data ready
Corrupted data ready
Training Corpus has 12363 labeled sentences.
With 6503 real ones and 5860 ungrammatical ones


In [64]:
labeled_sentences_train[0][0]

['Lo',
 'mucho',
 ',',
 'señor',
 'Hänsch',
 ',',
 'señor',
 'Cox',
 ',',
 'no',
 'he',
 'advertido',
 'que',
 'ustedes',
 'pedían',
 'la',
 'palabra',
 '.']

In [87]:
# Define the Neural network basic architecture


class Linguo(nn.Module):
    def __init__(self,embedding_dim, vocab_size, lstm_dim , hidden_dim):
        super(Linguo,self).__init__()
        # Store the hidden layer dimension
        self.hidden_dim = hidden_dim
        # Define word embeddings
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Define LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #Define hidden linear layer
        self.hidden2dec = nn.Linear(hidden_dim, 2)
        #Define the hidden state
        self.hstate = self.init_hstate()
        
    def forward(self, inputsentence):
        linguo.hstate = linguo.init_hstate()
        embeds = self.word_embeddings(inputsentence)
        lstm_out, self.hstate = self.lstm(embeds.view(len(inputsentence),1, -1), self.hstate)
        decision_lin = self.hidden2dec(lstm_out[-1])
        #print(decision_lin)
        decision_fin = F.log_softmax(decision_lin)
        return decision_fin
        
        
    def init_hstate(self):
        var1 = autograd.Variable(torch.zeros(1, 1, self.hidden_dim)) 
        var2 = autograd.Variable(torch.zeros(1, 1, self.hidden_dim))
        hidden_state = (var1, var2)
        return hidden_state
        
        
def prepare_input(word_to_ix, sentence,nlp):
    idxs = []
    try:
        #If every word is known then our work is easy
        for word in sentence:
            idxs.append(word_to_ix[word.lower()])
    except KeyError :
        #If there is at least one unknown word we parse and tag
        processed = nlp(" ".join(sentence))
        for token in processed:
            word = token.text.lower()
            pos = "#"+token.pos_.lower()
            if word in word_to_ix:
                #Known tokens are still input normally
                idxs.append(word_to_ix[word])
            elif pos in word_to_ix:
                #Tagged inputs are replaced by their tag
                idxs.append(word_to_ix[pos])
            else:
                # If no tad is available we default to a proper noun
                idxs.append(word_to_ix["#propn"])

    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
     

In [88]:
# Training time! Cue Eye of the Tiger

#Modify hyper parameters here while we get config files
embed_dim = 32
lstm_dim = 64
voc_size = len(word_to_ix)
hidden_dim = 64
epochs = 25
linguo = Linguo(embed_dim, voc_size, lstm_dim, hidden_dim) 
optimizer = optim.SGD(linguo.parameters(),lr=0.1)
loss_function = nn.NLLLoss()
learning_rate=0.1

for i in range(epochs):
    epoch_loss = 0
    random.shuffle(labeled_sentences_train)
    for data, label , code in labeled_sentences_train:
        # Restart gradient
        linguo.zero_grad()
        
        # Run model
        in_sentence = prepare_input(word_to_ix,data,nlp)
        target = autograd.Variable(torch.LongTensor([label]))
        prediction = linguo(in_sentence)
        #Calculate loss and backpropagate
        
        #Squared Loss
        #loss = torch.pow(target-prediction.view(1),2)
        loss = loss_function(prediction,target) 
        
        loss.backward()
        optimizer.step()
        #for parameter in linguo.parameters():
        #   parameter.data.sub_(parameter.grad.data*learning_rate)
        epoch_loss += loss.data[0]
    print("{}:{}".format(i,epoch_loss))
        

0:8720.684995666146
1:8715.43458160758
2:8714.38810968399
3:8715.524949222803
4:8717.333831340075
5:8726.007983982563


KeyboardInterrupt: 

KeyError: 'el'

'PROPN'