In [1]:
# This is a first attempt at a rudimentary sentence grammaticality classifier 
# It is meant as a feasibility test for the larger experiment and as a learning
# Experience for future endeavours

# Standard pytorch imports 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

#spacy model
import spacy
from spacy.tokenizer import Tokenizer
#load Spacy spanish model to handle tokenization of toy data
nlp = spacy.load('es_core_news_sm')

#other utilities
import numpy as np
from numpy.random import choice
import random
from collections import defaultdict
import math
from string import capwords

In [15]:
# Handles importing data, default version takes mini toy, further will take full sentences.

#Manual seed for consistency
random.seed(42)
#This is the kid version
# real_text = ["Los acontecimientos tienen lugar en una galaxia ficticia de nombre desconocido y en un tiempo no especificado.", "Además de la raza humana, son descritos muchos tipos de especies extraterrestres procedentes de los numerosos planetas y satélites que forman dicha galaxia y pertenecen a la alianza de planetas de la República Galáctica.", "Otros personajes recurrentes son los robots y los androides, creados generalmente para servir a un propósito, observándose así droides astromecánicos, médicos, de protocolo, de combate, entre otros.",
# "Los viajes espaciales son comunes y la mayoría de los planetas que aparecen en la saga están afiliados a la República Galáctica, la unión democrática que rige la galaxia y cuyo gobierno, presidido por un Canciller Supremo, está formado por representantes elegidos o designados de toda ella agrupados en el llamado Senado Galáctico, ubicado en el planeta Coruscant.", "En oposición a la República se encuentra la Confederación de Sistemas Independientes, siendo el enfrentamiento de ambas uno de los temas más importantes en la trama de las tres primeras películas de Star Wars.", "Uno de los elementos principales en la saga es «la Fuerza», un campo de energía metafísico y omnipresente creado por las cosas que existen, que impregna el universo y todo lo que hay en él.", "La Orden Jedi es una organización de caballeros unidos por su creencia y percepción de la Fuerza, que luchan por la paz y la justicia en la República Galáctica.","Se entrenan en el uso del sable de luz o espada láser, un arma similar a una espada tradicional salvo por el hecho que su hoja es un haz de energía.",
# "Los Jedi son capaces de manejar la Fuerza y lograr así habilidades como la telequinesis, la clarividencia, el control mental o una amplificación de los reflejos, la velocidad y otras capacidades físicas.", "No obstante y aunque dicho grupo la utiliza con fines positivos, tiene un lado oscuro provocado por la ira, el miedo y el odio.", "Este lado es usado por los sith con el fin de exterminar a los jedi y tomar el control de la Galaxia."]

#More real version that reads in a corpus file
ngram_order = 2


input_corpus_filename = "euro.toy"
input_corpus_path = "Data/"+input_corpus_filename
in_file = open(input_corpus_path, "r")
real_text = []
numlines = 0
inter_excl=0
for line in in_file.readlines():
    #Keep only sentences, those have a period at the end (is support for ? and ! needed??)
    if line.strip() !="":
        if line.strip()[-1] == ".":
            real_text.append(line.strip())
        elif line.strip()[-1] == "?" or line.strip()[-1] == "!":
            inter_excl +=1
    numlines+=1

print("Full corpus has {} sentences, {} were dumped, among which {} interogatives or exclamatives".format(
                            len(real_text),numlines-len(real_text),inter_excl))

proportion_train = 0.8
cutoff = math.floor(len(real_text)*proportion_train)
random.shuffle(real_text)

real_train , real_test = real_text[:cutoff], real_text[cutoff:]

# Process the input sentences (for tokenization, tokenizer sucks otherwise)
parsed_real_train = [nlp(sentence) for sentence in real_train]
parsed_real_test = [nlp(sentence) for sentence in real_test]

#Extract the statististical info needed to generate unigram word salad
#Calculate average sentence length
lengths= [len(sent) for sent in parsed_real_train]
avg_sent_length = np.mean(lengths)
length_sd = np.std(lengths)
counts = defaultdict(int)
total = 0.0
for sentence in parsed_real_train:
    for token in sentence:
        if token.text != ".":
            counts[token.text.lower()] +=1
            total += 1

# TODO: implement a version where low frequency words are replaced by their tag
#Switch happaxes for the UNK token
hapaxes = []
counts["#unk"]=0
for key in counts:
    if counts[key] == 1:
        counts["#unk"] += 1
        hapaxes.append(key)

for hapax in hapaxes:
    counts.pop(hapax)

vocabulary = []
probdist = []
for key in counts:
    vocabulary.append(key)
    probdist.append(counts[key])


# In tokenized we only retain the sentences as an array of words and we implement the replacements.
# We pre tokenize test data taking only the hapaxes from train out 

# Method to extract n-gram frequencies from the corpus
# Corpus is a list of sentences, each sentence represented by a list of tokens
def extract_ngram_freq(corpus,order):
    n_frequencies = defaultdict(lambda:defaultdict(int))
    for sentence in corpus:
        for _ in range(order-1):
            sentence.insert(0,"#")
        for ini in range(len(sentence) - order ):
            prefix = " ".join(sentence[ini:ini+order-1])
            target = sentence[ini+order-1]
            n_frequencies[prefix][target]+= 1
    return n_frequencies



def token_replacement(parsed_sentences, hapaxes):
    # Takes a list of sentences that have gone through the spacy pipeline
    # (sentences have Doc type)
    # Returns a list of sentences, each of which is a list of words (str)
    # Words specified in hapaxes are replaced by UNK
    # TODO: implement a version that replaces words by their tag instead
    tokenized = []
    for sentence in parsed_sentences:
        this_sentence = []
        for token in sentence:
            if token.text.lower() in hapaxes:
                this_sentence.append("#UNK")
            else:
                this_sentence.append(token.text)
        tokenized.append(this_sentence)
    return tokenized

# Get the sentences represented as lists of words
tokenized_real_train = token_replacement(parsed_real_train, hapaxes)
tokenized_real_test = token_replacement(parsed_real_test, hapaxes)

#Extract n-gram frequencies



def generateWSuni(vocab, probdist, avg_length,sd):
    # Method to generate one word salad sentence usin unigram distribution
    # Vocab is a list of vocabulary words
    # probdist contains the probabilities of vocabulary words in same order
    # avg_length is the average length of sentences
    # sd is the standar deviation for the legths of sentences
    
    #Draw the length
    length= math.floor(random.gauss(avg_length, sd))
    if length<6:
        length = 6
    #Draw the words
    draw= choice(vocab, length, probdist).tolist()
    #Assemble the sentence
    sentence = [capwords(draw.pop(0))]
    while draw:
        next_word = draw.pop(0)
        #special case for punctuation that needs to be closed
        if next_word in ["(","«"]:
            try:
                sentence.append(next_word) 
                sentence.append(draw.pop(0))
                closing = ""
                if next_word == "(":
                    closing = ")"
                elif next_word == "«":
                    closing = "»"
                draw.insert(random.randint(0,len(draw)),closing)
            except IndexError:
                break
        elif next_word not in [")","»"]:
            sentence.append(next_word)
    sentence.append(".")
    return sentence

def generateWSNgram(n_frequencies, avg_length,sd,order , unicounts):
    # Method to generate one word salad sentence usin unigram distribution
    # Vocab is a list of vocabulary words
    # probdist contains the probabilities of vocabulary words in same order
    # avg_length is the average length of sentences
    # sd is the standar deviation for the legths of sentences
    
    #Draw the length
    length= math.floor(random.gauss(avg_length, sd))
    if length<6:
        length = 6
    
    sentence = ["#"]*(order-1)
    for i in range(length+order-1):
        prefix = " ".join(sentence[-(order-1):])
        try:
            vocab, freqs = zip(*n_frequencies[prefix].items())
            word = choice(vocab,1,freqs)[0]
            sentence.append(word)
        except:
            vocab, freqs = zip(*unicounts.items())
            word = choice(vocab, 1, freqs)[0]
            sentence.append(word)
    sentence.append(".")
    return sentence

n_gram_frequencies = extract_ngram_freq(tokenized_real_train,ngram_order)

# get a list of word salads the same length as the real test data    
word_salads_train = [generateWSNgram(n_gram_frequencies, 
                          avg_sent_length,length_sd, ngram_order,counts) for _ in range(len(tokenized_real_train))]
word_salads_test = [generateWSNgram(n_gram_frequencies, 
                          avg_sent_length,length_sd, ngram_order,counts) for _ in range(len(tokenized_real_test))]

#Consolidate training data
labeled_sentences_train = [[sentence, 1] for sentence in tokenized_real_train]
labeled_sentences_train += [[sentence, 0] for sentence in word_salads_train]
random.shuffle(labeled_sentences_train)

#Consolidate test data
labeled_sentences_test = [[sentence, 1] for sentence in tokenized_real_test]
labeled_sentences_test += [[sentence, 0] for sentence in word_salads_test]
random.shuffle(labeled_sentences_test)

#Define the vocabulary and word ids
vocabulary.append(".")

word_to_ix = {}
for word in vocabulary:
    word_to_ix[word] = len(word_to_ix)
    

#Saving the Corpus
training_corpus_fn = "Data/" + input_corpus_filename + ".labeled.training"
testing_corpus_fn = "Data/" + input_corpus_filename + ".labeled.testing"


def save_corpus(data,filename):
    out_file = open(filename,"w")
    for instance in data:
        words = " ".join(instance[0])
        label = str(instance[1])
        out = words + "|" + label + "\n"
        out_file.write(out)
    out_file.close()

#save_corpus(labeled_sentences_train, training_corpus_fn)
#save_corpus(labeled_sentences_test, testing_corpus_fn)

loaded_corpus = True

print("Done, you now have {} train instances and {} test instancess:".format(len(labeled_sentences_train),len(labeled_sentences_test)))

Full corpus has 18582 sentences, 1418 were dumped, among which 853 interogatives or exclamatives
Done, you now have 29730 train instances and 7434 test instancess:


In [None]:
# Alternatively load an existing corpus
loaded_corpus = True
if loaded_corpus == False:
    #Put your corpus filename here
    input_corpus_filename = "mini.toy"
    
    training_corpus_fn = "Data/" + input_corpus_filename + ".labeled.training"
    testing_corpus_fn = "Data/" + input_corpus_filename + ".labeled.testing"

    def load_corpus(filename):
        in_file = open(filename,"r")
        labeled_data = []
        for line in in_file.readlines():
            words_str , label = line.rstrip().split("|")
            words_list = words_str.split(" ")
            instance = [words_list, int(label)]
            labeled_data.append(instance)
        return labeled_data

    labeled_sentences_train = load_corpus(training_corpus_fn)
    labeled_sentences_test = load_corpus(testing_corpus_fn)

    print("Done, you now have {} train instances and {} test instancess:".format(len(labeled_sentences_train),len(labeled_sentences_test)))

In [11]:
# Now we define the Neural network


class Linguo(nn.Module):
    def __init__(self,embedding_dim, vocab_size, lstm_dim , hidden_dim):
        super(Linguo,self).__init__()
        # Store the hidden layer dimension
        self.hidden_dim = hidden_dim
        # Define word embeddings
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Define LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #Define hidden linear layer
        self.hidden2dec = nn.Linear(hidden_dim, 2)
        #Define the hidden state
        self.hstate = self.init_hstate()
        
    def forward(self, inputsentence):
        linguo.hstate = linguo.init_hstate()
        embeds = self.word_embeddings(inputsentence)
        lstm_out, self.hstate = self.lstm(embeds.view(len(inputsentence),1, -1), self.hstate)
        decision_lin = self.hidden2dec(lstm_out[-1])
        #print(decision_lin)
        decision_fin = F.log_softmax(decision_lin)
        return decision_fin
        
        
    def init_hstate(self):
        var1 = autograd.Variable(torch.zeros(1, 1, self.hidden_dim)) 
        var2 = autograd.Variable(torch.zeros(1, 1, self.hidden_dim))
        hidden_state = (var1, var2)
        return hidden_state
        
        
def prepare_input(word_to_ix, sentence):
    idxs = []
    for word in sentence:
        if word in word_to_ix:
            idxs.append(word_to_ix[word.lower()])
        else:
            idxs.append(word_to_ix["#unk"])
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
     


In [12]:
# Training time! Cue Eye of the Tiger
embed_dim = 32
lstm_dim = 32
voc_size = len(word_to_ix)
hidden_dim = 32
epochs = 5
linguo = Linguo(embed_dim, voc_size, lstm_dim, hidden_dim) 
optimizer = optim.SGD(linguo.parameters(),lr=0.1)
loss_function = nn.NLLLoss()
learning_rate=0.1

for i in range(epochs):
    epoch_loss = 0
    random.shuffle(labeled_sentences_train)
    for data, label in labeled_sentences_train:
        # Restart gradient
        linguo.zero_grad()
        
        
        # Run model
        in_sentence = prepare_input(word_to_ix,data)
        target = autograd.Variable(torch.LongTensor([label]))
        prediction = linguo(in_sentence)
        #Calculate loss and backpropagate
        
        #Squared Loss
        #loss = torch.pow(target-prediction.view(1),2)
        loss = loss_function(prediction,target) 
        
        loss.backward()
        optimizer.step()
        #for parameter in linguo.parameters():
        #   parameter.data.sub_(parameter.grad.data*learning_rate)
        epoch_loss += loss.data[0]
    print("{}:{}".format(i,epoch_loss))
        

0:915.3221575738862
1:698.9492537204642
2:554.0780851221061
3:474.1894234545616
4:361.24798116292686


In [13]:
correct = 0
salads =[]
for testcase in labeled_sentences_test:
    prepared_inputs = prepare_input(word_to_ix, testcase[0] )
    prediction_vec = linguo(prepared_inputs).view(2)
    if prediction_vec.data[0] > prediction_vec.data[1]:
        prediction = 0
    else:
        prediction = 1
    if prediction == testcase[1]:
        correct += 1

#Summary:        
outtable ="""Corpus: {corpus}
Ngram-Order: {n} 
Embedding dimension: {embed}
LSTM dimension: {lstm}
Hidden Dimension: {hidden}
Number of Epochs: {epoch}
Final loss:{loss}""".format(
                            corpus = input_corpus_filename,
                            embed= embed_dim,
                            lstm= lstm_dim,
                            hidden= hidden_dim,
                            epoch= epochs,
                            loss= epoch_loss,
                            n=ngram_order)
print (outtable)
print("Accuracy: {}".format(correct/len(labeled_sentences_test)))

Corpus: euro.mini
Ngram-Order: 6 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 5
Final loss:361.24798116292686
Accuracy: 0.8552631578947368


Corpus: euro.mini
Ngram-Order: 4 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 5
Final loss:504.7399406191398
Accuracy: 0.7973684210526316


Corpus: euro.mini
Ngram-Order: 3 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 10
Final loss:232.77821091443258
Accuracy: 0.6868421052631579

Corpus: euro.mini
Ngram-Order: 3 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 5
Final loss:522.7681364201596
Accuracy: 0.7236842105263158

Corpus: euro.mini
Ngram-Order: 2 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 5
Final loss:363.4900658682709
Accuracy: 0.8184210526315789

Corpus: euro.mini
Ngram-Order: 6 
Embedding dimension: 32
LSTM dimension: 32
Hidden Dimension: 32
Number of Epochs: 5
Final loss:361.24798116292686
Accuracy: 0.8552631578947368



Type           | Corpus    |Corpus Size | Embed | LSTM | Hidden | Epochs | Loss   |Accuracy | 
:--------------|:----------|:----------:|:-----:|:----:|:------:|:------:|:------:|:-------:|
Gram Vs Unigram|euro.mini  | 1514/380   | 10    | 10   | 10     |   3    | 51     | 0.97    |
Gram Vs Unigram|euro.mini  | 1514/380   | 32    | 32   | 64     |   50   | 0.034  | 0.98    |
Gram Vs Unigram|euro.toy   | 29730/7434 | 32    | 64   | 64     |   25   | 0.06   | 0.99    |
Gram Vs Bigram |euro.mini  | 1514/380   | 10    | 10   | 10     |   3    | 718    | 0.81    |
Gram Vs 3-gram |euro.mini  | 1514/380   | 10    | 10   | 10     |   3    | 840.8  | 0.67    |
Gram Vs 3-gram |euro.mini  | 1514/380   | 10    | 10   | 10     |   5    | 673    | 0.75    |
Gram Vs 3-gram |euro.mini  | 1514/380   | 10    | 10   | 10     |   5    | 490.9  | 0.75    |

In [None]:
correct = 0
salads =[]
for testcase in labeled_sentences_test:
    prepared_inputs = prepare_input(word_to_ix, testcase[0] )
    prediction_vec = linguo(prepared_inputs).view(2)
    if prediction_vec.data[0] > prediction_vec.data[1]:
        prediction = 0
    else:
        prediction = 1
    if prediction == testcase[1]:
        correct += 1

#Summary:        
outtable ="""Corpus: {corpus}
Embedding dimension: {embed}
LSTM dimension: {lstm}
Hidden Dimension: {hidden}
Number of Epochs: {epoch}
Final loss:{loss}""".format(
                            corpus = input_corpus_filename,
                            embed= embed_dim,
                            lstm= lstm_dim,
                            hidden= hidden_dim,
                            epoch= epochs,
                            loss= epoch_loss)
print (outtable)
print("Accuracy: {}".format(correct/len(labeled_sentences_test)))

In [None]:
#Reminder for the summary


In [None]:
# Saving the model
modelfilename= "Models/{corpus}.{embed}emb.{lstm}lstm.{hidden}hid.{epoch}ep.model".format(
                            corpus = input_corpus_filename,
                            embed= embed_dim,
                            lstm= lstm_dim,
                            hidden= hidden_dim,
                            epoch= epochs)
torch.save(linguo.state_dict(), modelfilename)

In [None]:
# Section reserver to save the data





In [None]:
examplefile = open("example","w")

In [None]:
batch_size = 2
fresh_instances = labeled_sentences_train[:]
# Get batches ready
num_batches = math.floor(len(fresh_instances)/batch_size)
batches = []
for i in range(num_batches):
    batch_data = []
    batch_labels = []
    for j in range(batch_size):
        instance = fresh_instances.pop()
        data_tensor = prepare_input(word_to_ix, instance[0])
        batch_data.append(data_tensor)
        print(data_tensor.size())
        batch_labels.append(int(instance[1]))
    batch_var = torch.stack(batch_data)
    batch_labels_var = autograd.Variable(torch.IntTensor(batch_labels))
    batches.append(batch_var)

In [None]:

corp = [["Yo","como","mucha","pizza"], ["Yo","tengo","mucha","hambre"]]

corp[0].insert(0,"#")
corp[0].insert(len(corp[0]),"#")

fdist = extract_ngram_freq(corp,2)
generateWSNgram(fdist,3,0.1,2)

In [None]:
firstWord = choice(fwvocab,1,fwfreqs)[0]

In [None]:
firstWord