In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from torch.optim.lr_scheduler import StepLR

import numpy as np
import timeit
import time
import platform
import random
import pickle as pickle
import csv
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

print("Python version: ", platform.python_version())

Python version:  3.5.4


In [52]:
import torch

def num_datapoints(filename):
    """
    Given the name of a CVS file, the function returns the number of rows of data.
    Inputs:
    - filename: name of CVS file
    Returns:
    The number of rows of data
    """
    
    row_count = 0
    
    f = open(filename, 'rt')
    for line in f:
        row_count += 1
        
    f.close()

    return row_count

def num_chars(filename):
    """
    Given the name of a CVS file, the function returns the number of characters in the file.
    Inputs:
    - filename: name of CVS file
    Returns:
    The number of rows of data
    """
    
    char_count = 0
    
    f = open(filename, 'rt')
    for line in f:
        for c in line:
            char_count += 1
        
    f.close()

    return char_count


def extract_characters(filename):
    """
    Given the name of a file, the function returns a list of unique characters in the file.
    Inputs:
    - filename: name of CVS file
    Returns:
    A list of unique character
    """
    characters = []
    f = open(filename, 'rt')

    for line in f:
        for c in line:
            if c not in characters:
                characters.append(c)
    f.close()        
    return characters


def letterToIndex(letter, char_dict):
    """
    Find letter index from all_letters, e.g. "\t" = 0
    Inputs:
    - letter
    - char_dict: A dictionary of all possible characters
    Returns:
    Index of the letter in the dictionary
    """
    return char_dict.index(letter)

def letterToTensor(letter, char_dict):
    """
    Turn a letter into a <1 x len(char_dict)> Tensor representing its one-hot encoding.
    Inputs:
    - letter
    - char_dict: A dictionary of all possible characters
    Returns:
    tensor: One-hot encoding of the letter
    """
    tensor = torch.zeros(1, len(char_dict))
    tensor[0, letterToIndex(letter, char_dict)] = 1
    return tensor.view(1,-1)


def tensorToLetter(tensor, char_dict):
    """
    Turn a one-hot <1 x len(char_dict)> Tensor back to the letter
    Inputs:
    - letter
    - char_dict: A dictionary of all possible characters
    Returns:
    letter: One-hot encoding of the letter
    """
    index = torch.max(tensor.view(1,-1),1)[1]   # force tensor to 1xN dimension
    return char_dict[index.numpy()[0]]


def lineToTensor(line, char_dict):
    """
    Turn a line into an array of one-hot tensors <line_length x 1 x len(char_dict)>
    Inputs:
    - line
    - char_dict: A dictionary of all possible characters
    Returns:
    tensor: an array of one-hot tensors
    """
    tensor = torch.zeros(len(line), len(char_dict))
    for li, letter in enumerate(line):
        tensor[li, letterToIndex(letter, char_dict)] = 1
    return tensor

def display_batch(minibatch, char_dict):
    line = ''
    for tensor in minibatch:
        char = tensorToLetter(tensor, char_dict)
        line = line + char
    return line

def generateCorpus(filename, dictionary):
    """
    Take all the content in a text file and generate a corpus of one-hot encodings.
    Inputs:
    - filename
    - dictionary: A dictionary of all possible characters
    Returns:
    corpus: a tensor of <total_chars, 1, len(dictionary)>
    """
    
    total_chars = num_chars(filename)  # figure out how many char in the file
    corpus = torch.zeros(total_chars, len(dictionary))
    i = 0

    f = open(filename, 'rt')
    for line in f:
        for c in line:
            corpus[i, letterToIndex(c, dictionary)] = 1  # one-hot encoding
            i += 1
            
    f.close()  
    return corpus

def prepare_sequence(batch):
    seq = []
    for i in range(batch.size([0])):
        seq.append(batch[i,:])
        tensor = torch.LongTensor(seq)
    return Variable(tensor)


In [54]:
filename = 'input.txt'

inputfile = open(filename, 'rt')

# explore the content of the input.txt file
i = 0
for line in inputfile:
    if i < 10:
        print (line)
        i += 1
    
inputfile.close()

print (type(line))  # line are read in as strings

print (num_datapoints(filename))  # 18711 rows in the file
print (num_chars(filename))  # 501470 characters in the file

char_dict = sorted(extract_characters(filename))   # 93 unique characters

print (char_dict)
print (len(char_dict))

<start>

X:1

T: La Montfarine

Z:Transcrit et/ou corrig? par Michel BELLON - 2005-07-24

Z:Pour toute observation mailto:galouvielle@free.fr

M: 4/4

L: 1/8

Q:1/4=186

FGF B=AG G=AG F2F FGF {F}F2E EFE|

{E}E2D FGF B=AG G=AG {F}F2F FED C2G D2E|F3 {F}F/2 ED E3/2D/2|

<class 'str'>
18711
501470
['\t', '\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
93


In [55]:
filename = 'input.txt'
corpus = generateCorpus(filename, char_dict)

print (corpus.size())
print(display_batch(corpus[0:100,:], char_dict))

torch.Size([501470, 93])
<start>
X:1
T: La Montfarine
Z:Transcrit et/ou corrig? par Michel BELLON - 2005-07-24
Z:Pour toute o


In [56]:
import torchvision.transforms as transforms

class ChunkSampler(sampler.Sampler):
    """Samples elements sequentially from some offset. 
    Arguments:
        num_samples: # of desired datapoints
        start: offset where we should start selecting from
    """
    def __init__(self, num_samples, start = 0):
        self.num_samples = num_samples
        self.start = start

    def __iter__(self):
        return iter(range(self.start, self.start + self.num_samples))

    def __len__(self):
        return self.num_samples

NUM_TRAIN = 401470
NUM_VAL = 100000

loader_train = DataLoader(corpus, batch_size=32, sampler=ChunkSampler(NUM_TRAIN, 0))
loader_val = DataLoader(corpus, batch_size=32, sampler=ChunkSampler(NUM_VAL, NUM_TRAIN))


In [46]:
cpu_dtype = torch.FloatTensor # the CPU datatype
gpu_dtype = torch.cuda.FloatTensor  # the GPU datatype

class LSTM_ABC(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTM_ABC, self).__init__()
        self.hidden_dim = hidden_dim

        # We encode the character as 1-hot, so skip this step
        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes 1-hot as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim)

        # The linear layer that maps from hidden state space to output (1-hot)
        self.hidden2out = nn.Linear(hidden_dim, output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # We initialize hx and cx
        if torch.cuda.is_available():
            hx = Variable(torch.zeros(1, 1, self.hidden_dim).cuda())
            cx = Variable(torch.zeros(1, 1, self.hidden_dim).cuda())
        else:
            hx = Variable(torch.zeros(1, 1, self.hidden_dim))
            cx = Variable(torch.zeros(1, 1, self.hidden_dim))            
        return (hx,cx)

    def forward(self, sentence):
        # embeds = self.word_embeddings(sentence)  # input is already 1-hot
        out, self.hidden = self.lstm(sentence.view(len(sentence), 1, -1), self.hidden)
        out = self.hidden2out(out.view(len(sentence), -1))
        scores = F.log_softmax(out)  # output log_softmax
        return scores

In [47]:
RNN = LSTM_ABC(93, 100, 93).type(gpu_dtype)

x = torch.randn(32, 93)
x_var = Variable(x.type(gpu_dtype)) # Construct a PyTorch Variable out of your input data

print (x_var)
log_prob = RNN(x_var)        # Feed it through the model! 

print (log_prob.size())

Variable containing:
-0.1730  1.7164 -1.9196  ...   0.4511  0.4606  0.4275
 0.4723 -1.9316 -0.3462  ...   0.6750  0.9762  0.3099
-0.4660  1.4636 -0.8490  ...  -1.1923 -1.1742 -0.8145
          ...             ⋱             ...          
 0.9543  1.7496 -0.4594  ...  -0.2349 -0.0735  0.6030
 0.1273  0.6579 -0.0142  ...  -1.5914 -0.0784  1.7125
-1.0749 -0.5387 -0.8794  ...   0.6246 -0.0168 -0.7994
[torch.cuda.FloatTensor of size 32x93 (GPU 0)]

torch.Size([32, 93])


In [49]:
n_epochs = 2000
print_every = 1
plot_every = 10
input_dim = 93
output_dim = 93
hidden_dim = 100
# n_layers = 1
lr = 0.001
verbose = False

# Create an instance of the LSTM and set optimizer and loss function
model = LSTM_ABC(input_dim, hidden_dim, output_dim).type(gpu_dtype)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# The output of the model is softmax log prob, so the loss function is 
# NLL (negative log likelihood)
loss_function = nn.NLLLoss()   

all_losses = []  # keep track of loss history
loss_avg = 0

for epoch in range(1, n_epochs + 1):
    # loss = train(*random_training_set())   
    
    loss = 0

    for t, sentence in enumerate(loader_train):
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        hidden = model.init_hidden()
       
        # Step 2. Get our inputs and targets ready for the network
        # input will be fed character by character into LSTM
        inp = sentence[:-1,:]
        if verbose:
            print(display_batch(inp, char_dict)) 
        # LSTM output will be compared against the target, which is 
        target = sentence[1:,:].type(torch.LongTensor)  
        if verbose:
            print(display_batch(target, char_dict))
        
        # Step 3. Run our forward pass.
        target = target.cuda()
        scores = model(Variable(inp.type(gpu_dtype)))
        if verbose:
            print (scores.size())
            print (target.size())
            
       # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        target_index = torch.max(target,1)[1]  # NLLoss accept index as target, not one-hot
        loss = loss_function(scores, Variable(target_index))
        loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm(model.parameters(), 50)
        optimizer.step()
        
        if t%10 == 0:
            print ("batch #:{}   loss: {} ".format(t, loss.data[0]/len(sentence)))
    
        loss_avg += loss.data[0] / len(sentence)

    if epoch % print_every == 0:
        print('[(%d %d%%) %.4f]' % (epoch, epoch / n_epochs * 100, loss))

    if epoch % plot_every == 0:
        all_losses.append(loss_avg / plot_every)
        loss_avg = 0

batch #:0   loss: 0.14189542829990387 
batch #:10   loss: 0.14144934713840485 
batch #:20   loss: 0.13604839146137238 
batch #:30   loss: 0.13852986693382263 
batch #:40   loss: 0.10861711949110031 
batch #:50   loss: 0.1245763823390007 
batch #:60   loss: 0.12714898586273193 
batch #:70   loss: 0.0837089940905571 
batch #:80   loss: 0.08174295723438263 
batch #:90   loss: 0.08606889843940735 
batch #:100   loss: 0.11090567708015442 
batch #:110   loss: 0.09647621214389801 
batch #:120   loss: 0.13509522378444672 
batch #:130   loss: 0.11214707046747208 
batch #:140   loss: 0.11632825434207916 
batch #:150   loss: 0.11059252917766571 
batch #:160   loss: 0.09504669904708862 
batch #:170   loss: 0.0892217829823494 
batch #:180   loss: 0.09378959238529205 
batch #:190   loss: 0.07677535712718964 
batch #:200   loss: 0.10473010689020157 
batch #:210   loss: 0.09479682147502899 
batch #:220   loss: 0.0707516223192215 
batch #:230   loss: 0.0728142261505127 
batch #:240   loss: 0.0848822146

batch #:1990   loss: 0.0883973240852356 
batch #:2000   loss: 0.0725988820195198 
batch #:2010   loss: 0.08308902382850647 
batch #:2020   loss: 0.0676867887377739 
batch #:2030   loss: 0.06909359991550446 
batch #:2040   loss: 0.08613201230764389 
batch #:2050   loss: 0.08782829344272614 
batch #:2060   loss: 0.06889494508504868 
batch #:2070   loss: 0.0725112184882164 
batch #:2080   loss: 0.04392377659678459 
batch #:2090   loss: 0.07420545071363449 
batch #:2100   loss: 0.04618201404809952 
batch #:2110   loss: 0.08181831240653992 
batch #:2120   loss: 0.06723026186227798 
batch #:2130   loss: 0.06005096435546875 
batch #:2140   loss: 0.07263901829719543 
batch #:2150   loss: 0.06917593628168106 
batch #:2160   loss: 0.06018983945250511 
batch #:2170   loss: 0.050410106778144836 
batch #:2180   loss: 0.03736766800284386 
batch #:2190   loss: 0.08210954070091248 
batch #:2200   loss: 0.060390621423721313 
batch #:2210   loss: 0.0630074068903923 
batch #:2220   loss: 0.06097196415066

KeyboardInterrupt: 

In [61]:
def evaluate(prime_str='X', predict_len=100, temperature=0.8):
    hidden = model.init_hidden()
    
    prime_input = letterToTensor(prime_str, char_dict)
    predicted = prime_str
    
    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        _ = model(prime_input[p])
     
    inp = prime_input   # The prime character is input to LSTM
    
    for p in range(predict_len):
        output = model(Variable(inp.type(gpu_dtype)))
                
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)[0]

        # Add predicted character to string and use as next input
        predicted_char = char_dict[top_i]
        predicted += predicted_char
        
        inp = letterToTensor(predicted_char, char_dict)

    return predicted

In [64]:
predicted = evaluate()

print(predicted)

X:116
L:1/8
O:1
R:3e"stoderit on coutielsove?.er tonceltames pari? sarnscitem(itomte paisle 12e cELLO
