In [None]:
# Colab-only setup: install a PyTorch 0.4.1 wheel that matches this interpreter.
# There is no need to run this cell on a local machine.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel tag for the running interpreter, formatted as "<impl><version>-<abi>".
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Select the 'cu80' build when nvidia-smi is present (presumably the CUDA 8.0
# wheel — confirm), otherwise the CPU-only build.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision is not
# version-pinned — confirm this is acceptable before reusing the cell.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [1]:
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

In [2]:
# Seed PyTorch's global random number generator for reproducibility.
torch.manual_seed(3)

<torch._C.Generator at 0x114266650>

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# batchify(), the model and the generation cell all move their tensors to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [38]:
# Hyperparameters and paths shared by the cells below.

# Batch sizes: number of parallel columns produced by batchify().
train_batch_size = 10
eval_batch_size = 10

# Model and optimisation settings.
emsize = 200 # size of the word embeddings
nhid = 200 # number of hidden units per LSTM layer (must equal emsize when tied is True)
nlayers = 2 # number of stacked LSTM layers
dropout = 0.5 # dropout probability used inside the model
lr = 20 # initial learning rate; divided by 4 whenever validation loss does not improve
clip = 0.25 # maximum gradient norm passed to clip_grad_norm_
epochs = 5 # number of training epochs
log_interval = 200 # number of batches (or generated words) between progress messages
tied = True # share the embedding matrix with the output layer
bptt = 35 # length of the sequence chunks returned by get_batch()
export_path_model = 'Trump_NLG_model.pt' # file the best model is saved to

## Data preparation

In [5]:
class Dictionary(object):
    """Bidirectional vocabulary: word -> integer id and integer id -> word.

    Ids are assigned in order of first appearance, starting at 0.
    """

    def __init__(self):
        self.word2idx = {}   # word -> id
        self.idx2word = []   # id -> word (list position is the id)

    def add_word(self, word):
        """Register ``word`` if it is new and return its id."""
        try:
            return self.word2idx[word]
        except KeyError:
            # First time we see this word: append it and use its position as id.
            self.idx2word.append(word)
            new_index = len(self.idx2word) - 1
            self.word2idx[word] = new_index
            return new_index

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Tokenized training and validation text sharing a single vocabulary.

    Attributes:
        dictionary: vocabulary filled while tokenizing the files.
        train: 1-D LongTensor of word ids for ``train.txt``.
        valid: 1-D LongTensor of word ids for ``valid.txt``.
    """

    def __init__(self, path):
        """Tokenize ``train.txt`` and ``valid.txt`` located in directory ``path``."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        # self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Each line is split on whitespace and terminated with an ``'<eos>'``
        token. Unseen words are added to ``self.dictionary`` as they appear.

        Args:
            path: path of a UTF-8 encoded text file.

        Returns:
            A 1-D ``torch.long`` tensor holding the id of every token, in order.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(path):
            raise FileNotFoundError('Corpus file not found: {}'.format(path))

        # Single pass: add_word() returns the id for both new and known words,
        # so the vocabulary and the id sequence are built together.
        ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        return torch.tensor(ids, dtype=torch.long)

In [6]:
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e.g. 'g' on 'f' cannot be learned, but allows more efficient
# batch processing.

In [24]:
# Download the training corpus archive.
# NOTE(review): the archive appears to contain only train.txt and valid.txt
# (no test.txt) — confirm against the unzip listing.
# Earlier archive locations, kept for reference:
# ! wget https://www.dropbox.com/s/mrtpbpohnrsl3hc/data.zip
# ! wget https://www.dropbox.com/s/5hsrba4led6mzjw/Trump_NLG.zip
! wget https://www.dropbox.com/s/n71az5lgaz7xt8v/Trump_NLG.zip

--2018-11-10 12:52:39--  https://www.dropbox.com/s/n71az5lgaz7xt8v/Trump_NLG.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.67.1
Connecting to www.dropbox.com (www.dropbox.com)|162.125.67.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/n71az5lgaz7xt8v/Trump_NLG.zip [following]
--2018-11-10 12:52:39--  https://www.dropbox.com/s/raw/n71az5lgaz7xt8v/Trump_NLG.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3cddd5be04739aa86848581894.dl.dropboxusercontent.com/cd/0/inline/AVO5r7eHllXP8xwaqxWCBA6wtsADbg1HRnxO0-IWprEFhotDOqP7ahdBPSAe1Kx1Uxa7YJfaWPP_UUpnfyzX_wLRoRGW5KcLb88u3v5BgahPd4iSFH8yHbjJH_49TRGe_IFEPtDUsfSLH6BQIYl9hGzFoayZGcX_qNOOU9FxuGO7zoqPqdggHBFBmUmGUTGp31U/file [following]
--2018-11-10 12:52:39--  https://uc3cddd5be04739aa86848581894.dl.dropboxusercontent.com/cd/0/inline/AVO5r7eHllXP8xwaqxWCBA6wtsADbg1HRnxO0-IWprEFhotDOqP7ahdBPSAe1Kx1Uxa7YJfaW

In [25]:
# Extract the corpus files from the downloaded archive.
! unzip Trump_NLG.zip

Archive:  Trump_NLG.zip
  inflating: train.txt               
   creating: __MACOSX/
  inflating: __MACOSX/._train.txt    
  inflating: valid.txt               
  inflating: __MACOSX/._valid.txt    


In [26]:
# Corpus() looks for train.txt and valid.txt inside this directory.
data_path = os.getcwd()

In [27]:
# TODO: possibly drop rare words?
# TODO: start from a pre-trained embedding (GloVe?)

In [28]:
# Build the vocabulary and tokenize train.txt / valid.txt found in data_path.
corpus = Corpus(data_path)

In [29]:
def batchify(data, bsz):
    """Reshape a flat token sequence into ``bsz`` parallel columns.

    Trailing tokens that do not fill a complete row across all columns are
    dropped. The result has one column per batch entry and is moved to the
    global ``device``.
    """
    # Number of rows each of the bsz columns will hold.
    rows_per_column = data.size(0) // bsz
    # Keep only the prefix that divides evenly into bsz columns.
    usable = data[:rows_per_column * bsz]
    # Lay the data out as bsz contiguous runs, then make each run a column.
    columns = usable.view(bsz, -1).t()
    return columns.contiguous().to(device)

In [30]:
# Arrange the token streams into columns (one column per batch entry).
train_data = batchify(corpus.train, train_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
# test_data = batchify(corpus.test, eval_batch_size)

In [31]:
# Vocabulary size: determines the embedding table and output layer dimensions.
ntokens = len(corpus.dictionary)
# Loss applied to the model's output scores against the target word ids.
criterion = nn.CrossEntropyLoss()

In [32]:
class RNNModel(nn.Module):
    """Language model: embedding encoder -> stacked LSTM -> linear decoder.

    Args:
        ntoken: vocabulary size.
        emsize: dimensionality of the word embeddings.
        nhid: number of hidden units per LSTM layer.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers and to the LSTM output.
        tie_weights: share the embedding matrix with the decoder weights
            (requires ``nhid == emsize``).

    Raises:
        ValueError: if ``tie_weights`` is set and ``nhid != emsize``.
    """

    def __init__(self, ntoken, emsize, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)
        # Maps each word id to a dense vector of size emsize.
        self.encoder = nn.Embedding(ntoken, emsize)
        self.rnn = nn.LSTM(emsize, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optional weight tying, see:
        #   "Using the Output Embedding to Improve Language Models"
        #   (Press & Wolf 2016), https://arxiv.org/abs/1608.05859
        #   "Tying Word Vectors and Word Classifiers: A Loss Framework for
        #   Language Modeling" (Inan et al. 2016), https://arxiv.org/abs/1611.01462
        if tie_weights and nhid != emsize:
            raise ValueError('When using the tied flag, nhid must be equal to emsize')
        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Uniformly initialise encoder/decoder weights in [-0.1, 0.1); zero the decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.constant_(self.decoder.bias, 0)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, input, hidden):
        """Run one chunk through the network.

        Returns a tuple ``(scores, hidden)`` where ``scores`` keeps the first
        two dimensions of the LSTM output and has the vocabulary size as its
        last dimension, and ``hidden`` is the updated LSTM state.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)

        dim0, dim1, features = rnn_out.size()
        # The decoder is applied to every position at once on a 2-D view.
        scores = self.decoder(rnn_out.view(dim0 * dim1, features))
        return scores.view(dim0, dim1, scores.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero (h, c) LSTM state for ``bsz`` parallel sequences."""
        # Borrow dtype/device from an existing parameter.
        reference = next(self.parameters())
        state_shape = (self.nlayers, bsz, self.nhid)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

In [33]:
# Instantiate the language model with the configured hyperparameters and move it to the selected device.
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout, tied).to(device)

### Train the model


In [34]:
def repackage_hidden(h):
    """Return a copy of the hidden state that is cut off from its autograd history.

    ``h`` may be a single tensor or an arbitrarily nested iterable of tensors;
    nested containers come back as tuples.
    """
    if not isinstance(h, torch.Tensor):
        # Container: detach every element recursively.
        return tuple(map(repackage_hidden, h))
    return h.detach()

# get_batch subdivides the source data into chunks of length bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘

# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Return the input chunk starting at row ``i`` and its flattened targets.

    The chunk spans at most ``bptt`` rows (fewer near the end of ``source``).
    The targets are the same rows shifted one step forward, flattened to 1-D.
    """
    # Leave one row at the end so every input row has a following target row.
    rows_left = len(source) - 1 - i
    seq_len = min(bptt, rows_left)
    stop = i + seq_len
    inputs = source[i:stop]
    targets = source[i + 1:stop + 1]
    return inputs, targets.view(-1)


def evaluate(data_source):
    """Compute the average loss of the global ``model`` on ``data_source``.

    Args:
        data_source: batchified tensor (rows are time steps, columns are the
            parallel sequences), as produced by ``batchify``.

    Returns:
        The mean value of ``criterion`` per target row, as a Python float.

    Raises:
        ValueError: if ``data_source`` has fewer than two rows, so that no
            (input, target) pair can be formed.

    Side effects:
        Leaves ``model`` in evaluation mode.
    """
    # Every row except the last one is used as input and has a target row,
    # so there are len(data_source) - 1 scored rows in total.
    n_target_rows = data_source.size(0) - 1
    if n_target_rows < 1:
        raise ValueError('evaluate() needs at least two rows in data_source')

    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    # Size the hidden state from the data itself rather than from a global
    # batch size, so any batchified tensor can be evaluated.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, n_target_rows, bptt):  # iterate over chunks
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            # The last output dimension is the vocabulary size.
            output_flat = output.view(-1, output.size(-1))
            # criterion averages over the chunk; weight by the chunk length
            # so that the shorter final chunk does not count as a full one.
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    # Normalise by the number of rows actually scored (previously divided by
    # len(data_source), which slightly under-reported the loss).
    return total_loss / n_target_rows


In [35]:
def train():
    """Run one epoch of training over the global ``train_data``.

    Performs plain SGD with gradient-norm clipping on the global ``model``,
    using the global learning rate ``lr``. Progress is printed every
    ``log_interval`` batches; the message reads the global ``epoch`` set by
    the surrounding training loop, so this function is meant to be called
    from that loop.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    # Size the hidden state from the data so it always matches the batches.
    hidden = model.init_hidden(train_data.size(1))
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        # The last output dimension is the vocabulary size.
        loss = criterion(output.view(-1, output.size(-1)), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            # Parameters that took no part in the loss have no gradient.
            if p.grad is None:
                continue
            # Manual SGD step. Written without the deprecated
            # add_(Number, Tensor) overload so it works across PyTorch versions.
            p.data.add_(-lr * p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [None]:
# Loop over epochs, keeping the checkpoint with the lowest validation loss.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a best loss of 0.0 is falsy and
        # would otherwise be treated as "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(export_path_model, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model, if one exists. Training may have been interrupted
# before the first checkpoint was written; in that case keep the in-memory model.
if os.path.exists(export_path_model):
    # NOTE(review): torch.load unpickles the whole module, so only load files
    # you trust. Newer PyTorch releases may also require an explicit
    # `weights_only=False` to load a full pickled model — confirm for the
    # version in use.
    with open(export_path_model, 'rb') as f:
        model = torch.load(f)
        # after load the rnn params are not a continuous chunk of memory
        # this makes them a continuous chunk, and will speed up forward pass
        model.rnn.flatten_parameters()
else:
    print('No saved model found at {}; keeping the current model'.format(export_path_model))

    
#commenting this because here, we don't care much about the test loss. We just want to generate some text.
# Run on test data.
#test_loss = evaluate(test_data)
#print('=' * 89)
#print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
 #   test_loss, math.exp(test_loss)))
#print('=' * 89)

In [49]:
# Optional: download a pre-trained model instead of training one locally.
# Earlier model location, kept for reference:
# ! wget https://www.dropbox.com/s/s5dp5rse2by67jy/model.pt
! wget https://www.dropbox.com/s/ne6osopwbc60u1z/Trump_NLG_model.pt

--2018-11-10 13:18:57--  https://www.dropbox.com/s/ne6osopwbc60u1z/Trump_NLG_model.pt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.67.1
Connecting to www.dropbox.com (www.dropbox.com)|162.125.67.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/ne6osopwbc60u1z/Trump_NLG_model.pt [following]
--2018-11-10 13:18:58--  https://www.dropbox.com/s/raw/ne6osopwbc60u1z/Trump_NLG_model.pt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc29c959fefa88260e4bb83ec9f.dl.dropboxusercontent.com/cd/0/inline/AVMCJsHHnetEPrmaqHdNVJWiHaWYCoMoI_MzYyREmUJGY5XH9VA87jpytV-lBpNnhtUNh317pLj9LLmgJNFVCj-yIwl9bFpRo5DEb4MRwtEVfMWLL5rYalQ1v94VV8a2zZlhc9UWdTDGmVnMseo2nhRm1WvLQ1s0gsjoBltRtQdER9PCqIPnr6F0oPKSapqqT6E/file [following]
--2018-11-10 13:18:58--  https://ucc29c959fefa88260e4bb83ec9f.dl.dropboxusercontent.com/cd/0/inline/AVMCJsHHnetEPrmaqHdNVJWiHaWYCoMoI_MzYyREmUJGY5XH9VA87jpytV

In [50]:
import torch
from torch.autograd import Variable

In [51]:
# Sample `numberwords` tokens from the saved model and write them to
# ./text_generated.txt, 20 tokens per line.
temperature = 0.8   # output scores are divided by this; lower values sharpen the distribution
numberwords = 500   # number of tokens to generate
# temperature has to be greater or equal 1e-3
if temperature < 1e-3:
    raise ValueError('temperature has to be greater or equal 1e-3')

# NOTE(review): torch.load unpickles arbitrary objects, so only load model
# files from a trusted source.
# Loading onto the CPU first and then moving to `device` works whether or
# not a GPU is available.
with open(export_path_model, 'rb') as f:
    model = torch.load(f, map_location='cpu').to(device)
model.eval()

#set seed for reproducibility
torch.manual_seed(42)

hidden = model.init_hidden(1)
# Start from a random word id. Named `input_ids` so that the builtin `input`
# is not shadowed for the rest of the session.
input_ids = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

# Write as UTF-8 to match the encoding the corpus was read with.
with open('./text_generated.txt', 'w', encoding="utf8") as outf:
    with torch.no_grad():  # no tracking history
        for i in range(numberwords):
            output, hidden = model(input_ids, hidden)
            scaled = output.squeeze().div(temperature)
            # Subtract the maximum before exponentiating: the relative weights
            # are unchanged, but exp() can no longer overflow to inf at low
            # temperatures (which would make multinomial fail).
            word_weights = (scaled - scaled.max()).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1).item()
            # Feed the sampled word back in as the next input.
            input_ids.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % log_interval == 0:
                print('| Generated {}/{} words'.format(i, numberwords))

| Generated 0/500 words
| Generated 200/500 words
| Generated 400/500 words


In [52]:
# Display the generated text.
!cat text_generated.txt

and all over the United States. And believe me, the reason they want to -- they say we have a
disaster. And it is a disaster. <APPLAUSE> The U.K., of the global speech. The Paris is a beauty. The lot
of people -- her by far. So you've always read Coach fund in his statements that think it's a long
time and I'm trying to have a lot of people out of work. We are going to make America strong
again, we will make America strong again, we will make America safe again. We will make America great again. Thank
you. Thank you. Thank you, everybody. Thank you. God bless you. Thank you. Thank you vote, thank you for being
very rich again. Thank you, everybody. Thank you, everybody. God bless you, everybody. God bless you everybody. Thank you. <title="Donald
Trump, Republican Presidential Candidate, delivers remarks at a campaign event in West Bend, Bay, (SD)"> <eos> <date:"2016-10-06"> <eos> <TRUMP:> Thank
you very much. Thank you very much. (APPLAUSE) God bless you very much. Thank you. T

In [46]:
# Delete the saved model (and, if uncommented, the generated text) so that
# training and generation can be run again from scratch.
! rm Trump_NLG_model.pt
#! rm text_generated.txt

In [48]:
# Use when running on Google Colab.
# The code below is wrapped in a string literal, so it is not executed as
# written; remove the surrounding triple quotes to enable it. It creates the
# `drive` client used by the upload cells that follow.
"""
# save those on Drive
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
"""

NameError: name 'auth' is not defined

In [None]:
# Disabled (string literal): uploads Trump_NLG_model.pt through a `drive`
# client, which must have been created beforehand. Remove the triple quotes to run.
"""
upload = drive.CreateFile({'title': 'Trump_NLG_model.pt'})
upload.SetContentFile('Trump_NLG_model.pt')
upload.Upload()
"""

In [None]:
# Disabled (string literal): uploads text_generated.txt through a `drive`
# client, which must have been created beforehand. Remove the triple quotes to run.
"""
upload = drive.CreateFile({'title': 'text_generated.txt'})
upload.SetContentFile('text_generated.txt')
upload.Upload()
"""