In [1]:
import argparse

import sys
import os
import yaml
import torch
import numpy as np
import pickle as pkl
from dataset import NIPS2015Dataset
from model import RNN

import matplotlib
if os.environ.get('DISPLAY', '') == '':
    matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [2]:
# Length argument passed to rnn.sample() when generating the text samples
# further down in this notebook.
SAMPLE_SEQ_LEN = 1000

In [3]:
def parse_args():
    """Build the command-line interface and return the recognised options.

    Options:
        --checkpoint_dir  directory holding checkpoints (default 'checkpoints/')
        --data_dir        directory containing papers.csv (default 'data/')
        --log_dir         directory for logs (default 'logs/')
        --gpu             flag; when present, GPU mode is requested

    Returns:
        argparse.Namespace with the four attributes above. Arguments that the
        parser does not know are ignored rather than rejected, because
        ``parse_known_args`` is used (presumably so that extra argv entries
        injected by the notebook kernel do not abort the run).
    """
    directory_options = (
        ('--checkpoint_dir', 'checkpoints/', 'Directory of saving checkpoints'),
        ('--data_dir', 'data/', 'Directory of papers.csv'),
        ('--log_dir', 'logs/', 'Directory of putting logs'),
    )

    cli = argparse.ArgumentParser()
    for flag, default_path, description in directory_options:
        cli.add_argument(flag, type=str, default=default_path, help=description)
    cli.add_argument('--gpu', action='store_true', help='Turn on GPU mode')

    # parse_known_args returns (namespace, leftover_argv); only the namespace
    # is of interest here.
    known, _unrecognised = cli.parse_known_args()
    return known

In [4]:
def dict2namespace(config):
    """Recursively turn a dict into an ``argparse.Namespace``.

    Every key becomes an attribute. Values that are themselves dicts are
    converted the same way; all other values (including lists that may
    contain dicts) are stored unchanged.

    Args:
        config: mapping whose keys are valid attribute names.

    Returns:
        argparse.Namespace mirroring the structure of ``config``.
    """
    namespace = argparse.Namespace()
    for name, entry in config.items():
        converted = dict2namespace(entry) if isinstance(entry, dict) else entry
        setattr(namespace, name, converted)
    return namespace

In [5]:
def parse_config(args):
    """Load ``config.yml``, archive a copy in the log directory, and return it.

    The file is read from the current working directory, written back out to
    ``<args.log_dir>/config.yml`` (creating the directory if needed), and then
    converted with ``dict2namespace``.

    Args:
        args: parsed command-line options; only ``args.log_dir`` is read.

    Returns:
        argparse.Namespace built from the YAML mapping.
    """
    with open('config.yml', 'r') as f:
        # yaml.load() without an explicit Loader emits a warning on
        # PyYAML 5.1+ and raises TypeError on PyYAML 6+. safe_load works on
        # every version and refuses to construct arbitrary Python objects,
        # which is all a plain settings file needs.
        config = yaml.safe_load(f)

    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(args.log_dir, exist_ok=True)
    with open(os.path.join(args.log_dir, 'config.yml'), 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    return dict2namespace(config)

In [6]:
def plot_log_p(filename, dataset, rnn, xlim=(-800, -50)):
    """Score every string in ``<filename>.pkl`` and plot a histogram of the scores.

    Side effects:
        * prints the minimum and maximum score,
        * writes the raw list of scores to ``<filename>_raw.pkl``,
        * saves the histogram to ``<filename>.png`` and shows it.

    Args:
        filename: path stem (no extension) of the pickle to read; also used
            as the plot title and as the stem of the output files.
        dataset: object exposing ``char2idx``, a character -> index mapping.
        rnn: model exposing ``compute_prob(index_array)``; its return value is
            treated as the log-likelihood of the string.
        xlim: ``(low, high)`` limits for the x axis, or ``None`` to let
            matplotlib choose. Defaults to the window used originally.

    Raises:
        ValueError: if the pickle holds no strings.
        KeyError: if a string contains a character missing from
            ``dataset.char2idx``.
    """
    # NOTE: unpickling can execute arbitrary code -- only point this at
    # trusted files.
    with open(filename + '.pkl', 'rb') as f:
        data = pkl.load(f)

    # The pickle is a mapping whose values are the strings to score; the keys
    # are not used. Per the original author's note each string has a constant
    # length of 100 (not verified here).
    lls = []
    for text in data.values():
        encoded = np.asarray([dataset.char2idx[c] for c in text])
        lls.append(rnn.compute_prob(encoded))

    if not lls:
        # min()/max() below would fail with an opaque message otherwise.
        raise ValueError("{}.pkl contains no strings to score".format(filename))

    print("{}'s log-likelihood:\nmin:{}\tmax:{}\n".format(filename, min(lls), max(lls)))
    with open(filename + '_raw.pkl', 'wb') as f:
        pkl.dump(lls, f, protocol=pkl.HIGHEST_PROTOCOL)

    fig, ax = plt.subplots()
    ax.hist(lls)
    ax.set_xlabel('Log-likelihood')
    if xlim is not None:
        # A shared window keeps histograms of different corpora comparable.
        ax.set_xlim(xlim)
    ax.set_ylabel('Counts')
    ax.set_title(filename)
    fig.savefig(filename + '.png', bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print("# Figure written to %s.png." % filename)

In [7]:
# Resolve command-line options and the YAML configuration for this run.
args = parse_args()
config = parse_config(args)

print(config)

# Seed NumPy and PyTorch (CPU and, when used, CUDA) from the configured seed.
np.random.seed(config.seed)
use_cuda = args.gpu and torch.cuda.is_available()
if use_cuda:
    torch.cuda.manual_seed_all(config.seed)
# Fall back to the CPU when --gpu was not given or CUDA is unavailable.
device = torch.device('cuda' if use_cuda else 'cpu')

torch.manual_seed(config.seed)

dataset = NIPS2015Dataset(
    batch_size=config.batch_size,
    seq_len=config.seq_len,
    data_folder=args.data_dir,
)

# Model hyper-parameters come from the config; vocabulary size from the data.
model_kwargs = dict(
    vocab_size=dataset.voc_len,
    embedding_dim=config.embedding_dim,
    num_lstm_units=config.num_lstm_units,
    num_lstm_layers=config.num_lstm_layers,
    dataset=dataset,
    device=device,
)
rnn = RNN(**model_kwargs)

# Restore weights; map_location makes the checkpoint load onto `device`
# regardless of the device it was saved from.
checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pth')
checkpoint = torch.load(checkpoint_path, map_location=device)
rnn.load_state_dict(checkpoint['rnn'])
print("# RNN weights restored.")

  This is separate from the ipykernel package so we can avoid doing imports until


Namespace(batch_size=64, embedding_dim=64, num_lstm_layers=4, num_lstm_units=128, seed=1234, seq_len=25)
# RNN weights restored.


In [8]:
# Question 3: draw two samples from the restored model and save them, one
# per line, to samples.txt.
with open('samples.txt', 'w', encoding='utf-8') as out_file:
    for sample_no in (1, 2):
        char_ids = rnn.sample(SAMPLE_SEQ_LEN)
        # Map each sampled index back to its character.
        decoded = ''.join(dataset.idx2char[idx] for idx in char_ids)
        out_file.write('sample {}: '.format(sample_no) + decoded + '\n')
print("# Samples written to samples.txt.")

  ps = self.softmax(logits[0]).numpy()


# Samples written to samples.txt.


In [9]:
# Question 4: histogram the log-likelihoods of each corpus in turn.
for corpus_name in ('random', 'shakespeare', 'nips'):
    plot_log_p(corpus_name, dataset, rnn)

  ps = self.softmax(logits).numpy()


random's log-likelihood:
min:-797.7300402263842	max:-645.7690734824657



  % get_backend())


# Figure written to random.png.
shakespeare's log-likelihood:
min:-395.35268357381517	max:-213.48790281832783

# Figure written to shakespeare.png.
nips's log-likelihood:
min:-253.49370320211887	max:-101.29741062308395

# Figure written to nips.png.


In [10]:
# Question 5: label each snippet by thresholding its log-likelihood.
#   0 -> randomly generated, 1 -> Shakespeare, 2 -> NIPS paper.

# Scores below this are labelled random. The value sits roughly midway
# between the random maximum (~-646) and the Shakespeare minimum (~-395)
# printed by the histogram cell.
RANDOM_MAX_LL = -520
# Scores below this (and not random) are labelled Shakespeare. The printed
# Shakespeare and NIPS ranges overlap (about -253 to -213), so this cut-off
# is a judgment call.
SHAKESPEARE_MAX_LL = -220

# Only the first 10 snippets are classified.
n_snippet = 10

with open('snippets.pkl', 'rb') as f:
    snippets = pkl.load(f)

lbls = []
for snippet in snippets[:n_snippet]:
    encoded = np.asarray([dataset.char2idx[c] for c in snippet])
    ll = rnn.compute_prob(encoded)
    print(ll)

    if ll < RANDOM_MAX_LL:
        label = 0
    elif ll < SHAKESPEARE_MAX_LL:
        label = 1
    else:
        label = 2
    lbls.append(label)

with open("answers.pkl", 'wb') as f:
    pkl.dump(lbls, f, protocol=pkl.HIGHEST_PROTOCOL)
print(lbls)
print("# Answers written to answers.pkl.")

-298.00204241158906
-108.89269068257536
-324.17725121709384
-137.00153728146637
-119.77384872124996
-298.8971028386369
-140.40971556119158
-112.15386210755253
-340.5324110304424
-699.4776100834879
[1, 2, 1, 2, 2, 1, 2, 2, 1, 0]
# Answers written to answers.pkl.
