In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk

import pandas as pd

# Report whether CUDA is usable, then bind `device` to the GPU when it is
# and to the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


In [47]:
# General util functions
def make_dir_if_not_exists(directory):
	"""Create ``directory`` (and any missing parent directories) if it is absent.

	Nothing happens when the path already exists.

	Args:
		directory: Path of the directory to create.
	"""
	# Imported locally because the notebook's import cell never imports
	# `logging`; without this the first missing directory raised NameError.
	import logging

	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		# exist_ok=True tolerates the directory appearing between the
		# existence check above and this call.
		os.makedirs(directory, exist_ok=True)

def print_list(l, K=None):
	"""Print the elements of ``l`` one per line, followed by an empty line.

	Args:
		l: Iterable whose elements are printed.
		K: When given, printing stops once ``K`` elements have been printed;
			with the default ``None`` every element is printed.
	"""
	printed = 0
	for element in l:
		# Stop as soon as the number of printed elements equals K.
		if printed == K:
			break
		print(element)
		printed += 1
	print()

def remove_multiple_spaces(string):
	"""Collapse every run of whitespace into one space and trim both ends."""
	collapsed = re.sub(r'\s+', ' ', string)
	return collapsed.strip()

def save_in_pickle(save_object, save_file):
	"""Pickle ``save_object`` into ``save_file``, replacing any existing content.

	Args:
		save_object: Any picklable object.
		save_file: Destination path, opened in binary write mode.
	"""
	with open(save_file, "wb") as sink:
		pickle.dump(save_object, sink)

def load_from_pickle(pickle_file):
	"""Return the object stored in the pickle file at ``pickle_file``.

	NOTE: unpickling can execute arbitrary code, so only load files that
	come from a trusted source.
	"""
	with open(pickle_file, "rb") as source:
		restored = pickle.load(source)
	return restored

def save_in_txt(list_of_strings, save_file):
	"""Write the given strings to ``save_file``, one per line.

	Each string is stripped of surrounding whitespace before being written
	and is terminated with a single newline.

	Args:
		list_of_strings: Iterable of strings to write.
		save_file: Destination path, opened in text write mode.
	"""
	with open(save_file, "w") as writer:
		for entry in list_of_strings:
			stripped = entry.strip()
			writer.write(f"{stripped}\n")

def load_from_txt(txt_file):
	"""Return the lines of ``txt_file`` as a list of stripped strings.

	Empty lines are kept and appear as empty strings in the result.
	"""
	with open(txt_file, "r") as reader:
		return [raw_line.strip() for raw_line in reader]

# Data loading

In [48]:
# Base folder and relative file name of the poem CSV.
data_folder = '../data/'
data_file = 'poem/poems.csv'
# Load the CSV found at the folder path immediately followed by the file name.
dataset = pd.read_csv(f"{data_folder}{data_file}")

# Data vocabulary building

In [69]:
# Special tokens and the fixed ids reserved for them (ids 0-3).
pad_word, pad_id = "<pad>", 0  # padding token; skipped when decoding ids
bos_word, bos_id = "<s>", 1    # prepended to every encoded sentence
eos_word, eos_id = "</s>", 2   # appended to every encoded sentence
unk_word, unk_id = "<unk>", 3  # stands in for words missing from the vocabulary
    
def normalize_sentence(s):
    """Reduce ``s`` to ASCII letters and the punctuation marks ``.``, ``!``, ``?``.

    The substitutions run in order:
      1. put a space in front of every ``.``, ``!`` or ``?``;
      2. replace each run of any other non-letter characters with one space;
      3. collapse whitespace runs to a single space.
    The result is finally stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s.strip()

class Vocabulary:
    """Two-way mapping between words and integer ids, with occurrence counts.

    Every sentence handed to this class is first passed through
    ``normalize_sentence`` and then split on whitespace.
    """

    def __init__(self):
        # The special tokens and their ids come from the module-level constants.
        specials = (
            (pad_word, pad_id),
            (bos_word, bos_id),
            (eos_word, eos_id),
            (unk_word, unk_id),
        )
        # word -> id, pre-populated with the special tokens.
        self.word_to_id = {word: idx for word, idx in specials}
        # word -> number of occurrences seen by add_words_from_sentence;
        # the special tokens get no entry here.
        self.word_count = {}
        # id -> word, the inverse of word_to_id.
        self.id_to_word = {idx: word for word, idx in specials}
        # Vocabulary size so far; also the id given to the next new word.
        self.num_words = 4

    def get_ids_from_sentence(self, sentence):
        """Encode ``sentence`` as ``[bos_id, <word ids...>, eos_id]``.

        Words absent from the vocabulary are encoded as ``unk_id``.
        """
        tokens = normalize_sentence(sentence).split()
        encoded = [self.word_to_id.get(token, unk_id) for token in tokens]
        return [bos_id, *encoded, eos_id]

    def tokenized_sentence(self, sentence):
        """Return the word form of ``get_ids_from_sentence(sentence)``.

        The result includes the bos/eos words, and unknown words show up
        as ``unk_word``.
        """
        id_sequence = self.get_ids_from_sentence(sentence)
        tokens = []
        for token_id in id_sequence:
            tokens.append(self.id_to_word[token_id])
        return tokens

    def decode_sentence_from_ids(self, sent_ids):
        """Join the words for ``sent_ids`` with spaces, skipping bos/eos/pad ids.

        ``unk_id`` is not skipped and decodes to ``unk_word``. An id missing
        from ``id_to_word`` raises ``KeyError``.
        """
        skipped = (bos_id, eos_id, pad_id)
        return ' '.join(
            self.id_to_word[token_id]
            for token_id in sent_ids
            if token_id not in skipped
        )

    def add_words_from_sentence(self, sentence):
        """Add the words of ``sentence`` to the vocabulary and update counts."""
        for token in normalize_sentence(sentence).split():
            if token in self.word_to_id:
                # Already known: only bump its occurrence count.
                self.word_count[token] += 1
                continue
            # First occurrence: assign the next free id and start counting.
            new_id = self.num_words
            self.word_to_id[token] = new_id
            self.id_to_word[new_id] = token
            self.word_count[token] = 1
            self.num_words = new_id + 1

# Fresh vocabulary holding only the four special tokens; words are added
# by calling vocab_data.add_words_from_sentence(...).
vocab_data = Vocabulary()

In [84]:
# Load the PoetryFound CSV; this rebinds `data_file` and `dataset`.
data_folder = '../data/'
data_file = 'PoetryFound.csv'
# NOTE(review): `data_folder` is assigned above but not used in this read, so
# the file is resolved relative to the current working directory, whereas the
# earlier data-loading cell prefixes the file name with `data_folder`.
# Confirm which location is intended.
dataset = pd.read_csv(data_file)

In [79]:
def generate_training_data(dataset, separator = '<SEP>', mod = 'line', i_max = 20000):
    """Build parallel input/target strings from the ``Poem`` column of ``dataset``.

    Each ``Poem`` cell must contain the Python-literal text of a sequence of
    strings (for example ``"['first line', 'second line']"``).

    Args:
        dataset: pandas DataFrame with a ``'Poem'`` column.
        separator: Currently unused; kept so existing callers keep working.
        mod: ``'line'`` yields one pair per poem line, ``"<s> " + line`` and
            ``line + " </s>"``. ``'poem'`` yields one pair per poem, with the
            lines joined by ``' </s> '`` and the same prefix/suffix applied.
            Any other value yields two empty lists.
        i_max: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(x, y)`` of equally long lists of strings.

    Raises:
        ValueError or SyntaxError: if a ``Poem`` cell is not a valid Python
            literal.
    """
    # Local import: `ast` is not among the notebook's top-level imports.
    import ast

    x, y = [], []
    if mod not in ('line', 'poem'):
        return x, y

    for _, row in dataset.iterrows():
        # literal_eval only accepts literals, so text coming from the CSV
        # cannot run arbitrary code the way eval() would.
        poem = ast.literal_eval(row['Poem'])
        if mod == 'line':
            for poem_line in poem:
                x.append("<s> " + poem_line)
                y.append(poem_line + " </s>")
        else:
            joined = ' </s> '.join(poem)
            x.append("<s> " + joined)
            y.append(joined + " </s>")
    # NOTE(review): normalize_sentence strips '<', '/' and '>', so these
    # literal markers would become a bare 's' if x/y are passed through it.
    # Confirm how x/y are tokenized downstream.
    return x, y

In [80]:
# Build one (input, target) string pair per poem rather than per line.
x, y = generate_training_data(dataset, mod = 'poem')

# Number of generated pairs.
print(len(x))

12899


In [81]:
# Show the raw 'Poem' cell of the row at position 11.
print(dataset.iloc[11]['Poem'])

['After Octavio Paz', 'What’s most human must drive', 'an arrow to the heart.', 'Ghosts, too, must abide by this directive', '& remain transparent,', 'going about their business in old houses.', 'Before I was an I, I longed to be ethereal.', 'Sprouting wings at will & gliding through', 'cul-de-sacs and malls around the valley.', 'My hands, too, would gradually disappear', 'followed by my arms, then neck & head', 'until my whole body was slight as allergen.', 'Before I was an I, I spoke an old language', 'that would return on drowsy afternoons.', 'Therefore I struggled to say', 'the simplest sentences. So much so', 'that the maligned semicolon', 'became an ardent ally, an island', 'of pause and the deep breath.', 'The comma, too, bless its tiny soul,', 'was the crumb which the god', 'of small favors multiplied', 'tenfold for my morning pie.', 'Before I was an I, knowledge', 'clung to me like burrs & hunger', 'guided my ship like the barefoot light                on the sleeping land & s