In [30]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

# The haiku dataset: loading and initial cleaning

This was obtained from the dataset created by Jeremy Neiman for use in his own haiku generation model, published in the last few days of 2018; Medium post <a href="https://towardsdatascience.com/generating-haiku-with-deep-learning-dbf5d18b4246">here</a> and Github for the dataset <a href="https://github.com/docmarionum1/haikurnn/tree/master/input/poems">here</a>.

In [2]:
haikus_df = pd.read_csv('./data/image_to_text/haikus.csv')

In [4]:
haikus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143137 entries, 0 to 143136
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   0            143120 non-null  object
 1   1            143123 non-null  object
 2   2            142954 non-null  object
 3   source       143137 non-null  object
 4   0_syllables  143137 non-null  object
 5   1_syllables  143137 non-null  object
 6   2_syllables  143137 non-null  object
dtypes: object(7)
memory usage: 7.6+ MB


In [6]:
haikus_df.source.value_counts()

twaiku         111727
img2poems       11808
sballas          8142
gutenberg        5524
tempslibres      4800
haikuzao         1136
Name: source, dtype: int64

Neiman discardes the twaiku source from his final model because the poetry there appears to be low quality

In [8]:
haikus_notwitter_df = haikus_df[haikus_df.source != 'twaiku']

haikus_notwitter_df.source.value_counts()

img2poems      11808
sballas         8142
gutenberg       5524
tempslibres     4800
haikuzao        1136
Name: source, dtype: int64

In [9]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,23,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,34,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,1314,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


For the sake of simplicity, will only look at the lower syllable count where there are two values

In [18]:
syllable_cols = ['0_syllables', '1_syllables', '2_syllables']

for col in syllable_cols:
    haikus_notwitter_df[col] = haikus_notwitter_df[col].apply(lambda x: int(x.split(',')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [19]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,2,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,3,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,13,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


Create a new column that has the whole text of the 3-line poems

In [26]:
haikus_notwitter_df['text'] = haikus_notwitter_df['0'] + ' ' + haikus_notwitter_df['1'] + ' ' + haikus_notwitter_df['2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


..and a column with all lower case and without punctuation

In [34]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

haikus_notwitter_df['text_clean'] = haikus_notwitter_df.text.map(alphanumeric).map(punc_lower)

TypeError: expected string or bytes-like object

Split into train and test (80/20)

In [32]:
from sklearn.model_selection import train_test_split

haikus_train_df, haikus_test_df = train_test_split(haikus_notwitter_df, test_size=0.2, random_state=RANDOM)

In [33]:
haikus_train_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables,text
12344,amoretti sonnet xxvi,e,spenser,img2poems,7.0,1.0,2.0,amoretti sonnet xxvi e spenser
12027,there when they came mind suffered shame,`these be the same and not the same,a-wondering whispered mind,img2poems,8.0,7.0,4.0,there when they came mind suffered shame `thes...
4696,idle conversation,the daffodils nodding,In the breeze,tempslibres,6.0,6.0,3.0,idle conversation the daffodils nodding In the...
23120,alone at sunrise . . .,a small wave covers,my shadow,sballas,5.0,5.0,3.0,alone at sunrise . . . a small wave covers my ...
9632,although there was no sound in all the house,i could not forbear listening for the cry of t...,dragging up their strength to break on the sul...,img2poems,10.0,18.0,15.0,although there was no sound in all the house i...
...,...,...,...,...,...,...,...,...
29802,My Grandpapa lives in a wonderful house,"With a great many windows and doors, There are...","stairs that go down, And such beautiful, slipp...",gutenberg,11.0,16.0,12.0,My Grandpapa lives in a wonderful house With a...
5390,can i wash your hair,underneath,that wig,img2poems,5.0,3.0,2.0,can i wash your hair underneath that wig
860,the last light of day ~,purple rhododendrons,dissolve in the dark,tempslibres,5.0,6.0,5.0,the last light of day ~ purple rhododendrons d...
15795,now every sound at length is hush'd away,these few are sacred moments one more day,drops in the shadowy gulf of bygone things,img2poems,10.0,10.0,11.0,now every sound at length is hush'd away these...


# Topic modelling with GloVe embeddings

In [29]:
from gensim.models import KeyedVectors, Doc2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#vectorise with TF-IDF
tv = TfidfVectorizer(stop_words='english')

haiku_train_tv = tv.fit_transform(haikus_train_df['text'])
haiku_test_tv  = tv.transform(haikus_test_df['text'])

In [None]:
glove_file = './data/image_to_text/glove.840B.300d.txt'
tmp_file = './data/image_to_text/glovetmp.txt'

if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)