In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

# The haiku dataset: loading and initial cleaning

This was obtained from the dataset created by Jeremy Neiman for use in his own haiku generation model, published in the last few days of 2018; Medium post <a href="https://towardsdatascience.com/generating-haiku-with-deep-learning-dbf5d18b4246">here</a> and Github for the dataset <a href="https://github.com/docmarionum1/haikurnn/tree/master/input/poems">here</a>.

In [2]:
haikus_df = pd.read_csv('./data/image_to_text/haikus.csv')

In [3]:
haikus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143137 entries, 0 to 143136
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   0            143120 non-null  object
 1   1            143123 non-null  object
 2   2            142954 non-null  object
 3   source       143137 non-null  object
 4   0_syllables  143137 non-null  object
 5   1_syllables  143137 non-null  object
 6   2_syllables  143137 non-null  object
dtypes: object(7)
memory usage: 7.6+ MB


In [4]:
haikus_df.source.value_counts()

twaiku         111727
img2poems       11808
sballas          8142
gutenberg        5524
tempslibres      4800
haikuzao         1136
Name: source, dtype: int64

Neiman discardes the twaiku source from his final model because the poetry there appears to be low quality

In [5]:
haikus_notwitter_df = haikus_df[haikus_df.source != 'twaiku']

haikus_notwitter_df.source.value_counts()

img2poems      11808
sballas         8142
gutenberg       5524
tempslibres     4800
haikuzao        1136
Name: source, dtype: int64

In [6]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,23,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,34,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,1314,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


For the sake of simplicity, will only look at the lower syllable count where there are two values

In [7]:
syllable_cols = ['0_syllables', '1_syllables', '2_syllables']

for col in syllable_cols:
    haikus_notwitter_df[col] = haikus_notwitter_df[col].apply(lambda x: int(x.split(',')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,2,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,3,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,13,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


In [9]:
haikus_notwitter_df[haikus_notwitter_df['source'].isna()].head(5)

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables


Replace NaN in '0' - '2' with '', drop the remaining null entry

In [10]:
line_cols = ['0', '1', '2']

for col in line_cols:
    haikus_notwitter_df[col].fillna('', inplace=True)
    
haikus_notwitter_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Create a new column that has the whole text of the 3-line poems

In [11]:
haikus_notwitter_df['text'] = haikus_notwitter_df['0'] + ' ' + haikus_notwitter_df['1'] + ' ' + haikus_notwitter_df['2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


...a column with all lower case and without punctuation..

In [12]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

haikus_notwitter_df['text_clean'] = haikus_notwitter_df.text.map(alphanumeric).map(punc_lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


...and one that has tokens for line breaks and end of poem <strike>and is a list of words/tokens</strike>

In [13]:
haikus_notwitter_df['text_withtokens'] = haikus_notwitter_df['0'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['1'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['2'].apply(lambda x: x.split(' ') + ['<eNd>'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [55]:
# haikus_notwitter_df['text_withtokens'] = haikus_notwitter_df['0'] + ' <nEXt> ' \
#                                          + haikus_notwitter_df['1'] + ' <nEXt> ' \
#                                          + haikus_notwitter_df['2'] + ' <eNd>'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Using ↕ to represent a new line and ◘ to represent end of poe

haikus_notwitter_df['textchar_withtokens'] = haikus_notwitter_df['0'] + '↕' \
                                         + haikus_notwitter_df['1'] + '↕' \
                                         + haikus_notwitter_df['2'] + '◘'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Split into train and test (80/20)

In [15]:
from sklearn.model_selection import train_test_split

haikus_train_df, haikus_test_df = train_test_split(haikus_notwitter_df, test_size=0.2, random_state=RANDOM)

In [16]:
haikus_train_df.head(3).T

Unnamed: 0,3609,12344,12027
0,an oasis,amoretti sonnet xxvi,there when they came mind suffered shame
1,in the Bible Belt --,e,`these be the same and not the same
2,adult book store,spenser,a-wondering whispered mind
source,tempslibres,img2poems,img2poems
0_syllables,4,7,8
1_syllables,5,1,7
2_syllables,4,2,4
text,an oasis in the Bible Belt -- adult book store,amoretti sonnet xxvi e spenser,there when they came mind suffered shame `thes...
text_clean,an oasis in the bible belt adult book store,amoretti sonnet xxvi e spenser,there when they came mind suffered shame thes...
text_withtokens,"[an, oasis, <nEXt>, in, the, Bible, Belt, --, ...","[amoretti, sonnet, xxvi, <nEXt>, e, <nEXt>, sp...","[there, when, they, came, mind, suffered, sham..."


# MVP: Throw everything into an RNN

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Using letters

In [18]:
corpus_raw = ''.join(haikus_train_df['textchar_withtokens'])

chars = sorted(set(list(corpus_raw)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(corpus_raw)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1849446
Total Vocab:  107


In [19]:
chars

[' ',
 '!',
 '"',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '~',
 '\x85',
 '\x92',
 '\x96',
 '\x97',
 '\xa0',
 'à',
 'ä',
 'é',
 'ü',
 'ē',
 'ū',
 'ŭ',
 '\u200b',
 '‘',
 '’',
 '“',
 '”',
 '…',
 '↕',
 '◘']

In [20]:
n_poems = len(haikus_train_df)

n_poems

25128

In [21]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 10

poemX = []
poemY = []
n_patterns = 0

corpusX = []
corpusY = []
for poem_index in range(0, n_poems):

    textX = []
    textY = []
    poem = haikus_train_df['textchar_withtokens'].iloc[poem_index]
    for i in range(0,  len(poem) - seq_length, 1):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        textX.append([char_to_int[char] for char in seq_in])
        textY.append(char_to_int[seq_out])
    n_patterns = max(n_patterns, len(textX))
    
    poemX.append(textX)
    poemY.append(textY)
    
    corpusX += textX
    corpusY += textY

print("Max patterns per poem: ", n_patterns)

Max patterns per poem:  792


In [22]:
endpoem_charindex = char_to_int['◘']
newline_charindex = char_to_int['↕']

In [23]:
pd.DataFrame(corpusX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,59,72,0,73,59,77,67,77,105,67
1,72,0,73,59,77,67,77,105,67,72
2,0,73,59,77,67,77,105,67,72,0
3,73,59,77,67,77,105,67,72,0,78
4,59,77,67,77,105,67,72,0,78,66
...,...,...,...,...,...,...,...,...,...,...
1598173,81,73,76,70,62,0,77,66,67,71
1598174,73,76,70,62,0,77,66,67,71,71
1598175,76,70,62,0,77,66,67,71,71,63
1598176,70,62,0,77,66,67,71,71,63,76


In [135]:
corpusY

[72,
 0,
 78,
 66,
 63,
 0,
 30,
 67,
 60,
 70,
 63,
 0,
 30,
 63,
 70,
 78,
 0,
 10,
 10,
 105,
 59,
 62,
 79,
 70,
 78,
 0,
 60,
 73,
 73,
 69,
 0,
 77,
 78,
 73,
 76,
 63,
 106,
 73,
 72,
 72,
 63,
 78,
 0,
 82,
 82,
 80,
 67,
 105,
 63,
 105,
 77,
 74,
 63,
 72,
 77,
 63,
 76,
 106,
 0,
 78,
 66,
 63,
 83,
 0,
 61,
 59,
 71,
 63,
 0,
 71,
 67,
 72,
 62,
 0,
 77,
 79,
 64,
 64,
 63,
 76,
 63,
 62,
 0,
 77,
 66,
 59,
 71,
 63,
 105,
 58,
 78,
 66,
 63,
 77,
 63,
 0,
 60,
 63,
 0,
 78,
 66,
 63,
 0,
 77,
 59,
 71,
 63,
 0,
 59,
 72,
 62,
 0,
 72,
 73,
 78,
 0,
 78,
 66,
 63,
 0,
 77,
 59,
 71,
 63,
 105,
 59,
 10,
 81,
 73,
 72,
 62,
 63,
 76,
 67,
 72,
 65,
 0,
 81,
 66,
 67,
 77,
 74,
 63,
 76,
 63,
 62,
 0,
 71,
 67,
 72,
 62,
 106,
 76,
 77,
 59,
 78,
 67,
 73,
 72,
 105,
 78,
 66,
 63,
 0,
 62,
 59,
 64,
 64,
 73,
 62,
 67,
 70,
 77,
 0,
 72,
 73,
 62,
 62,
 67,
 72,
 65,
 105,
 37,
 72,
 0,
 78,
 66,
 63,
 0,
 60,
 76,
 63,
 63,
 84,
 63,
 106,
 78,
 66,
 63,
 0,
 77,
 70,
 73,


In [92]:
# reshape X to be [samples, time steps, features]
#X = np.reshape(poemX, (n_patterns, seq_length, n_poems))
# normalize
X = np.array([np.array([char / float(n_chars) for char in seq]) for poem in poemX for seq in poem])
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
# one hot encode the output variable
y = to_categorical([nextchar for poem in poemY for nextchar in poem])

In [136]:
# reshape X to be [samples, time steps, features]
#X = np.reshape(poemX, (n_patterns, seq_length, n_poems))
# normalize
X = np.reshape(X, (len(corpusX), seq_length, 1))
# one hot encode the output variable
y = to_categorical(corpusY)

In [139]:
X.shape

(1598178, 10, 1)

In [122]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [123]:
# define the checkpoint
filepath="letter-weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
model.fit(X, y, epochs=1, batch_size=128, callbacks=callbacks_list)

Epoch 1/1

Epoch 00001: loss improved from inf to 3.07295, saving model to letter-weights-01-3.0729.hdf5


<keras.callbacks.callbacks.History at 0x17e7e3380c8>

In [98]:
def poemchar(index):
    char = int_to_char[index]
    if char == '↕':
        char = '\n'
    elif char == '◘':
        char = '' # represent end of poem
    return char

In [129]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

start = np.random.randint(0, len(poemX)-1)
while start == endpoem_charindex: # don't start with end of the poem
    start = np.random.randint(0, len(poemX)-1)
    
pattern = poemX[start][0]
[print(poemchar(char), end='') for char in pattern]
# generate characters
for i in range(3):
    x = np.reshape([char / float(n_vocab) for char in pattern], (1, len(pattern), 1))
    #print(x)
    prediction = model.predict(x, verbose=0) * float(n_vocab)
    #print(prediction)
    index = np.argmax(prediction)
    print(index)
    result = poemchar(index)
    
    if result == '':
        break;
    
    seq_in = [int_to_char[value] for value in pattern]
    print(result, end='')
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Max starte0
 0
 0
 
Done.


That's.... kinda terrible. Let's use words instead of letters.

## Using words

In [141]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [143]:
wordcount = CountVectorizer(lowercase=False)

X_wc = wordcount.fit_transform(haikus_train_df['text_withtokens'].map(lambda x: ' '.join(x)))

pd.DataFrame(X_wc.toarray(), columns=wordcount.get_feature_names(), index=haikus_train_df.index)

Unnamed: 0,00,01,10,100,11,11th,12,12a,12th,13,...,zoom,zor,zucchini,zuleika,zulu,zushio,émeutes,équilibre,ēn,ēng
3609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [237]:
X_wc.value_counts()

AttributeError: value_counts not found

In [216]:
flatten = lambda l: [item for sublist in l for item in sublist]

corpuswords_raw = flatten(list(haikus_notwitter_df['text_withtokens']))

words = sorted(set(corpuswords_raw))
word_to_int = dict((w, i) for i, w in enumerate(words))

n_words = len(corpuswords_raw)
n_vocab_words = len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  524930
Total Vocab:  41965


In [140]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 2
wordX = []
wordY = []
for i in range(0, n_words - seq_length, 1):
    seq_in = corpuswords_raw[i:i + seq_length]
    seq_out = corpuswords_raw[i + seq_length]
    dataX.append([word_to_int[word] for word in seq_in])
    dataY.append(word_to_int[seq_out])
n_wordpatterns = len(wordX)
print("Total Patterns: ", n_wordpatterns)

NameError: name 'n_words' is not defined

In [219]:
# reshape X to be [samples, time steps, features]
X = np.reshape(wordX, (n_wordpatterns, seq_length, 1))
# normalize
X = X / float(n_vocab_words)
# one hot encode the output variable
y = to_categorical(wordY)

ValueError: zero-size array to reduction operation maximum which has no identity

In [140]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [144]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)


Epoch 1/20

Epoch 00001: loss improved from inf to 2.66164, saving model to weights-improvement-01-2.6616.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.66164 to 2.34912, saving model to weights-improvement-02-2.3491.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.34912 to 2.20226, saving model to weights-improvement-03-2.2023.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.20226 to 2.11321, saving model to weights-improvement-04-2.1132.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.11321 to 2.05213, saving model to weights-improvement-05-2.0521.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.05213 to 2.00513, saving model to weights-improvement-06-2.0051.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.00513 to 1.96914, saving model to weights-improvement-07-1.9691.hdf5
Epoch 8/20

Epoch 00008: loss improved from 1.96914 to 1.94036, saving model to weights-improvement-08-1.9404.hdf5
Epoch 9/20

Epoch 00009: loss improved from 1.94036 to 1.91641, saving model to wei

<keras.callbacks.callbacks.History at 0x22a239b3c48>

In [169]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
# generate characters
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    
    if result == '↕':
        result = '\n'
    elif result == '◘':
        break;
    
    seq_in = [int_to_char[value] for value in pattern]
    print(result, end='')
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

he sound of the sain
Done.


# Topic modelling
## 1. From scratch

In [166]:
from gensim.models import KeyedVectors, Doc2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
#vectorise with TF-IDF
tv = TfidfVectorizer(stop_words='english')

haiku_train_tv = tv.fit_transform(haikus_train_df['text_clean'])
haiku_test_tv  = tv.transform(haikus_test_df['text_clean'])

len(tv.get_feature_names())

23462

In [41]:
tv2 = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

haiku_train_tv2 = tv2.fit_transform(haikus_train_df['text_clean'])
haiku_test_tv2  = tv2.transform(haikus_test_df['text_clean'])

len(tv2.get_feature_names())

160557

23.5k features for unigram, explodes to 160.6k features with bigrams

Let's see what basic topic modelling comes up with

In [19]:
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [20]:
def top_words(model, feature_names, n_top_words):
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        word_list = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topic_list.append(word_list)

    return topic_list

In [21]:
nmf_model = NMF(20, random_state=RANDOM)
nmf_topic = nmf_model.fit_transform(haiku_train_tv)
pd.DataFrame(top_words(nmf_model, tv.get_feature_names(), 10)).add_prefix('word_').rename('topic_{}'.format)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,thy,thou,thee,shall,come,god,heart,man,said,life
topic_1,moon,harvest,crescent,new,half,window,just,rising,puddle,cold
topic_2,rain,sound,scent,smell,cold,window,soft,garden,steady,heavy
topic_3,night,stars,cold,late,moonless,starry,dark,sleepless,sleep,window
topic_4,day,year,mother,valentine,memorial,warm,new,hot,rainy,end
topic_5,morning,fog,coffee,mist,early,cold,cup,haze,sunday,frost
topic_6,summer,end,indian,late,heat,solstice,sound,river,evening,dusk
topic_7,autumn,leaves,falling,evening,equinox,chill,dusk,deep,sunset,fallen
topic_8,winter,solstice,stars,deep,cold,late,cat,train,comes,hands
topic_9,old,new,man,year,woman,dog,days,wall,leaves,young


In [22]:
nmf_model2 = NMF(20, random_state=RANDOM)
nmf_topic2 = nmf_model2.fit_transform(haiku_train_tv2)
pd.DataFrame(top_words(nmf_model2, tv2.get_feature_names(), 10)).add_prefix('word_').rename('topic_{}'.format)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,rain,winter rain,spring rain,summer rain,autumn rain,sound,scent,smell,night rain,cold
topic_1,moon,harvest,harvest moon,crescent,crescent moon,half,new moon,half moon,day moon,window
topic_2,thy,thou,shall,thee,like,heart,god,life,said,dead
topic_3,night,long,winter night,moonless night,moonless,cold,late night,stars,late,starry
topic_4,day,long,valentine day,valentine,mother,day day,memorial day,memorial,mother day,warm
topic_5,morning,fog,morning fog,morning sun,spring morning,coffee,early,early morning,mist,winter morning
topic_6,summer,end,summer end,indian summer,indian,late,summer rain,late summer,heat,end summer
topic_7,winter,winter rain,winter night,solstice,winter solstice,stars,winter stars,deep,deep winter,winter morning
topic_8,leaves,falling,fallen,fallen leaves,falling leaves,autumn leaves,red,fall,maple,yellow
topic_9,sky,blue,blue sky,clouds,white,sea,stars,high,color,eyes


TD-IDF with bigrams seems to do much better at retrieving relevant topics. Let's try 50 topics

In [23]:
nmf_model2 = NMF(50, random_state=RANDOM)
nmf_topic2 = nmf_model2.fit_transform(haiku_train_tv2)
nmf_50topics_details = pd.DataFrame(top_words(nmf_model2, tv2.get_feature_names(), 10))\
                        .add_prefix('word_').rename('topic_{}'.format)

nmf_50topics_details

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,rain,winter rain,spring rain,summer rain,autumn rain,night rain,smell,scent,soft,soft rain
topic_1,moon,harvest,harvest moon,crescent,crescent moon,new moon,half,half moon,day moon,winter moon
topic_2,thy,thou,thee,art,thou art,hast,art thou,thou hast,shalt,thine
topic_3,night,winter night,moonless night,moonless,starry,starry night,summer night,night rain,late night,night moon
topic_4,day,valentine day,valentine,day day,memorial day,memorial,day moon,warm,spring day,mother day
topic_5,morning,fog,morning fog,morning sun,spring morning,early morning,early,winter morning,mist,coffee
topic_6,summer,indian summer,indian,summer rain,heat,summer end,summer heat,summer night,late summer,solstice
topic_7,autumn,autumn rain,autumn wind,autumn leaves,equinox,autumn chill,autumn equinox,chill,dusk,autumn sun
topic_8,winter,winter rain,winter night,solstice,winter solstice,deep,winter morning,deep winter,winter moon,winter stars
topic_9,sky,blue,blue sky,color,autumn sky,winter sky,sea,sunglasses blue,eyes,clear


The 50 topics also seem pretty decent, let's go with that for now.

In [24]:
nmf_50topics = pd.DataFrame(nmf_topic2, index=haikus_train_df.index).add_prefix('topic_')

nmf_50topics

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49
3609,0.000005,0.000000,4.689159e-07,0.000637,0.000321,0.000000,0.000162,0.000343,0.001079,0.000000,...,0.000474,0.000307,0.000000,0.000717,0.000000,0.001847,0.000000,0.000654,0.000891,0.002135
12344,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000007,0.000051,0.000000,0.000000,0.000000,0.000011,0.000000,0.000010,0.000000,0.000011
12027,0.000017,0.000000,2.272065e-04,0.000531,0.000070,0.000036,0.000000,0.000331,0.000000,0.000000,...,0.002775,0.000000,0.001250,0.000812,0.000000,0.003113,0.000000,0.000797,0.002742,0.002847
4696,0.000020,0.000017,9.253097e-05,0.000000,0.001024,0.000101,0.002012,0.000199,0.000000,0.000000,...,0.000154,0.000000,0.000864,0.000000,0.000235,0.000000,0.000000,0.000254,0.000000,0.000000
23119,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.010232,0.000129,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.002015,0.071436,0.000000,0.006212,0.000047,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,0.000022,0.000124,2.787117e-04,0.000549,0.000469,0.000000,0.000184,0.000210,0.000104,0.001658,...,0.001719,0.001038,0.002611,0.008232,0.000459,0.000929,0.000000,0.002618,0.000000,0.000572
5390,0.000000,0.000973,0.000000e+00,0.000000,0.000191,0.000138,0.000000,0.000450,0.000000,0.000000,...,0.000075,0.000461,0.000000,0.000188,0.001418,0.000000,0.004348,0.000000,0.000829,0.001437
860,0.000000,0.000000,0.000000e+00,0.000702,0.075547,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.026900,0.000000,0.000000,0.000000,0.000000,0.000000
15795,0.000041,0.000000,0.000000e+00,0.000000,0.046239,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000527,0.000000,0.000000,0.000000,0.000412,0.000000,0.000000,0.000000,0.001214


In [25]:
nmf_50topics.max(axis=1).idxmax()

29340

In [26]:
nmf_50topics.iloc[0].sort_values()

topic_24    0.000000e+00
topic_27    0.000000e+00
topic_29    0.000000e+00
topic_15    0.000000e+00
topic_14    0.000000e+00
topic_13    0.000000e+00
topic_12    0.000000e+00
topic_30    0.000000e+00
topic_31    0.000000e+00
topic_9     0.000000e+00
topic_37    0.000000e+00
topic_38    0.000000e+00
topic_42    0.000000e+00
topic_5     0.000000e+00
topic_44    0.000000e+00
topic_46    0.000000e+00
topic_1     0.000000e+00
topic_25    0.000000e+00
topic_23    0.000000e+00
topic_2     4.689159e-07
topic_0     5.198011e-06
topic_10    2.118179e-05
topic_17    3.430456e-05
topic_32    4.741704e-05
topic_35    8.872725e-05
topic_20    9.973198e-05
topic_22    1.142687e-04
topic_6     1.621451e-04
topic_16    2.432271e-04
topic_28    2.938739e-04
topic_41    3.071341e-04
topic_4     3.211737e-04
topic_7     3.433245e-04
topic_34    3.635730e-04
topic_19    3.867972e-04
topic_11    4.250823e-04
topic_40    4.744653e-04
topic_3     6.369079e-04
topic_47    6.537482e-04
topic_43    7.174449e-04


In [27]:
nmf_50topics.idxmax(axis=1)

3609     topic_49
12344    topic_10
12027    topic_31
4696     topic_11
23119    topic_44
           ...   
29802    topic_43
5390     topic_46
860       topic_4
15795    topic_37
23654    topic_21
Length: 25128, dtype: object

In [28]:
haikus_train_df.loc[29340][line_cols], nmf_50topics_details.loc[nmf_50topics.loc[29340].idxmax()]

(0                  And I
 1    shall have nothing,
 2               nothing!
 Name: 29340, dtype: object,
 word_0         shall
 word_1          look
 word_2          hear
 word_3          tell
 word_4    shall hear
 word_5    shall look
 word_6          meet
 word_7        heaven
 word_8          thee
 word_9          feel
 Name: topic_26, dtype: object)

## 2. With GloVe embeddings

In [30]:
import os

glove_file = './data/image_to_text/glove.840B.300d.txt'
tmp_file = './data/image_to_text/glovetmp.txt'

if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [31]:
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += glove_model[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [32]:
haiku_glove_train = haikus_train_df['text'].apply(lambda s: buildWordVector(s.split(' '), 300)[0])
haiku_glove_test = haikus_test_df['text'].apply(lambda s: buildWordVector(s.split(' '), 300)[0])

haiku_glove_train_df = pd.DataFrame(list(haiku_glove_train), index=haikus_train_df.index).add_prefix('glove_')
haiku_glove_test_df = pd.DataFrame(list(haiku_glove_test), index=haikus_test_df.index).add_prefix('glove_')

In [33]:
haiku_glove_train_df

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,...,glove_290,glove_291,glove_292,glove_293,glove_294,glove_295,glove_296,glove_297,glove_298,glove_299
3609,0.071281,0.144642,0.040427,-0.004059,0.278126,0.092235,-0.164611,0.078713,-0.024613,1.892366,...,-0.122115,0.027652,-0.111866,-0.058247,-0.054992,0.061078,0.093764,-0.007376,-0.133955,-0.090081
12344,-0.082213,0.056645,0.127280,-0.330566,0.092171,-0.079778,0.543255,0.101867,-0.110720,-0.493580,...,0.098641,-0.404362,-0.269700,0.061225,0.083722,0.011395,-0.152077,0.341448,-0.216030,0.013653
12027,0.021070,0.096822,-0.164946,-0.067310,0.060695,-0.049959,0.068478,-0.109064,-0.026961,2.684088,...,-0.323211,0.020446,-0.003793,-0.018777,0.040273,0.065825,-0.032649,0.066116,0.043149,0.038904
4696,0.151458,0.186946,-0.182676,-0.076836,0.059533,-0.110118,-0.147122,0.084796,-0.007867,1.575796,...,-0.242630,0.029339,0.209571,-0.152839,-0.078191,0.091482,0.174094,0.071815,-0.273848,0.003690
23119,0.151909,0.088393,-0.025562,-0.032305,0.238123,0.107035,-0.058097,0.050178,-0.090315,1.781929,...,-0.144367,0.011191,0.072155,-0.245300,-0.110313,-0.028663,0.047038,0.106898,-0.280544,-0.096316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,0.063456,0.065163,-0.226104,-0.085388,0.115210,-0.057175,-0.030782,-0.143396,0.019982,2.057132,...,-0.335843,-0.036722,0.036782,0.100949,-0.040385,0.115409,0.174129,0.024463,0.125788,0.020009
5390,-0.025521,-0.070567,-0.223037,-0.148321,-0.067061,-0.064510,0.051353,-0.084710,0.003703,1.630664,...,-0.140885,0.130814,0.046790,-0.225410,-0.060154,-0.077138,-0.100798,-0.041543,0.210341,0.107860
860,0.055426,0.206669,-0.029647,-0.125029,0.031141,-0.148260,-0.244038,0.144470,-0.002504,1.701388,...,-0.270555,0.079200,0.138867,-0.312291,-0.250849,0.078958,0.095515,-0.017021,-0.011959,0.131399
15795,0.101601,0.110988,-0.073208,-0.027221,0.133731,-0.048321,-0.122075,0.060302,0.027139,2.233458,...,-0.361302,-0.053324,-0.009555,-0.090044,-0.039933,0.109818,-0.055122,0.027885,0.013973,0.029866
