In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

# The haiku dataset: loading and initial cleaning

This was obtained from the dataset created by Jeremy Neiman for use in his own haiku generation model, published in the last few days of 2018; Medium post <a href="https://towardsdatascience.com/generating-haiku-with-deep-learning-dbf5d18b4246">here</a> and Github for the dataset <a href="https://github.com/docmarionum1/haikurnn/tree/master/input/poems">here</a>.

In [2]:
haikus_df = pd.read_csv('./data/image_to_text/haikus.csv')

In [3]:
haikus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143137 entries, 0 to 143136
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   0            143120 non-null  object
 1   1            143123 non-null  object
 2   2            142954 non-null  object
 3   source       143137 non-null  object
 4   0_syllables  143137 non-null  object
 5   1_syllables  143137 non-null  object
 6   2_syllables  143137 non-null  object
dtypes: object(7)
memory usage: 7.6+ MB


In [4]:
haikus_df.source.value_counts()

twaiku         111727
img2poems       11808
sballas          8142
gutenberg        5524
tempslibres      4800
haikuzao         1136
Name: source, dtype: int64

Neiman discardes the twaiku source from his final model because the poetry there appears to be low quality

In [5]:
haikus_notwitter_df = haikus_df[haikus_df.source != 'twaiku']

haikus_notwitter_df.source.value_counts()

img2poems      11808
sballas         8142
gutenberg       5524
tempslibres     4800
haikuzao        1136
Name: source, dtype: int64

In [6]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,23,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,34,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,1314,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


For the sake of simplicity, will only look at the lower syllable count where there are two values

In [7]:
syllable_cols = ['0_syllables', '1_syllables', '2_syllables']

for col in syllable_cols:
    haikus_notwitter_df[col] = haikus_notwitter_df[col].apply(lambda x: int(x.split(',')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
haikus_notwitter_df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,2,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,3,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
31405,"Jupiter's throne, so dishonestly","won, it was I who secured it: Color and ivory,","marble and bronze, not to mention the poems.",gutenberg,9,13,11
31406,"Now, all intelligent",men look upon me,in kindness.,gutenberg,6,5,3
31407,They like to Form their,"own image of me, just as",the poet has done.,gutenberg,5,7,5
31408,Nor do the girls take,offense when they see me--by no,means the matrons.,gutenberg,5,7,4


In [9]:
haikus_notwitter_df[haikus_notwitter_df['source'].isna()].head(5)

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables


Replace NaN in '0' - '2' with '', drop the remaining null entry

In [10]:
line_cols = ['0', '1', '2']

for col in line_cols:
    haikus_notwitter_df[col].fillna('', inplace=True)
    
haikus_notwitter_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Create a new column that has the whole text of the 3-line poems

In [11]:
haikus_notwitter_df['text'] = haikus_notwitter_df['0'] + ' ' + haikus_notwitter_df['1'] + ' ' + haikus_notwitter_df['2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


...a column with all lower case and without punctuation..

In [12]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

haikus_notwitter_df['text_clean'] = haikus_notwitter_df.text.map(alphanumeric).map(punc_lower)

haikus_notwitter_df['0_clean'] = haikus_notwitter_df['0'].map(alphanumeric).map(punc_lower)
haikus_notwitter_df['1_clean'] = haikus_notwitter_df['1'].map(alphanumeric).map(punc_lower)
haikus_notwitter_df['2_clean'] = haikus_notwitter_df['2'].map(alphanumeric).map(punc_lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

...and one that has tokens for line breaks and end of poem <strike>and is a list of words/tokens</strike>

In [13]:
haikus_notwitter_df['text_withtokens'] = haikus_notwitter_df['0'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['1'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['2'].apply(lambda x: x.split(' ') + ['<eNd>'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# haikus_notwitter_df['text_withtokens'] = haikus_notwitter_df['0'] + ' <nEXt> ' \
#                                          + haikus_notwitter_df['1'] + ' <nEXt> ' \
#                                          + haikus_notwitter_df['2'] + ' <eNd>'

In [15]:
# Using ↕ to represent a new line and ◘ to represent end of poe

haikus_notwitter_df['textchar_withtokens'] = haikus_notwitter_df['0'] + '↕' \
                                         + haikus_notwitter_df['1'] + '↕' \
                                         + haikus_notwitter_df['2'] + '◘'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [16]:
haikus_notwitter_df['text_withtokens_clean'] = haikus_notwitter_df['0_clean'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['1_clean'].apply(lambda x: x.split(' ') + ['<nEXt>']) \
                                        + haikus_notwitter_df['2_clean'].apply(lambda x: x.split(' ') + ['<eNd>'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Split into train and test (80/20)

In [17]:
from sklearn.model_selection import train_test_split

haikus_train_df, haikus_test_df = train_test_split(haikus_notwitter_df, test_size=0.2, random_state=RANDOM)

In [18]:
haikus_train_df.head(3).T

Unnamed: 0,3609,12344,12027
0,an oasis,amoretti sonnet xxvi,there when they came mind suffered shame
1,in the Bible Belt --,e,`these be the same and not the same
2,adult book store,spenser,a-wondering whispered mind
source,tempslibres,img2poems,img2poems
0_syllables,4,7,8
1_syllables,5,1,7
2_syllables,4,2,4
text,an oasis in the Bible Belt -- adult book store,amoretti sonnet xxvi e spenser,there when they came mind suffered shame `thes...
text_clean,an oasis in the bible belt adult book store,amoretti sonnet xxvi e spenser,there when they came mind suffered shame thes...
0_clean,an oasis,amoretti sonnet xxvi,there when they came mind suffered shame


Save this for training on seperate models

In [19]:
haikus_train_df.to_pickle('./data/haikus_train_df.pickle')
haikus_test_df.to_pickle('./data/haikus_test_df.pickle')

# Topic modelling
## 1. From scratch

In [20]:
from gensim.models import KeyedVectors, Doc2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.feature_extraction.text import TfidfVectorizer



In [21]:
#vectorise with TF-IDF
tv = TfidfVectorizer(stop_words='english')

haiku_train_tv = tv.fit_transform(haikus_train_df['text_clean'])
haiku_test_tv  = tv.transform(haikus_test_df['text_clean'])

len(tv.get_feature_names())

23462

In [22]:
tv2 = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

haiku_train_tv2 = tv2.fit_transform(haikus_train_df['text_clean'])
haiku_test_tv2  = tv2.transform(haikus_test_df['text_clean'])

len(tv2.get_feature_names())

160557

23.5k features for unigram, explodes to 160.6k features with bigrams

Let's see what basic topic modelling comes up with

In [23]:
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [24]:
def top_words(model, feature_names, n_top_words):
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        word_list = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topic_list.append(word_list)

    return topic_list

In [25]:
nmf_model = NMF(20, random_state=RANDOM)
nmf_topic = nmf_model.fit_transform(haiku_train_tv)
pd.DataFrame(top_words(nmf_model, tv.get_feature_names(), 10)).add_prefix('word_').rename('topic_{}'.format)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,thy,thou,thee,shall,come,god,heart,man,said,life
topic_1,moon,harvest,crescent,new,half,window,just,rising,puddle,cold
topic_2,rain,sound,scent,smell,cold,window,soft,garden,steady,heavy
topic_3,night,stars,cold,late,moonless,starry,dark,sleepless,sleep,window
topic_4,day,year,mother,valentine,memorial,warm,new,hot,rainy,end
topic_5,morning,fog,coffee,mist,early,cold,cup,haze,sunday,frost
topic_6,summer,end,indian,late,heat,solstice,sound,river,evening,dusk
topic_7,autumn,leaves,falling,evening,equinox,chill,dusk,deep,sunset,fallen
topic_8,winter,solstice,stars,deep,cold,late,cat,train,comes,hands
topic_9,old,new,man,year,woman,dog,days,wall,leaves,young


In [26]:
nmf_model2 = NMF(20, random_state=RANDOM)
nmf_topic2 = nmf_model2.fit_transform(haiku_train_tv2)
pd.DataFrame(top_words(nmf_model2, tv2.get_feature_names(), 10)).add_prefix('word_').rename('topic_{}'.format)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,rain,winter rain,spring rain,summer rain,autumn rain,sound,scent,smell,night rain,cold
topic_1,moon,harvest,harvest moon,crescent,crescent moon,half,new moon,half moon,day moon,window
topic_2,thy,thou,shall,thee,like,heart,god,life,said,dead
topic_3,night,long,winter night,moonless night,moonless,cold,late night,stars,late,starry
topic_4,day,long,valentine day,valentine,mother,day day,memorial day,memorial,mother day,warm
topic_5,morning,fog,morning fog,morning sun,spring morning,coffee,early,early morning,mist,winter morning
topic_6,summer,end,summer end,indian summer,indian,late,summer rain,late summer,heat,end summer
topic_7,winter,winter rain,winter night,solstice,winter solstice,stars,winter stars,deep,deep winter,winter morning
topic_8,leaves,falling,fallen,fallen leaves,falling leaves,autumn leaves,red,fall,maple,yellow
topic_9,sky,blue,blue sky,clouds,white,sea,stars,high,color,eyes


TD-IDF with bigrams seems to do much better at retrieving relevant topics. Let's try 50 topics

In [27]:
nmf_model2 = NMF(50, random_state=RANDOM)
nmf_topic2 = nmf_model2.fit_transform(haiku_train_tv2)
nmf_50topics_details = pd.DataFrame(top_words(nmf_model2, tv2.get_feature_names(), 10))\
                        .add_prefix('word_').rename('topic_{}'.format)

nmf_50topics_details

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
topic_0,rain,winter rain,spring rain,summer rain,autumn rain,night rain,smell,scent,soft,soft rain
topic_1,moon,harvest,harvest moon,crescent,crescent moon,new moon,half,half moon,day moon,winter moon
topic_2,thy,thou,thee,art,thou art,hast,art thou,thou hast,shalt,thine
topic_3,night,winter night,moonless night,moonless,starry,starry night,summer night,night rain,late night,night moon
topic_4,day,valentine day,valentine,day day,memorial day,memorial,day moon,warm,spring day,mother day
topic_5,morning,fog,morning fog,morning sun,spring morning,early morning,early,winter morning,mist,coffee
topic_6,summer,indian summer,indian,summer rain,heat,summer end,summer heat,summer night,late summer,solstice
topic_7,autumn,autumn rain,autumn wind,autumn leaves,equinox,autumn chill,autumn equinox,chill,dusk,autumn sun
topic_8,winter,winter rain,winter night,solstice,winter solstice,deep,winter morning,deep winter,winter moon,winter stars
topic_9,sky,blue,blue sky,color,autumn sky,winter sky,sea,sunglasses blue,eyes,clear


The 50 topics also seem pretty decent, let's go with that for now.

In [28]:
nmf_50topics = pd.DataFrame(nmf_topic2, index=haikus_train_df.index).add_prefix('topic_')

nmf_50topics

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49
3609,0.000005,0.000000,4.689159e-07,0.000637,0.000321,0.000000,0.000162,0.000343,0.001079,0.000000,...,0.000474,0.000307,0.000000,0.000717,0.000000,0.001847,0.000000,0.000654,0.000891,0.002135
12344,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000007,0.000051,0.000000,0.000000,0.000000,0.000011,0.000000,0.000010,0.000000,0.000011
12027,0.000017,0.000000,2.272065e-04,0.000531,0.000070,0.000036,0.000000,0.000331,0.000000,0.000000,...,0.002775,0.000000,0.001250,0.000812,0.000000,0.003113,0.000000,0.000797,0.002742,0.002847
4696,0.000020,0.000017,9.253097e-05,0.000000,0.001024,0.000101,0.002012,0.000199,0.000000,0.000000,...,0.000154,0.000000,0.000864,0.000000,0.000235,0.000000,0.000000,0.000254,0.000000,0.000000
23119,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.010232,0.000129,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.002015,0.071436,0.000000,0.006212,0.000047,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,0.000022,0.000124,2.787117e-04,0.000549,0.000469,0.000000,0.000184,0.000210,0.000104,0.001658,...,0.001719,0.001038,0.002611,0.008232,0.000459,0.000929,0.000000,0.002618,0.000000,0.000572
5390,0.000000,0.000973,0.000000e+00,0.000000,0.000191,0.000138,0.000000,0.000450,0.000000,0.000000,...,0.000075,0.000461,0.000000,0.000188,0.001418,0.000000,0.004348,0.000000,0.000829,0.001437
860,0.000000,0.000000,0.000000e+00,0.000702,0.075547,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.026900,0.000000,0.000000,0.000000,0.000000,0.000000
15795,0.000041,0.000000,0.000000e+00,0.000000,0.046239,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000527,0.000000,0.000000,0.000000,0.000412,0.000000,0.000000,0.000000,0.001214


In [29]:
nmf_50topics.max(axis=1).idxmax()

29340

In [30]:
nmf_50topics.iloc[0].sort_values()

topic_24    0.000000e+00
topic_27    0.000000e+00
topic_29    0.000000e+00
topic_15    0.000000e+00
topic_14    0.000000e+00
topic_13    0.000000e+00
topic_12    0.000000e+00
topic_30    0.000000e+00
topic_31    0.000000e+00
topic_9     0.000000e+00
topic_37    0.000000e+00
topic_38    0.000000e+00
topic_42    0.000000e+00
topic_5     0.000000e+00
topic_44    0.000000e+00
topic_46    0.000000e+00
topic_1     0.000000e+00
topic_25    0.000000e+00
topic_23    0.000000e+00
topic_2     4.689159e-07
topic_0     5.198011e-06
topic_10    2.118179e-05
topic_17    3.430456e-05
topic_32    4.741704e-05
topic_35    8.872725e-05
topic_20    9.973198e-05
topic_22    1.142687e-04
topic_6     1.621451e-04
topic_16    2.432271e-04
topic_28    2.938739e-04
topic_41    3.071341e-04
topic_4     3.211737e-04
topic_7     3.433245e-04
topic_34    3.635730e-04
topic_19    3.867972e-04
topic_11    4.250823e-04
topic_40    4.744653e-04
topic_3     6.369079e-04
topic_47    6.537482e-04
topic_43    7.174449e-04


In [31]:
nmf_50topics.idxmax(axis=1)

3609     topic_49
12344    topic_10
12027    topic_31
4696     topic_11
23119    topic_44
           ...   
29802    topic_43
5390     topic_46
860       topic_4
15795    topic_37
23654    topic_21
Length: 25128, dtype: object

In [32]:
haikus_train_df.loc[29340][line_cols], nmf_50topics_details.loc[nmf_50topics.loc[29340].idxmax()]

(0                  And I
 1    shall have nothing,
 2               nothing!
 Name: 29340, dtype: object,
 word_0         shall
 word_1          look
 word_2          hear
 word_3          tell
 word_4    shall hear
 word_5    shall look
 word_6          meet
 word_7        heaven
 word_8          thee
 word_9          feel
 Name: topic_26, dtype: object)

## 2. With GloVe embeddings

In [33]:
import os

glove_file = './data/image_to_text/glove.840B.300d.txt'
tmp_file = './data/image_to_text/glovetmp.txt'

if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += glove_model[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [35]:
haiku_glove_train = haikus_train_df['text'].apply(lambda s: buildWordVector(s.split(' '), 300)[0])
haiku_glove_test = haikus_test_df['text'].apply(lambda s: buildWordVector(s.split(' '), 300)[0])

haiku_glove_train_df = pd.DataFrame(list(haiku_glove_train), index=haikus_train_df.index).add_prefix('glove_')
haiku_glove_test_df = pd.DataFrame(list(haiku_glove_test), index=haikus_test_df.index).add_prefix('glove_')

In [36]:
haiku_glove_train_df

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,...,glove_290,glove_291,glove_292,glove_293,glove_294,glove_295,glove_296,glove_297,glove_298,glove_299
3609,0.071281,0.144642,0.040427,-0.004059,0.278126,0.092235,-0.164611,0.078713,-0.024613,1.892366,...,-0.122115,0.027652,-0.111866,-0.058247,-0.054992,0.061078,0.093764,-0.007376,-0.133955,-0.090081
12344,-0.082213,0.056645,0.127280,-0.330566,0.092171,-0.079778,0.543255,0.101867,-0.110720,-0.493580,...,0.098641,-0.404362,-0.269700,0.061225,0.083722,0.011395,-0.152077,0.341448,-0.216030,0.013653
12027,0.021070,0.096822,-0.164946,-0.067310,0.060695,-0.049959,0.068478,-0.109064,-0.026961,2.684088,...,-0.323211,0.020446,-0.003793,-0.018777,0.040273,0.065825,-0.032649,0.066116,0.043149,0.038904
4696,0.151458,0.186946,-0.182676,-0.076836,0.059533,-0.110118,-0.147122,0.084796,-0.007867,1.575796,...,-0.242630,0.029339,0.209571,-0.152839,-0.078191,0.091482,0.174094,0.071815,-0.273848,0.003690
23119,0.151909,0.088393,-0.025562,-0.032305,0.238123,0.107035,-0.058097,0.050178,-0.090315,1.781929,...,-0.144367,0.011191,0.072155,-0.245300,-0.110313,-0.028663,0.047038,0.106898,-0.280544,-0.096316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,0.063456,0.065163,-0.226104,-0.085388,0.115210,-0.057175,-0.030782,-0.143396,0.019982,2.057132,...,-0.335843,-0.036722,0.036782,0.100949,-0.040385,0.115409,0.174129,0.024463,0.125788,0.020009
5390,-0.025521,-0.070567,-0.223037,-0.148321,-0.067061,-0.064510,0.051353,-0.084710,0.003703,1.630664,...,-0.140885,0.130814,0.046790,-0.225410,-0.060154,-0.077138,-0.100798,-0.041543,0.210341,0.107860
860,0.055426,0.206669,-0.029647,-0.125029,0.031141,-0.148260,-0.244038,0.144470,-0.002504,1.701388,...,-0.270555,0.079200,0.138867,-0.312291,-0.250849,0.078958,0.095515,-0.017021,-0.011959,0.131399
15795,0.101601,0.110988,-0.073208,-0.027221,0.133731,-0.048321,-0.122075,0.060302,0.027139,2.233458,...,-0.361302,-0.053324,-0.009555,-0.090044,-0.039933,0.109818,-0.055122,0.027885,0.013973,0.029866


# Stanza

In [37]:
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 1.53MB/s]                    
2020-06-25 17:14:36 INFO: Downloading default packages for language: en (English)...
2020-06-25 17:14:38 INFO: File exists: C:\Users\vi_ci\stanza_resources\en\default.zip.
2020-06-25 17:14:42 INFO: Finished downloading models and saved to C:\Users\vi_ci\stanza_resources.


In [38]:
nlp = stanza.Pipeline('en', use_gpu=True, verbose=True, pos_batch_size=3000)
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

2020-06-25 17:14:42 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-06-25 17:14:42 INFO: Use device: cpu
2020-06-25 17:14:42 INFO: Loading: tokenize
2020-06-25 17:14:42 INFO: Loading: pos
2020-06-25 17:14:43 INFO: Loading: lemma
2020-06-25 17:14:43 INFO: Loading: depparse
2020-06-25 17:14:44 INFO: Loading: ner
2020-06-25 17:14:44 INFO: Done loading processors!


('Barack', '4', 'nsubj:pass')
('Obama', '1', 'flat')
('was', '4', 'aux:pass')
('born', '0', 'root')
('in', '6', 'case')
('Hawaii', '4', 'obl')
('.', '4', 'punct')


In [43]:
import os

if os.path.isfile('./data/poems_stanza.pickle'):
    poems_stanza = pd.read_pickle('./data/poems_stanza.pickle')
else:
    poems_stanza = haikus_train_df['text'].apply(nlp)
    poems_stanza.to_pickle('./data/poems_stanza.pickle')