In [26]:
!pip install syllapy
#if syllapy is not installed

[0m

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import syllapy
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

# See Data 

In [28]:
#reading csv and storing into pandas DataFrame
haiku_pd = pd.DataFrame()

#Original Size of data 110938 rows × 190 columns
haiku_df  = pd.read_csv('/kaggle/input/haiku-dataset/all_haiku.csv', low_memory=False)

# drop rows with NaN value and reset index
# size after dropping columns with NaN (64117, 190)
haiku_df = haiku_df.dropna()
haiku_df = haiku_df.reset_index(drop=True)

#checking to see if data was stored
print("Size of data:", haiku_df.shape)
haiku_df.tail()
haiku_df[0::10]

Size of data: (144122, 6)


Unnamed: 0.1,Unnamed: 0,0,1,2,source,hash
0,0,fishing boats,colors of,the rainbow,tempslibres,FISHINGBOATSCOLORSOFTHERAINBOW
10,10,rain,the sound of a horse galloping,through leaves,tempslibres,RAINTHESOUNDOFAHORSEGALLOPINGTHROUGHLEAVES
20,20,morning frost,she leaves,first,tempslibres,MORNINGFROSTSHELEAVESFIRST
30,30,instant message--,moon reveals more,of herself each night,tempslibres,INSTANTMESSAGEMOONREVEALSMOREOFHERSELFEACHNIGHT
40,42,rain,falls from the trees,on the blue iris,tempslibres,RAINFALLSFROMTHETREESONTHEBLUEIRIS
...,...,...,...,...,...,...
144080,117968,I have a headache,Why do Italians have to,be so damn loud for,twaiku,IHAVEAHEADACHEWHYDOITALIANSHAVETOBESODAMNLOUDFOR
144090,117978,The people who just,heard of Drugs You Should Try It,is some lame ass fans,twaiku,THEPEOPLEWHOJUSTHEARDOFDRUGSYOUSHOULDTRYITISSO...
144100,117988,My dad's okay Thank,god he didn't get murdered,in a back alley,twaiku,MYDADSOKAYTHANKGODHEDIDNTGETMURDEREDINABACKALLEY
144110,118000,newt muttering this,is the worst while looking for,his glasses big mood,twaiku,NEWTMUTTERINGTHISISTHEWORSTWHILELOOKINGFORHISG...


# Clean Data
 - Drop all columns that do not contain iportant information.
 - Lower case words.
 - Make sure the format (5, 7, 5) is correct.

In [29]:
#Dropping all columns that do not contain iportant information
haiku_df = haiku_df[['0', '1', '2']]
haiku_df

Unnamed: 0,0,1,2
0,fishing boats,colors of,the rainbow
1,ash wednesday--,trying to remember,my dream
2,snowy morn--,pouring another cup,of black coffee
3,shortest day,flames dance,in the oven
4,haze,half the horse hidden,behind the house
...,...,...,...
144117,I'm not asking did,you say it nor clarify,what you said neither
144118,You are truly a,moron or a liar I'm,inclined to think both
144119,Ain't no selfie on,this earth that's gonna make me,like Theresa May
144120,is doing a great,job turning Independents,into Democrats


In [30]:
#Lower case words.  
for x in haiku_df[['0','1','2']]:
    haiku_df[x] = haiku_df[x].str.lower()
haiku_df

Unnamed: 0,0,1,2
0,fishing boats,colors of,the rainbow
1,ash wednesday--,trying to remember,my dream
2,snowy morn--,pouring another cup,of black coffee
3,shortest day,flames dance,in the oven
4,haze,half the horse hidden,behind the house
...,...,...,...
144117,i'm not asking did,you say it nor clarify,what you said neither
144118,you are truly a,moron or a liar i'm,inclined to think both
144119,ain't no selfie on,this earth that's gonna make me,like theresa may
144120,is doing a great,job turning independents,into democrats


In [31]:
#Make sure the format (5, 7, 5) is correct.
def whole_sentence(haiku):
    for i, sentence in enumerate(haiku):
        count = 0
        try:
            for word in sentence.split():
                count += syllapy.count(word)
            if i == 0 or i == 2:
                if count != 5:
                    return False
            elif i == 1:
                if count != 7:
                    return False
                    break
        except:
            return False
    return True


arr_haiku = haiku_df[['0','1','2']].to_numpy()

haiku_df = haiku_df[[whole_sentence(haiku) for haiku in arr_haiku]]


In [32]:
#Make data into a list for train_test_split
haiku_data = haiku_df[['0','1','2']].to_numpy()

#Reduce dimensionality while keeping the same format
haiku_list = []
for haiku in haiku_data:
    haiku_text = '\n'.join([''.join(line) for line in haiku])
    haiku_list.append(haiku_text)

haiku_train_data, haiku_test_data = train_test_split(haiku_list, test_size=0.2)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(haiku_train_data)

haiku_train_sequences = tokenizer.texts_to_sequences(haiku_train_data)
haiku_test_sequences = tokenizer.texts_to_sequences(haiku_test_data)

haiku_max_sequence_length = max(max(len(seq) for seq in haiku_train_sequences), max(len(seq) for seq in haiku_test_sequences))

haiku_X_train = pad_sequences(haiku_train_sequences, maxlen=haiku_max_sequence_length, padding='post')
haiku_X_test = pad_sequences(haiku_test_sequences, maxlen=haiku_max_sequence_length, padding='post')

haiku_y_train = np.roll(haiku_X_train, -1, axis=1)
haiku_y_test = np.roll(haiku_X_test, -1, axis=1)

haiku_y_train[:, -1] = 0
haiku_y_test[:, -1] = 0

In [33]:
print(haiku_X_train.shape)
print(haiku_y_train.shape[1])

(57146, 17)
17


In [34]:
#Haiku Model using RNN

haiku_vocab = len(tokenizer.word_index) + 1
print(haiku_vocab)
embedding_dim = 128

haiku_model = Sequential()
haiku_model.add(Embedding(haiku_vocab, embedding_dim))
haiku_model.add(LSTM(units=128, return_sequences=True))
haiku_model.add(LSTM(units=128))
haiku_model.add(Dense(units=haiku_X_train.shape[1], activation='softmax'))


# Compile the model
haiku_model.compile(loss='categorical_crossentropy', optimizer='adam')

haiku_model.summary()

20957
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 128)         2682496   
                                                                 
 lstm_4 (LSTM)               (None, None, 128)         131584    
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 17)                2193      
                                                                 
Total params: 2,947,857
Trainable params: 2,947,857
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train the model
haiku_model.fit(haiku_X_train, haiku_y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7accc8205030>

In [46]:
def generate_haiku(haiku_model, haiku_list):
    haiku_start_word = random.choice(haiku_list)
#     print(start_word)

    for _ in range(1):
        haiku_start_word_index = haiku_list.index(haiku_start_word)
#         print(start_word_index)
        haiku_next_word_probs = haiku_model.predict(
            np.array([[haiku_start_word_index]]))
#         print(next_word_probs)
        haiku_next_word = np.argmax(haiku_next_word_probs)
#         print(next_word)
        haiku_start_word = haiku_list[haiku_next_word]
#         print(start_word)

    return haiku_start_word 


generate_haiku(haiku_model, haiku_list)



'alabaster moon ~\nglowing among bare pine tops\nin cool mountain mist'