In [None]:
import keras
import numpy as np
import pandas as pd
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Dense, LSTM, Dropout, Input
from sklearn.model_selection import train_test_split

In [None]:
# Load the season-1 transcript CSV and re-export it as a colon-separated text file.
season1 = pd.read_csv(
    "/content/season1.csv",
    encoding_errors='ignore',  # skip bytes that cannot be decoded instead of raising
)
season1.head()  # not displayed: only the final expression of a cell is rendered

# index=False keeps the row index out of the file; the header row is still written.
season1.to_csv('/content/season1.txt', sep=':', index=False)



In [None]:
data_dir = '/content/season1.txt'

# Read the whole exported transcript into memory.
with open(data_dir) as transcript_file:
    data = transcript_file.read()

# Drop the first 10 characters (presumably the header line written by to_csv --
# TODO confirm against the file) and lowercase everything.
data = data[10:].lower()

# Separate the punctuation from the words by padding each mark with a space on
# both sides, so that a later whitespace split yields punctuation as tokens.
punch = ['.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&', '\n']
padding_table = {ord(mark): ' ' + mark + ' ' for mark in punch}
# Line breaks become an explicit, space-padded <NEWLINE> token.
padding_table[ord('\n')] = ' <NEWLINE> '

data = data.translate(padding_table)

In [None]:
# Preview the first 400 characters of the tokenised text as a sanity check.
data[:400]

"melanie :  why are you late ?  <NEWLINE> rebecca :  you ' re not going to like the answer .  <NEWLINE> melanie :  i already know the answer .  <NEWLINE> rebecca :  i missed the bus .  <NEWLINE> melanie :  i don ' t doubt it ,  no bus stops near brad ' s .  you spent the night ,  the alarm didn ' t work .  or maybe it did .  <NEWLINE> rebecca :  i didn ' t sleep with him .  <NEWLINE> melanie :  gir"

In [None]:
def get_vocab(text):
    """Build the vocabulary of a whitespace-tokenised text.

    Args:
        text: String whose tokens are separated by whitespace.

    Returns:
        A tuple ``(vocab, vocab_to_int, int_to_vocab)`` where ``vocab`` is a
        ``Counter`` of token frequencies, ``vocab_to_int`` maps each token to
        an integer id and ``int_to_vocab`` is the inverse mapping. Ids are
        assigned from 0 in order of first appearance in ``text``.
    """
    vocab = Counter(text.split())

    # Counter keeps first-seen order, so enumerate() yields first-appearance ids.
    vocab_to_int = {token: idx for idx, token in enumerate(vocab)}
    int_to_vocab = {idx: token for token, idx in vocab_to_int.items()}

    return vocab, vocab_to_int, int_to_vocab

vocab, vocab_to_int, int_to_vocab = get_vocab(data)

# Encode the whole text as an array of integer token ids, one per token.
text_int = np.array([vocab_to_int[token] for token in data.split()])

In [None]:
seq_len = 200

def get_training_data(data, seq_len):
    """Cut a token sequence into overlapping input/target windows.

    For every start offset ``i`` in ``range(len(data) - seq_len)`` the input
    window is ``data[i:i + seq_len]`` and the target window is the same span
    shifted one position to the right.

    Args:
        data: Sliceable sequence of token ids.
        seq_len: Length of every window.

    Returns:
        ``(x_train, y_train)``: two lists of NumPy arrays, one array per window.
        Both lists are empty when ``len(data) <= seq_len``.
    """
    n_windows = len(data) - seq_len

    x_train = [np.array(data[start:start + seq_len]) for start in range(n_windows)]
    y_train = [np.array(data[start + 1:start + seq_len + 1]) for start in range(n_windows)]

    return x_train, y_train
  
# Stack the per-window arrays into two 2-D arrays of shape (num_windows, seq_len).
x, y = (np.array(windows) for windows in get_training_data(text_int, seq_len))

# Give the targets a trailing singleton axis: (num_windows, seq_len, 1).
num_windows, window_len = y.shape
y = y.reshape(num_windows, window_len, 1)

In [None]:
embedding = 300          # size of each learned word-embedding vector
lstm_size = 128          # number of units in every LSTM layer
vocab_size = len(vocab)  # one output class per distinct token

# Sequences of token ids; the time dimension is left unspecified.
inp = Input((None,))

# The layers are created once and kept in variables because the inference
# model built later in the notebook calls the very same layer objects.
embed = Embedding(input_dim=vocab_size, output_dim=embedding)
lstm1 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm2 = LSTM(lstm_size, return_sequences=True, return_state=True)
lstm3 = LSTM(lstm_size, return_sequences=True, return_state=True)
# The model is compiled with the string loss 'sparse_categorical_crossentropy',
# which interprets predictions as probabilities (from_logits=False). The output
# layer therefore needs a softmax; without it the loss was computed on raw,
# unnormalised scores. argmax-based decoding is unaffected by the softmax.
dense = Dense(vocab_size, activation='softmax')

# Training graph: embedding -> 3 stacked LSTMs -> per-timestep distribution.
# The returned h/c states are not used during training.
net = embed(inp)
net, h1, c1 = lstm1(net)
net, h2, c2 = lstm2(net)
net, h3, c3 = lstm3(net)
out = dense(net)

model = Model(inp, out)

In [None]:
# Compile exactly once, with an explicitly constructed optimizer so that the
# learning rate is visible and actually applied.
#
# Previously the model was compiled, `model.optimizer.lr` was set to 0.05, and
# the model was compiled again with optimizer='rmsprop'. The second compile
# builds a fresh optimizer from the string, so the 0.05 assignment was silently
# discarded and training ran at the RMSprop default. That effective rate
# (0.001) is kept here; change it deliberately if a different rate is wanted.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Keep the History object (per-epoch loss/accuracy) instead of printing its repr.
history = model.fit(x, y, batch_size=128, epochs=4, shuffle=True)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
<keras.callbacks.History object at 0x7faface0ff10>


In [None]:

# One (h, c) pair of state inputs per LSTM layer: 3 layers -> 6 tensors.
init_states = [Input((lstm_size,)) for _ in range(6)]

# Re-wire the already-trained layers so that the recurrent state is fed in
# and handed back explicitly on every call.
inference = embed(inp)
states = []
for layer_no, lstm_layer in enumerate((lstm1, lstm2, lstm3)):
    layer_init = init_states[2 * layer_no:2 * layer_no + 2]
    inference, h, c = lstm_layer(inference, initial_state=layer_init)
    states.extend([h, c])
inf_out = dense(inference)

# Inputs: token ids + 6 states. Outputs: dense output + 6 updated states,
# ordered [h1, c1, h2, c2, h3, c3].
inf_model = Model([inp] + init_states, [inf_out] + states)

In [None]:
def extract_text(length, start):
    """Generate text greedily with the inference model.

    Starting from the token id ``start`` and all-zero LSTM states, the model is
    queried one token at a time; the highest-scoring token is appended and fed
    back in together with the returned states.

    Args:
        length: Number of tokens to generate after the start token.
        start: Integer id of the first token.

    Returns:
        A string of ``length + 1`` tokens, each followed by a single space.
    """
    states = [np.zeros((1, lstm_size)) for _ in range(6)]

    # A batch holding one sequence that is one token long.
    token = np.zeros((1, 1))
    token[0, 0] = start

    pieces = [int_to_vocab[start]]
    for _ in range(length):
        out = inf_model.predict([token] + states)
        # out[0] is the dense output; out[1:7] are the six updated states.
        word = np.argmax(out[0][0, 0, :])
        pieces.append(int_to_vocab[word])
        states = out[1:7]
        token[0, 0] = word

    return ''.join(piece + ' ' for piece in pieces)

In [None]:
def post_process_text(text):
    """Re-attach space-separated punctuation and split the text into lines.

    Args:
        text: Generated string in which punctuation marks are separate,
            space-delimited tokens and line breaks appear as ``<NEWLINE>``.

    Returns:
        A list of strings, obtained by splitting the cleaned text on
        ``<NEWLINE>``. Surrounding spaces of each piece are left in place.
    """
    attach_left = '.:!;)]?,%'   # marks that hug the preceding word
    attach_right = '[($'        # marks that hug the following word
    attach_both = "'-"          # marks that join the words on both sides

    # The rules are applied strictly in this order.
    rules = [(' ' + mark, mark) for mark in attach_left]
    rules += [(mark + ' ', mark) for mark in attach_right]
    rules += [(' ' + mark + ' ', mark) for mark in attach_both]

    for spaced, compact in rules:
        text = text.replace(spaced, compact)

    return text.split('<NEWLINE>')

In [None]:
# Generate 200 tokens starting from token id 0 (the first token seen when the
# vocabulary was built), then tidy the punctuation and split into lines.
generated_text = post_process_text(extract_text(200, 0))


["melanie, know, you're not to make you. ", " jessica: i don't think you. ", " foreman: [maintain and but i think i'm not validated you. ", " house: i don't want to see that. ", " foreman: it's not pissing [pause and abusive down; you get to get that with him. ", ' chase: [he leaves.] ', ' foreman: [looking at the carinii. ', ' wilson: i have to make a habits that out? take? [wilson impression.] that why? ', " cameron: lay on this experiencing this spending off i'll just foreman. just accidents, he's good. [pause] oh, you're not lame. ", " cameron: [to the nurse] where's the pissing ", ' foreman: what we do? ', " stacy: he's not insane, the is a contamination. you told me the stood or i do you know. ", ' house: you know. ', ' wilson: why ']


In [None]:

# Print one generated line per row. The loop variable is deliberately not
# called `x`: that name holds the training inputs, and rebinding it here would
# break any later re-run of the training cell.
for line in generated_text:
    print(line)

melanie, know, you're not to make you. 
 jessica: i don't think you. 
 foreman: [maintain and but i think i'm not validated you. 
 house: i don't want to see that. 
 foreman: it's not pissing [pause and abusive down; you get to get that with him. 
 chase: [he leaves.] 
 foreman: [looking at the carinii. 
 wilson: i have to make a habits that out? take? [wilson impression.] that why? 
 cameron: lay on this experiencing this spending off i'll just foreman. just accidents, he's good. [pause] oh, you're not lame. 
 cameron: [to the nurse] where's the pissing 
 foreman: what we do? 
 stacy: he's not insane, the is a contamination. you told me the stood or i do you know. 
 house: you know. 
 wilson: why 
