In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import capstone_2 as cap

In [3]:
# Import Dataset
df_info, df_scripts = cap.load_data()
df_docs_by_ep = cap.agg_dialogue_by_episode(df_scripts, df_info)

In [24]:
df_scripts[df_scripts.Character=='JERRY'].Character.count()

14786

In [37]:
df_scripts.head(5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,0.0,S01E00,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",0.0,S01E00,1.0
2,GEORGE,Are you through?,0.0,S01E00,1.0
3,JERRY,"You do of course try on, when you buy?",0.0,S01E00,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",0.0,S01E00,1.0


In [42]:
import string
string.punctuation
punct = '''[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'''

In [44]:
df_scripts[df_scripts['Character'].str.contains('(',regex=False)]

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
52,(George shows his note-block to Jerry; it says...,NO.),0.0,S01E00,1.0
3932,RESTAURANT MANAGER (BRUCE),How many?,11.0,S02E11,2.0
3933,ELAINE (to Jerry),How many?,11.0,S02E11,2.0
3934,JERRY (to George),Is Tatiana coming?,11.0,S02E11,2.0
3946,JERRY (to Elaine),Tatiana...,11.0,S02E11,2.0
3952,GEORGE (to Jerry),"You know it's a public phone, you're not suppo...",11.0,S02E11,2.0
3969,GEORGE (to guy),"Excuse me, are you going to be much longer? I ...",11.0,S02E11,2.0
3978,GEORGE (whistles to guy on phone),Hey!,11.0,S02E11,2.0
4001,ELAINE (through her teeth),"I know this sounds crazy, but the two men who ...",11.0,S02E11,2.0
4002,ELAINE (through teeth),I'll give you 25 if you let me do it.,11.0,S02E11,2.0


In [66]:
df_scripts = df_scripts.reset_index(drop=True)

In [84]:
def get_speakers_before_jerry(df_scripts):
    df = df_scripts.copy()
    char = 'JERRY'
    main_chars = ['JERRY', 'GEORGE', 'ELAINE', 'KRAMER', 'NEWMAN']
    jerry_dict = {}
    for index, row in df.iterrows():
        if row.Character == char and index != 0:
            #print(index)
            char_before = df.iloc[index - 1].Character
            if char_before not in main_chars:
                continue
                
            if char_before not in jerry_dict:
                jerry_dict[char_before] = 1
            else:
                jerry_dict[char_before] +=1   
        
    return jerry_dict
        
before_jerry = get_speakers_before_jerry(df_scripts) #, 'JERRY'

In [85]:
def get_speakers_before_jerry_probs(before_jerry):         
    main_chars = ['JERRY', 'GEORGE', 'ELAINE', 'KRAMER', 'NEWMAN']
    main_char_probs = {}
    total_before_lines = sum(before_jerry.values())
    
    for char in main_chars:
        main_char_probs[char] = before_jerry[char] / total_before_lines
        
    return main_char_probs

before_jerry_probs = get_speakers_before_jerry_probs(before_jerry)
pprint(before_jerry_probs)
pprint(sum(before_jerry_probs.values()))

{'ELAINE': 0.3012147228057357,
 'GEORGE': 0.39154260663074253,
 'JERRY': 0.05032423052333546,
 'KRAMER': 0.2385605991414741,
 'NEWMAN': 0.018357840898712213}
1.0


In [73]:
sum(before_jerry.values())

14785

In [103]:
def get_speakers_after_jerry(df_scripts):
    df = df_scripts.copy()
    char = 'JERRY'
    main_chars = ['JERRY', 'GEORGE', 'ELAINE', 'KRAMER', 'NEWMAN']
    jerry_dict = {}
    for index, row in df.iterrows():
        if row.Character == char and index != df.shape[0] - 1:
            if index > 54600:
                print(index)
                print(char)
            char_after = df.iloc[index + 1].Character
            if char_after not in main_chars:
                continue
                
            if char_after not in jerry_dict:
                jerry_dict[char_after] = 1
            else:
                jerry_dict[char_after] +=1   
    
    main_char_probs = {}
    total_after_lines = sum(jerry_dict.values())
    
    for char in main_chars:
        main_char_probs[char] = jerry_dict[char] / total_after_lines
        
    return main_char_probs
                

In [104]:
df_scripts.shape

(54606, 5)

In [105]:
df_scripts.tail()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
54601,JERRY,Grand theft auto - don't steal any of my jokes.,23.0,S09E23,9.0
54602,PRISONER 3,You suck - I'm gonna cut you.,23.0,S09E23,9.0
54603,JERRY,"Hey, I don't come down to where you work, and ...",23.0,S09E23,9.0
54604,GUARD,"Alright, Seinfeld, that's it. Let's go. Come on.",23.0,S09E23,9.0
54605,JERRY,"Alright, hey, you've been great! See you in th...",23.0,S09E23,9.0


In [106]:
after_jerry_probs = get_speakers_after_jerry(df_scripts)
after_jerry_probs

54601
JERRY
54603
JERRY


{'JERRY': 0.04996825972612678,
 'GEORGE': 0.38795683322753244,
 'ELAINE': 0.295819352498413,
 'KRAMER': 0.24811825519180195,
 'NEWMAN': 0.018137299356125874}

In [107]:
df_scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,0.0,S01E00,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",0.0,S01E00,1.0
2,GEORGE,Are you through?,0.0,S01E00,1.0
3,JERRY,"You do of course try on, when you buy?",0.0,S01E00,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",0.0,S01E00,1.0


In [123]:
df_scripts[df_scripts.Character=='GEORGE'].index[0]

2

In [None]:
df_scriplts.iloc(

In [152]:
df_scripts.iloc[:100,:][df_scripts.Character=='JERRY'].tail()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
90,JERRY,"(to Kessler) Yeah. (to the phone) Yeah, people...",0.0,S01E00,1.0
92,JERRY,"(upset) Ohhhh, what are you doing? Kessler, it...",0.0,S01E00,1.0
94,JERRY,"Meat? I dont, I dont know, go... hunt! (Kessle...",0.0,S01E00,1.0
97,JERRY,(cynical) Yeah you almost went to the game. Yo...,0.0,S01E00,1.0
99,JERRY,No.,0.0,S01E00,1.0


In [153]:
df_scripts.iloc[:100,:].tail()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
95,KESSLER,"(from the refrigerator) What happened? Well, t...",0.0,S01E00,1.0
96,KESSLER,"You know, I almost wound up going to that game.",0.0,S01E00,1.0
97,JERRY,(cynical) Yeah you almost went to the game. Yo...,0.0,S01E00,1.0
98,KESSLER,Yeah. (Jerry sits down on the couch. Kessler w...,0.0,S01E00,1.0
99,JERRY,No.,0.0,S01E00,1.0


In [158]:
import time

from tqdm import tqdm

for i in tqdm(range(10)):
    time.sleep(3)

100%|██████████| 10/10 [00:30<00:00,  3.00s/it]


In [169]:
def get_df_before_and_after_JERRY(df_scripts):
    df = df_scripts.copy()
    
    char = 'JERRY'
    main_chars = ['JERRY', 'GEORGE', 'ELAINE', 'KRAMER', 'NEWMAN']
    
    idx_first_line = df[df.Character==char].index[0]
    print(idx_first_line)
    
    rows_appended = []
    # Get first Jerry line and the line after
    df_jerry = pd.DataFrame(df.iloc[idx_first_line:idx_first_line + 2,:])
    rows_appended.append(idx_first_line)
    rows_appended.append(idx_first_line + 1)
    
    print(rows_appended)
    
    for index, row in tqdm(df.iloc[idx_first_line + 2:100, :].iterrows()): #df.shape[0]-1
        
        index_before = index - 1
        index_after = index + 1
        
        row_before = df.iloc[index - 1, :]
        
        row_after = df.iloc[index + 1, :]
        
        char_before = df.iloc[index - 1].Character
        char_after = df.iloc[index + 1].Character
        
        if row.Character == char and index_before not in rows_appended and char_before in main_chars:
            df_jerry = df_jerry.append(row_before)
            rows_appended.append(index_before)
        
        if row.Character == char and index not in rows_appended and char_before in main_chars:
            df_jerry = df_jerry.append(row)
            rows_appended.append(index)
        
            
        if row.Character == char and index_after not in rows_appended and char_after in main_chars:
            df_jerry = df_jerry.append(row_after)
            rows_appended.append(index_after)
             
    return df_jerry
    
df_jerry = get_df_before_and_after_JERRY(df_scripts)
df_jerry                                  

25it [00:00, 209.14it/s]

0
[0, 1]


98it [00:00, 236.60it/s]


Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,0.0,S01E00,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",0.0,S01E00,1.0
2,GEORGE,Are you through?,0.0,S01E00,1.0
3,JERRY,"You do of course try on, when you buy?",0.0,S01E00,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",0.0,S01E00,1.0
5,JERRY,"Oh, you dont recall?",0.0,S01E00,1.0
6,GEORGE,"(on an imaginary microphone) Uh, no, not at th...",0.0,S01E00,1.0
7,JERRY,"Well, senator, Id just like to know, what you ...",0.0,S01E00,1.0
13,GEORGE,How come youre not doin the second show tomorrow?,0.0,S01E00,1.0
14,JERRY,"Well, theres this uh, woman might be comin in.",0.0,S01E00,1.0


In [176]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [177]:
df_scripts.head(100)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,0.0,S01E00,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",0.0,S01E00,1.0
2,GEORGE,Are you through?,0.0,S01E00,1.0
3,JERRY,"You do of course try on, when you buy?",0.0,S01E00,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",0.0,S01E00,1.0
5,JERRY,"Oh, you dont recall?",0.0,S01E00,1.0
6,GEORGE,"(on an imaginary microphone) Uh, no, not at th...",0.0,S01E00,1.0
7,JERRY,"Well, senator, Id just like to know, what you ...",0.0,S01E00,1.0
8,CLAIRE,Mr. Seinfeld. Mr. Costanza.,0.0,S01E00,1.0
9,GEORGE,"Are, are you sure this is decaf? Wheres the or...",0.0,S01E00,1.0


In [170]:
df_jerry['whole_line'] = df_jerry.Character + ": " + df_jerry.Dialogue
df_jerry

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,whole_line
0,JERRY,Do you know what this is all about? Do you kno...,0.0,S01E00,1.0,JERRY: Do you know what this is all about? Do ...
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",0.0,S01E00,1.0,"JERRY: (pointing at Georges shirt) See, to me,..."
2,GEORGE,Are you through?,0.0,S01E00,1.0,GEORGE: Are you through?
3,JERRY,"You do of course try on, when you buy?",0.0,S01E00,1.0,"JERRY: You do of course try on, when you buy?"
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",0.0,S01E00,1.0,"GEORGE: Yes, it was purple, I liked it, I dont..."
5,JERRY,"Oh, you dont recall?",0.0,S01E00,1.0,"JERRY: Oh, you dont recall?"
6,GEORGE,"(on an imaginary microphone) Uh, no, not at th...",0.0,S01E00,1.0,"GEORGE: (on an imaginary microphone) Uh, no, n..."
7,JERRY,"Well, senator, Id just like to know, what you ...",0.0,S01E00,1.0,"JERRY: Well, senator, Id just like to know, wh..."
13,GEORGE,How come youre not doin the second show tomorrow?,0.0,S01E00,1.0,GEORGE: How come youre not doin the second sho...
14,JERRY,"Well, theres this uh, woman might be comin in.",0.0,S01E00,1.0,"JERRY: Well, theres this uh, woman might be co..."


In [163]:
df_scripts.tail(10)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
54596,PRISONER 1,I am.,23.0,S09E23,9.0
54597,JERRY,I'll talk slower. I'm kidding - I love Cellblo...,23.0,S09E23,9.0
54598,PRISONER 2,Murder one.,23.0,S09E23,9.0
54599,JERRY,"Murder one? Oooooo, watch out everybody. Bette...",23.0,S09E23,9.0
54600,PRISONER 3,Grand theft auto.,23.0,S09E23,9.0
54601,JERRY,Grand theft auto - don't steal any of my jokes.,23.0,S09E23,9.0
54602,PRISONER 3,You suck - I'm gonna cut you.,23.0,S09E23,9.0
54603,JERRY,"Hey, I don't come down to where you work, and ...",23.0,S09E23,9.0
54604,GUARD,"Alright, Seinfeld, that's it. Let's go. Come on.",23.0,S09E23,9.0
54605,JERRY,"Alright, hey, you've been great! See you in th...",23.0,S09E23,9.0


In [156]:
df_jerry.Character.unique()

array(['JERRY', 'GEORGE', 'ELAINE', 'KRAMER'], dtype=object)

In [114]:
# Keras Imports
import keras as K
import random

from keras.layers import Input, Dropout, Dense, concatenate, Embedding
from keras.layers import Flatten, Activation
from keras.optimizers import Adam
from keras.models import Model
from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.models import load_model
# from keras.layers import LSTM, CuDNNGRU, CuDNNLSTM
from keras.layers import MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback

import warnings
warnings.filterwarnings('ignore')
import os
print(os.listdir("../.git"))

['config', 'objects', 'HEAD', 'info', 'logs', 'description', 'hooks', 'refs', 'index', 'packed-refs', 'COMMIT_EDITMSG']


In [197]:
def just_char_dialogue(df_scripts):
    df = df_scripts.copy()

    df.Character = df.Character.astype(str)
    df.Dialogue = df.Dialogue.astype(str)
    df = df[["Character","Dialogue"]]
    
    return df

df_just_char = just_char_dialogue(df_scripts)
df_just_char.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54606 entries, 0 to 54605
Data columns (total 2 columns):
Character    54606 non-null object
Dialogue     54606 non-null object
dtypes: object(2)
memory usage: 853.3+ KB


In [183]:
### This example uses a model that has input_shape=None
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
import numpy as np

model = Sequential()

model.add(LSTM(32, return_sequences=True, input_shape=(None, 5)))
model.add(LSTM(8, return_sequences=True))
model.add(TimeDistributed(Dense(2, activation='sigmoid')))

print(model.summary(90))

model.compile(loss='categorical_crossentropy',
              optimizer='adam')

def train_generator():
    while True:
        sequence_length = np.random.randint(10, 100)
        x_train = np.random.random((3, sequence_length, 5))
        # y_train will depend on past 5 timesteps of x
        y_train = x_train[:, :, 0]
        for i in range(1, 5):
            y_train[:, i:] += x_train[:, :-i, i]
        y_train = to_categorical(y_train > 2.5)
        print(len(x_train))
        print(len(y_train))
        yield x_train, y_train

model.fit_generator(train_generator(), steps_per_epoch=30, epochs=5, verbose=1)

__________________________________________________________________________________________
Layer (type)                            Output Shape                        Param #       
lstm_5 (LSTM)                           (None, None, 32)                    4864          
__________________________________________________________________________________________
lstm_6 (LSTM)                           (None, None, 8)                     1312          
__________________________________________________________________________________________
time_distributed_3 (TimeDistributed)    (None, None, 2)                     18            
Total params: 6,194
Trainable params: 6,194
Non-trainable params: 0
__________________________________________________________________________________________
None
Epoch 1/5
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
 1/30 [>.............................] - ETA: 59s - loss: 0.69201000


1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
Epoch 5/5
1000
1000
 1/30 [>.............................] - ETA: 9s - loss: 0.42481000
1000
 2/30 [=>............................] - ETA: 7s - loss: 0.42481000
1000
 3/30 [==>...........................] - ETA: 7s - loss: 0.42261000
1000
 4/30 [===>..........................] - ETA: 6s - loss: 0.42121000
1000
 5/30 [====>.........................] - ETA: 7s - loss: 0.41941000
1000
 6/30 [=====>........................] - ETA: 6s - loss: 0.41801000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000


<keras.callbacks.History at 0x1a3bc97898>

In [201]:
%%time
All_Seinfeld_Scripts = ''
last_seg = ''

for line in df_just_char.values:
#     print(line[0])
#     print(line[2])
    character = line[0].lower()
    dialogue = line[1].lower()
    script = character+": "+dialogue+" \n\n"
    All_Seinfeld_Scripts += script

print(All_Seinfeld_Scripts[:2000])

jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people tryin to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, whatta you do? you go we gotta be getting back. once youre out, you wanna get back! you wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? where ever you are in life, its my feeling, youve gotta go. 

jerry: (pointing

In [229]:
text_file = open("All_Seinfeld_Scripts.txt", "w")
text_file.write(All_Seinfeld_Scripts)
text_file.close()

In [232]:
text_file = open("All_Seinfeld_Scripts.txt", "r+")
All_Seinfeld_Scripts_from_File = text_file.read()

In [234]:
print(len(All_Seinfeld_Scripts_from_File))
print(len(All_Seinfeld_Scripts))
print(All_Seinfeld_Scripts_from_File == All_Seinfeld_Scripts)

3470782
3470782
True


In [235]:
Text_Data = All_Seinfeld_Scripts_from_File

if len(Text_Data) > 500000:
    Text_Data = Text_Data[:500000]

charindex = list(set(Text_Data))
charindex.sort() 
print(charindex)

np.save("charindex.npy", charindex)

['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', '\\', ']', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x92']


In [268]:
%%time
CHARS_SIZE = len(charindex)
SEQUENCE_LENGTH = 75
X_train = []
Y_train = []
for i in range(0, len(Text_Data)-SEQUENCE_LENGTH, 1 ): 
    X = Text_Data[i:i + SEQUENCE_LENGTH]
    Y = Text_Data[i + SEQUENCE_LENGTH]
    X_train.append([charindex.index(x) for x in X])
    Y_train.append(charindex.index(Y))


CPU times: user 27.7 s, sys: 418 ms, total: 28.2 s
Wall time: 28.5 s


In [271]:
X_train.shape

75

In [274]:
Y_train[0]

37

In [275]:
X_train = np.reshape(X_train, (len(X_train), SEQUENCE_LENGTH))

Y_train = np_utils.to_categorical(Y_train)

In [243]:
i= 6
Text_Data[i:i + SEQUENCE_LENGTH]

' do you know what this is all about? do you know, why were here? to be out,'

In [258]:
Text_Data[-1]

'r'

In [263]:
charindex[45]

'm'

In [252]:
charindex.index(Y)

50

In [247]:
charindex[42]

'j'

In [267]:
X_train.shape

(499925, 75)

In [277]:
CHARS_SIZE

60

In [278]:
#X_train[0]
len(Y_train[0])

60

In [280]:
Y_train.shape

(499925, 60)

In [276]:
def get_model():
    model = Sequential()
    inp = Input(shape=(SEQUENCE_LENGTH, ))
    x = Embedding(CHARS_SIZE, 75, trainable=False)(inp)
    x = CuDNNLSTM(512, return_sequences=True,)(x)
    x = CuDNNLSTM(512, return_sequences=True,)(x)
    x = CuDNNLSTM(512,)(x)
    x = Dense(256, activation="elu")(x)
    x = Dense(128, activation="elu")(x)
    outp = Dense(CHARS_SIZE, activation='softmax')(x)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'],
                 )

    return model

model = get_model()

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 75, 75)            4500      
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 75, 512)           1206272   
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 75, 512)           2101248   
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 512)               2101248   
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
__________

In [289]:
''.join(charindex[val] for val in X_train[700])

' the spot, the reservation...then youre standing around, whatta you do? you'

In [282]:
X_train[700]

array([ 1, 52, 40, 37,  1, 51, 48, 47, 52, 12,  1, 52, 40, 37,  1, 50, 37,
       51, 37, 50, 54, 33, 52, 41, 47, 46, 14, 14, 14, 52, 40, 37, 46,  1,
       57, 47, 53, 50, 37,  1, 51, 52, 33, 46, 36, 41, 46, 39,  1, 33, 50,
       47, 53, 46, 36, 12,  1, 55, 40, 33, 52, 52, 33,  1, 57, 47, 53,  1,
       36, 47, 28,  1, 57, 47, 53])

In [281]:
filepath="model_checkpoint.hdf5"

checkpoint = ModelCheckpoint(filepath,
                             monitor='loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min')

early = EarlyStopping(monitor="loss",
                      mode="min",
                      patience=1)

class TextSample(Callback):

    def __init__(self):
       super(Callback, self).__init__() 

    def on_epoch_end(self, epoch, logs={}):
        pattern = X_train[700]
        outp = []
        seed = [charindex[x] for x in pattern]
        sample = 'TextSample:' +''.join(seed)+'|'
        for t in range(100):
          x = np.reshape(pattern, (1, len(pattern)))
          pred = self.model.predict(x)
          result = np.argmax(pred)
          outp.append(result)
          pattern = np.append(pattern,result)
          pattern = pattern[1:len(pattern)]
        outp = [charindex[x] for x in outp]
        outp = ''.join(outp)
        sample += outp
        print(sample)

textsample = TextSample()

In [291]:
print(Text_Data[:400])

jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people tryin to find us, they do


In [4]:
# Import Dataset
df_info, df_scripts = cap.load_data()
df_docs_by_ep = cap.agg_dialogue_by_episode(df_scripts, df_info)

In [5]:
text_file = open("All_Seinfeld_Scripts.txt", "r+")
All_Seinfeld_Scripts_from_File = text_file.read()

Text_Data = All_Seinfeld_Scripts_from_File

In [28]:
text = Text_Data[:100000]
# Deep Learning BOOK example
max_len = 60
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - max_len, step):
    sentences.append(text[i:i + max_len])
    next_chars.append(text[i + max_len])
    
print('Number of sequences: ', len(sentences))
print(next_chars[0])

chars = sorted(list(set(text)))
len(chars)
print(f"Unique Characters: {len(chars)}")

char_indices = dict((char, chars.index(char)) for char in chars)
char_indices

print("Vectorization...")
x = np.zeros((len(sentences), max_len, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences:  99940
w
Unique Characters: 50
Vectorization...


In [365]:
sentences[6060]

'ht. \n\ngeorge: maybe im right? of course im right. \n\njerry: t'

In [38]:
x[0]

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False,  True, False, ..., False, False, False]])

In [29]:
x.shape
y.shape

(99940, 50)

In [30]:
import keras
from keras import layers
model = keras.models.Sequential()
model.add(layers.LSTM(228, input_shape=(max_len, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [31]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [35]:
import sys
import random 

for epoch in range (1, 20):
    print(f"\n Epoch: {epoch}")
    model.fit(x, y, batch_size=128, epochs=1)
    
    start_index = random.randint(0, len(text) - max_len - 1)
    generated_text = text[start_index: start_index + max_len]
    print("----Generating with seed: ") 
    print(generated_text)

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        generated_text = text[start_index: start_index + max_len]
        print(f'\n ------ temperature: {temperature} \n')
        sys.stdout.write(generated_text)
        # print('generated_text is:', generated_text)
            
        for i in range(250):
            sampled = np.zeros((1, max_len, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1
                
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
#             print(generated_text)
            


 Epoch: 1
Epoch 1/1
----Generating with seed: 
? (beat) elaine ever call you back? 

jerry: no, i guess she

 ------ temperature: 0.2 

? (beat) elaine ever call you back? 

jerry: no, i guess she said i dont know, i cant believe you still get to the park, yes, i have to get to the stock. 

jerry: you know, that working on the conversation. 

jerry: well, i 

  This is separate from the ipykernel package so we can avoid doing imports until


dont know. i dont know. 

jerry: oh no. thats a just-invent out. (he cant believe in t
 ------ temperature: 0.5 

? (beat) elaine ever call you back? 

jerry: no, i guess sheser anything to meet here to sell. 

jerry: what do you want to see it. 

jerry: always we can just firet... 

jerry: how do you know, i cant believe it to pay back. thats it. 

jerry: (to george) the man is how to and you want to see him a might be 
 ------ temperature: 1.0 

? (beat) elaine ever call you back? 

jerry: no, i guess shes lekers. one of a stankwimat occucket. 

george: always do you think we know, the man! 

jerry: oh hey hey very wait to your neighbough whole many peoples that i happie to have home nakwice and my moneys tomorrow... lysull movie. 

jerry: yeah, yeah
 ------ temperature: 1.2 

? (beat) elaine ever call you back? 

jerry: no, i guess she sta-horre einotgly life. a look for combrees? 

jerry: broughe, sit, drock, should go be onre. get someone come with couree! 

george: always me to g

jerry: so it. 

jerry: we can you know her is george. i dont know, i dont have to die, some right to the amaulear night... 

kessler: bik be! 

george: you didnt to a mat, right roll overry. 

morle: lone are you you. 

elaine: showar, its right riok o
 ------ temperature: 1.2 

 decaf left, regular right...its very challenging work. 

jerry: le. ive gilaser super lio the withengsesyus. yeah, right? anywnes? men really... 

vanessa: hi. (feeling) you know, my pict. that ma! you know, george? 

jerry: stell to you maverick to kidd and involved. i had wont show? 

elaine: way, yes, the
 Epoch: 7
Epoch 1/1
----Generating with seed: 
 man wept. 

kramer: oh, hey guys. man, im telling you, this

 ------ temperature: 0.2 

 man wept. 

kramer: oh, hey guys. man, im telling you, this is the wedding. 

jerry: were still great to the park, you come in the conclue. why dont you told her you cant believe that i got to the place is the door meen the couch? 

jerry: look, i got a situation. 

jerry

elaine: oh, simon...is horn ask some the hell you finding a cahine mosty-bblocy cord. if yes. 

jerry
 Epoch: 12
Epoch 1/1
----Generating with seed: 
, why did she call? 

george: how do i know, maybe, you know

 ------ temperature: 0.2 

, why did she call? 

george: how do i know, maybe, you know, i got a stock. 

jerry: yeah, i have the ends. 

jerry: well, i have to go to the place getting with him and they have the elevator and they have the ends out of the plane. 

jerry: oh, he was like the building. 

jerry: oh, they need the stock. 


 ------ temperature: 0.5 

, why did she call? 

george: how do i know, maybe, you know what i want to come this is that i dont think about his business. but if you dont know what you got a half they have the bathroom. why did she say i want? you know what i mean? 

jerry: oh yeah, shes bad. 

kramer: you know, i look at a perfume. (to
 ------ temperature: 1.0 

, why did she call? 

george: how do i know, maybe, you know! oh geeees a lawy. but n

elaine: oh yeah, what does he do? 

george: (to jerry) now, i dont know what they halfly good. 

elaine: how can i dont know what i dont 
 ------ temperature: 0.5 

eah yeah take my number. 555-8643. okay, here he is. 

jerry: i dont know, i cant believe it. 

jerry: i dont know what men want to meet. 

jerry: what does he do? 

elaine: what? 

jerry: you were having the lention. 

jerry: well, im sorry, im gonna go what wasnt so uncommone. 

elaine: okay, thats all over
 ------ temperature: 1.0 

eah yeah take my number. 555-8643. okay, here he is. 

jerry: well, im gont in the way. 

elaine: no, im sorry...tainsation. you wanna get you to vanee big. you mane. youre done! i dont know what i know what they dont... hes a seling? 

elaine: (not beat, he ist.) so, what happened? 

elaine: can i do? 

elai
 ------ temperature: 1.2 

eah yeah take my number. 555-8643. okay, here he is. 

jerry: hey. a driou ir live. 

jerry: uh, i cant believe youre been and these washibe. and the wedder, t

In [45]:
len(generated_text)

60

In [39]:
preds = model.predict(sampled, verbose=0)

In [43]:
preds

array([[8.7646718e-10, 6.3312531e-05, 2.3691375e-07, 6.1593220e-14,
        6.3683427e-11, 1.0123436e-07, 6.8723423e-08, 2.4973217e-06,
        1.9347803e-04, 2.8098771e-06, 1.4478955e-04, 4.3603242e-13,
        1.2278942e-12, 3.6403568e-11, 7.3128205e-12, 1.4243015e-11,
        6.2001043e-14, 5.2784085e-13, 2.4979324e-13, 6.5344702e-13,
        9.3075301e-14, 5.4134784e-08, 2.3676739e-07, 1.4897691e-06,
        8.5376931e-04, 6.8133371e-04, 7.2674141e-03, 2.3734074e-03,
        1.1947947e-04, 4.0527675e-01, 3.8021442e-04, 1.1266294e-03,
        2.5621031e-03, 2.4616023e-10, 3.3410005e-03, 2.6569687e-04,
        4.0226761e-02, 2.5659740e-01, 6.4266182e-04, 1.3082412e-01,
        1.5501831e-08, 2.8416928e-02, 3.5193628e-03, 2.7586954e-02,
        5.5129327e-02, 3.0267034e-02, 1.2372377e-03, 3.3930596e-09,
        8.9538452e-04, 1.8662996e-09]], dtype=float32)

In [213]:
test_text = "george: if hitler was so bad why am i a gay stalin?    \n\nkra"
len(test_text)

60

In [196]:
test_text = "jerry: hi, how's your day going today, what is the deal?\n\nge"
len(test_text)

60

In [286]:
test_text = "jerry: what is the deal with coffee? i mean, its more like? "
len(test_text)

60

In [290]:
test_text = "jerry: the quick brown fox told me this is just a test, no? "
len(test_text)

60

In [467]:
test_text = "george: all right, that's enough. i gotta go home and take a"
len(test_text)

60

In [488]:
sys.stdout.write(test_text)

temperature = 0.57
generated_text = test_text
random.seed(42)
np.random.seed(42)
for i in range(350):
    sampled = np.zeros((1, max_len, len(chars)))
    for t, char in enumerate(generated_text):
        sampled[0, t, char_indices[char]] = 1

    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = chars[next_index]

    generated_text += next_char
    generated_text = generated_text[1:]
    
    sys.stdout.write(next_char)

george: all right, that's enough. i gotta go home and take at this couch with a problem put the elevator in from got over there right now. 

jerry: you can i tester. 

jerry: what was the car? 

george: hes

  This is separate from the ipykernel package so we can avoid doing imports until


 a persot. i hate the dryer sellised in the bitco moving out with this problem to get the lobby tickets to the eggees to come to play should see is we have a lot of here. 

jerry: so what i do? i love her

In [278]:
model.save('first_lstm_model.h5')

In [61]:
len('!w-!p?pccc!cccpcpcp?fc-pccpccccccccppcpcppp-p?pccpcccpcppp')

58

In [383]:
x.shape

(99940, 60, 50)

In [382]:
model.evaluate(x, y)



0.821249007661032

In [384]:
print(model.metrics_names) 

['loss']


In [None]:
text_file = open("All_Seinfeld_Scripts.txt", "r+")
All_Seinfeld_Scripts_from_File = text_file.read()

Text_Data = All_Seinfeld_Scripts_from_File

In [None]:
text = Text_Data[:100000]
# Deep Learning BOOK example
max_len = 60
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - max_len, step):
    sentences.append(text[i:i + max_len])
    next_chars.append(text[i + max_len])
    
print('Number of sequences: ', len(sentences))
print(next_chars[0])

chars = sorted(list(set(text)))
len(chars)
print(f"Unique Characters: {len(chars)}")

char_indices = dict((char, chars.index(char)) for char in chars)
char_indices

print("Vectorization...")
x = np.zeros((len(sentences), max_len, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [460]:
random.seed(100)
np.random.seed(100)

import keras
from keras import layers

def create_LSTM_model():
    model = keras.models.Sequential()
    model.add(layers.LSTM(300, input_shape=(max_len, len(chars)))) #return_sequences=True, 
    # model.add(layers.LSTM(228))
    model.add(layers.Dense(len(chars), activation='softmax'))
    optimizer = keras.optimizers.RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

def fit_LSTM_model(model, num_epochs):
    for epoch in range (1, num_epochs):
        print(f"\n Epoch: {epoch}")
        model.fit(x, y, batch_size=128, epochs=1) #, validation_split=0.2

        start_index = random.randint(0, len(text) - max_len - 1)
        generated_text = text[start_index: start_index + max_len]
        print("----Generating with seed: ") 
        print(generated_text)

        for temperature in [0.2, 0.5, 1.0, 1.2]:
            generated_text = text[start_index: start_index + max_len]
            print(f'\n ------ temperature: {temperature} \n')
            sys.stdout.write(generated_text)
            # print('generated_text is:', generated_text)

            for i in range(250):
                sampled = np.zeros((1, max_len, len(chars)))
                for t, char in enumerate(generated_text):
                    sampled[0, t, char_indices[char]] = 1

                preds = model.predict(sampled, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_char = chars[next_index]

                generated_text += next_char
                generated_text = generated_text[1:]

                sys.stdout.write(next_char) 

In [461]:
model_2 = create_LSTM_model()
fit_LSTM_model(model_2, 20)


 Epoch: 1
Epoch 1/1
----Generating with seed: 
 cant wait to get on that boat. 

jerry: me too! 

jerry: i 

 ------ temperature: 0.2 

 cant wait to get on that boat. 

jerry: me too! 

jerry: i dont know what are you didnt want the didnt the stre the guy... 

jerry: what is the pack to the thing in the seed the sees this is there something the seen wast the cats the seent the packing the sees the packet this is the seet on the serent on the
 ------ temperature: 0.5 

 cant wait to get on that boat. 

jerry: me too! 

jerry: i dont she be the cats of the sheing in the sares, bland the packing to the pecket out of the a sere in the sering in the downs and a were telled in the every supton mer on the stress in the hase were dont find a purienday. 

jerry: (couldnt on the see
 ------ temperature: 1.0 

 cant wait to get on that boat. 

jerry: me too! 

jerry: i dont be do ckrease to get ha picken lehe so its a prellt werwalk gor theld... a porticass. 

kramer: you very prestic, where di

  This is separate from the ipykernel package so we can avoid doing imports until


p!rtiting hampsocifor op in homer off, you get 
 Epoch: 5
Epoch 1/1
----Generating with seed: 
rt.) thats dry cleaning. i dont think thats what theyre doin

 ------ temperature: 0.2 

rt.) thats dry cleaning. i dont think thats what theyre doing the bundly what is the couch, i dont know it. 

jerry: i dont know anything to take it. 

jerry: i got the bundly what happens words? 

jerry: yeah, well he want in the couch the really done thing i was a great shirt to the game. 

jerry: you cant 
 ------ temperature: 0.5 

rt.) thats dry cleaning. i dont think thats what theyre doing to me. 

jerry: i got the busto. 

jerry: no, i cant get it. 

jerry: well, i dont know do you see if the contrestic little benn. 

george: he chanking out there was a little deal you be new place, the wall are anyway? 

jerry: yeah, i got a couple
 ------ temperature: 1.0 

rt.) thats dry cleaning. i dont think thats what theyre doing. essalioday. the door talking ainh, hr. why dont you speed you bees like my cl

other in for a session. this guy is a brilliant man. lenny brought that come on, you cant be so sick and he can say that back the guy, i cant believe it. i cant believe it. 

jerry: i dont know. 

jerry: i dont know. 

jerry: i dont know. 

jerry: i dont know what the door? 

jerry: i dont know. 

jerry: how 
 ------ temperature: 0.5 

other in for a session. this guy is a brilliant man. lenny bromable... 

jerry: i cannot need to the bathrooming to a fault. 

jerry: right, you can other in the case. 

jerry: what? you could got in the dryer of mening? 

jerry: why? 

jerry: were st way to go to three of some office. 

jerry: i have a man a
 ------ temperature: 1.0 

other in for a session. this guy is a brilliant man. lenny bruce here. 

jerry: you think, what apartment? 

jerry: youh let me do. hes one time. 

jerry: no, i put it? 

jerry: no man, if here for thesond... 

jerry: i thought it movin?, who wants to like, no off i think thats grong with it. 

george: yeah, 
 ------ tempe

vanessa: i said the market couch with her a couple of pretuth, its not how you to meet now? 

george: you know, i cant think about go out agains partual what i do? what do you think a little blone? 

jerry: so what i did it? 

george: (to george) its me thing about this.
 ------ temperature: 1.0 

ry: you did not tell me not to sell. 

vanessa: i said the minart. shrushes not enderge and i dont know how no no no no no no, missing joes, wilkinson, the new e? you won the detergences. its not could as holl the diffese like vion. joel, theyre wusing... 

jerry: youre in the a coep goes depponed rigater. 


 ------ temperature: 1.2 

ry: you did not tell me not to sell. 

vanessa: i said the market some chonena, charman, this on imeverytataking. 

laura: (laura.) ill apterdulut- absolutely. 

jerry: he knows he these and get place adeah, pamela, lasts been cold...the cos! sumbers? how can i know, i dont know a thin show. listen, uh, move 
 Epoch: 16
Epoch 1/1
----Generating with seed: 
ocat

In [463]:
fit_LSTM_model(model_2, 10)


 Epoch: 1
Epoch 1/1
----Generating with seed: 
hi, welcome back. how were the shows? 

jerry: great, i had 

 ------ temperature: 0.2 

hi, welcome back. how were the shows? 

jerry: great, i had  ' e  o   e oo & &

  This is separate from the ipykernel package so we can avoid doing imports until


   ie  oo ex o& o e  & i  oo  o #oe   e  e i  e  ee eeee i o   e ee    oooi eee    o e  e  o  ee eoeeie' oeeoo  age oeee o f#we oe eyo  

i  oe o e   e   e  o  o  o mye ooe ee a  o &ee  oon  e oe e e  it a e e eo e  e  eoo oo e  a b
 ------ temperature: 0.5 

hi, welcome back. how were the shows? 

jerry: great, i had  ' e  o   e oo & &   ie  oe on o ooei   ae  e ea & ee o    
  e oe  e o exi  ei e'r e e oo ee ee &      e e oo e 
e o   oo eno     e eee e eo e o  e eoo   e eeeo e  o e    e t ee  a e oee i ee  oee  et lo  e  un   in a ou  e e o  oe o  ooo    ed aee 
 ------ temperature: 1.0 

hi, welcome back. how were the shows? 

jerry: great, i had  ' e oo     o   e      e ee  i ooo oo &o i   eine   e o    # e  ee 
  o  o  i  e     o e e a e ee ieo#e oe i  eoteeo    oe  o e o # eeeeei ie  e i  oi o ce je yo t no    agea o e  ne oo e    oe. exee  # t e enei ooe o       oe   o e  8xeoe o el e e  
 ------ temperature: 1.2 

hi, welcome back. how were the shows? 

jerry: great, i had  'e 

george: no, i dont want it. i want iti iti  it ee oi e in tin tot t t oett ii iiinotot t  ei e ine ini i t tit io io i eo t i tie e  t   iin ine it i  iee ine iie to ttti i iie  ioni ti e io to io titi ti tt i i ooi  i i ei i t  i i ieitne ie io iei  oei it to tit t it tonn t  ioe i  t 
 ------ temperature: 1.0 

ake it? you want it? 

george: no, i dont want it. i want ititititni ieto i t it ii i  otto  oo  e i t ieototieen t toit t titi ee tt eo it oo  ii itot  to i   i eee t i
i e e  it it iie t  in e inoi te  t  o  ioot ie t iit iooti tot i ieit to e eo t t iono t iit  i itoi tteint o  ttini i inei  i oti iiteiio 
 ------ temperature: 1.2 

ake it? you want it? 

george: no, i dont want it. i want it t iit itoo t ieie  oettiotett te  t int te iiioie e i eii totn t o eio ti te oe inoo iteon tei iei  te i ot eionet  e eo ete ieot ei oe t inein oo i to ie e  otiti on ti oe titioeit to ne t io  i toiinet in totooe tti int in t  ti  o t ii eeo ii ti 
 Epoch: 7
Epoch 1/1
----Generating w

KeyboardInterrupt: 

In [440]:
model.summary()
model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                6450      
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_40 (LSTM)               (None, 128)               91648     
_________________________________________________________________
dense_26 (Dense)             (None, 50)                6450      
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [464]:
model_2.evaluate(x, y)



[9.47408180633783, 0.1845907544526716]