In [144]:
import numpy as np
import pandas as pd
import gensim
import string
import json
import random

from sklearn.model_selection import train_test_split
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential

### Load Data

In [14]:
### Parse CSV File of sentences
utterances = pd.read_csv("../../friends/results.csv")

In [29]:
### Create Dialog IDs
latest_utterances = {}
def getDialogId(did):
    uid = latest_utterances.get(did, 0)
    latest_utterances[did] = uid + 1
    return "dia{}_utt{}".format(did, uid)

In [30]:
utterances["Utterance_ID"] = utterances["Dialogue_ID"].apply(getDialogId)

In [35]:
utterances.head()

Unnamed: 0,Utterance,Reply,Season,Episode,Dialogue_ID,Generated_Reply,bleu,Utterance_ID
0,also i was the point person on my companys tra...,you mustve had your hands full.,8,21,0,also i was the point person on my companys tra...,0.6168006,dia0_utt0
1,you mustve had your hands full.,that i did. that i did.,8,21,0,that was my bike. it had a missing leg.,7.290770999999999e-78,dia0_utt1
2,that i did. that i did.,so lets talk a little bit about your duties.,8,21,0,"is that what you think?! well, y'",6.313992999999999e-78,dia0_utt2
3,so lets talk a little bit about your duties.,my duties? all right.,8,21,0,let me see the door key. hey!,0.5081327,dia0_utt3
4,my duties? all right.,"now youll be heading a whole division, so youl...",8,21,0,"my duties? all right... very good. oh, but loo...",5.775354e-78,dia0_utt4


In [24]:
audio_embeddings = pd.read_csv("audio_embeddings.csv", index_col=0).sort_index()

In [27]:
audio_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
dia0_utt0,0.468404,0.278676,-0.466101,-0.841499,0.811462,0.994261,0.079472,0.331361,0.70999,-0.652272,0.534986,-0.043683,0.237591,-0.517976,-0.125858,-0.664146,-0.092333,0.472566,-0.217704,-1.215915
dia0_utt1,0.393439,-0.04978,0.54095,0.341101,-1.450056,-0.316093,0.223109,-0.665428,-0.001696,-0.21429,-0.177799,2.145403,-1.066256,1.891518,0.460961,-2.707002,1.432687,1.104046,-0.708277,0.283212
dia0_utt10,0.364907,0.629486,0.278706,-0.984126,0.193778,1.058093,0.347793,-0.489006,1.241197,-0.929034,0.689872,0.935992,0.025185,-0.083003,0.703121,-0.704908,-0.480345,0.544629,0.103707,-1.37271
dia0_utt11,0.320398,0.26672,-0.333487,-0.584721,0.715918,0.393879,-0.152199,0.579379,0.354464,-0.509131,0.413786,-0.60633,-0.009577,-0.631173,-0.258914,0.079926,-0.292592,0.496993,-0.277546,-0.737783
dia0_utt12,-0.063901,-0.062718,-0.055214,-0.043416,-1.132506,-0.409087,0.424417,-0.287484,0.831095,0.186804,-0.437253,1.50792,-0.452466,0.811226,-0.392926,-1.476732,0.504981,0.480592,-0.888524,0.775039


### Train Word Embeddings

In [123]:
docs = list(utterances["Utterance"])
docs.extend(list(utterances["Reply"]))

In [124]:
max_sentence_len = 40
tokenized_sentences = [[word for word in doc.lower().translate(str.maketrans('','',string.punctuation)).split()[
    :max_sentence_len]] for doc in docs]

In [125]:
# Train Word2Vec

word_model = gensim.models.Word2Vec(tokenized_sentences, size=50, min_count=0, window=5, iter=100)
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

In [126]:
print("Vocab Size:", vocab_size, ", Embedding Size: ", embedding_size)

Vocab Size: 5907 , Embedding Size:  50


In [127]:
word_model.wv['ambulance']

array([ 0.07773598, -0.29375657, -0.22691308,  0.32868487, -0.18725865,
        0.09280089, -0.22820775,  0.16686167, -0.00537472,  0.18685572,
       -0.33349225, -0.05626639, -0.06709337, -0.28205168,  0.07664964,
       -0.17803718,  0.32260135, -0.05070792,  0.19358161,  0.18862797,
       -0.05803084, -0.31198484, -0.05836983, -0.23335382, -0.12515719,
       -0.46920678,  0.3272013 , -0.03200192,  0.1112607 , -0.12537071,
        0.07637703, -0.16718976,  0.2437104 , -0.27371076, -0.1978086 ,
        0.08723114,  0.0630024 , -0.6420263 , -0.00084112,  0.12879665,
       -0.00786434,  0.2639555 ,  0.29323307,  0.13385877, -0.14369838,
       -0.23961295, -0.11977836, -0.62264687,  0.02373069,  0.44756612],
      dtype=float32)

In [128]:
word_model.save("meld_text_w2v.model")

In [129]:
# Checking that model is properly trained
for word in ['model', 'joey', 'learn']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.wv.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))

  model -> victorias (0.94), boxer (0.78), kristindoes (0.76), girlie (0.74), shorts (0.73), fundamentally (0.69), stephanie (0.69), unmarriable (0.67)
  joey -> chandler (0.52), sure (0.50), rachel (0.49), ross (0.45), you (0.45), cause (0.43), i (0.43), way (0.42)
  learn -> react (0.55), bambi (0.50), maroon (0.49), handshake (0.47), waitwait (0.47), youhow (0.46), thathot (0.45), split (0.44)


In [66]:
def word2idx(word):
  return word_model.wv.vocab[word].index

def idx2word(idx):
  return word_model.wv.index2word[idx]

In [69]:
word2idx("joey")

80

In [70]:
idx2word(80)

'joey'

### Combine Word and Audio Embeddings

In [72]:
filtered_utterances = utterances[utterances["Utterance_ID"].isin(list(audio_embeddings.index))].copy()

In [73]:
filtered_utterances.shape

(7985, 8)

In [168]:
utterance_key2idx = {}
utterance_idx2key = {}
utterance_key2idx_words = {}

combined_weights = []
filtered_sentences = []
final_idx = 0

def combineWeights(row):
    global final_idx
    
    complete_sentence = row["Utterance"] + " " + row["Reply"]
    utt = [word for word in complete_sentence.lower().translate(
        str.maketrans('','',string.punctuation)).split()[:max_sentence_len]]
    uid = row["Utterance_ID"]
    filtered_sentences.append(utt)
    audio_vector = np.array(audio_embeddings.loc[uid])
    for word in utt:
        word_vector = pretrained_weights[word2idx(word)]
        if uid not in utterance_key2idx:
            utterance_key2idx[uid] = {}
        
        if word not in utterance_key2idx_words:
            utterance_key2idx_words[word] = {}
            
        utterance_key2idx[uid][word] = final_idx
        utterance_idx2key[final_idx] = (uid, word)
        
        utterance_key2idx_words[word][uid] = final_idx
        
        combined_weights.append(np.append(audio_vector, word_vector))
        
        final_idx += 1
    
    return " ".join(utt)
        
filtered_utterances["tokens"] = filtered_utterances.apply(combineWeights, axis=1)
combined_weights = np.array(combined_weights)

In [169]:
combined_weights.shape

(125768, 70)

In [170]:
np.save("combined_weights", combined_weights)

In [171]:
with open("combined_weights_key2idx.json", "w") as fp:
    json.dump(utterance_key2idx, fp)

In [172]:
with open("combined_weights_idx2key.json", "w") as fp:
    json.dump(utterance_idx2key, fp)

In [173]:
with open("combined_weights_key2idx_words.json", "w") as fp:
    json.dump(utterance_key2idx_words, fp)

In [174]:
filtered_utterances.to_csv("filtered_utterances_tokens.csv", index=False)

### Create Training and Testing Data

In [175]:
train, test = train_test_split(filtered_utterances, test_size = 0.01)

In [176]:
train.to_csv("lstm_train.csv", index=False)
test.to_csv("lstm_test.csv", index=False)

In [177]:
train_sentences = list(train["tokens"])
train_ids = list(train["Utterance_ID"])

test_sentences = list(test["tokens"])
test_ids = list(test["Utterance_ID"])

In [206]:
train_x = np.zeros([len(train_sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(train_sentences)], dtype=np.int32)

for i, sentence_str in enumerate(train_sentences):
    sentence = sentence_str.split()
    sid = train_ids[i]
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = utterance_key2idx[sid][word]
    train_y[i] = utterance_key2idx[sid][sentence[-1]]
    
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x shape: (7905, 40)
train_y shape: (7905,)


In [207]:
test_x = np.zeros([len(test_sentences), max_sentence_len], dtype=np.int32)
test_y = np.zeros([len(test_sentences)], dtype=np.int32)

for i, sentence_str in enumerate(test_sentences):
    sentence = sentence_str.split()
    sid = test_ids[i]
    for t, word in enumerate(sentence[:-1]):
        test_x[i, t] = utterance_key2idx[sid][word]
    test_y[i] = utterance_key2idx[sid][sentence[-1]]
    
print('test_x shape:', test_x.shape)
print('test_y shape:', test_y.shape)

test_x shape: (80, 40)
test_y shape: (80,)


### Create Model

In [208]:
combined_embedding_size = combined_weights.shape[1]

In [212]:
model = Sequential()
model.add(Embedding(input_dim=combined_weights.shape[0], output_dim=combined_weights.shape[1], weights=[combined_weights]))
model.add(LSTM(units=combined_embedding_size))
model.add(Dense(units=combined_weights.shape[0]))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [213]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [random.choice(list(utterance_key2idx_words[word].values())) for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(utterance_idx2key[idx][1] for idx in word_idxs)

def on_epoch_end(epoch, _):
    print('\nGenerating text after epoch: %d' % epoch)
    texts = [
        'you mustve had your hands full',
        'oh wait i',
        'oh wait i',
        'sorry about that',
    ]
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s' % (text, sample))

In [221]:
model.fit(train_x, train_y,
        batch_size=128,
        epochs=100,
        callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])

Epoch 1/100

Generating text after epoch: 0
you mustve had your hands full... -> you mustve had your hands full ya geller kay i eight think scared was games no
oh wait i... -> oh wait i annabelle half so in okay really ya 830 yknow done
oh wait i... -> oh wait i night fine right so it interesting wanted its to joke
sorry about that... -> sorry about that yours pillow if be ohh a out it girl monana
Epoch 2/100

Generating text after epoch: 1
you mustve had your hands full... -> you mustve had your hands full great naked because lot me why convention he pay chef
oh wait i... -> oh wait i cute why do girl cant urgent oop now out remembered
oh wait i... -> oh wait i mean thing rach doing are said again why year comes
sorry about that... -> sorry about that kid joey 830 in we night thinking poem he sissy
Epoch 3/100

Generating text after epoch: 2
you mustve had your hands full... -> you mustve had your hands full claims a me know could questions action ok kristin things
oh wait i... -> oh 


Generating text after epoch: 17
you mustve had your hands full... -> you mustve had your hands full matter i want who one brutal wonderful that enough david
oh wait i... -> oh wait i and sucker freak here somethin hey i well kids frank
oh wait i... -> oh wait i actually listen again it museum that ball not ahh before
sorry about that... -> sorry about that long ready view so ow awful picked wanna him gina
Epoch 19/100

Generating text after epoch: 18
you mustve had your hands full... -> you mustve had your hands full wesley uhhuh monica plans ill syracuse here woman god he
oh wait i... -> oh wait i have oh here okay about if again is it barcelona
oh wait i... -> oh wait i mean off me find thats him rachel fine you i
sorry about that... -> sorry about that it on whoohoho damage im this up this ive hey
Epoch 20/100

Generating text after epoch: 19
you mustve had your hands full... -> you mustve had your hands full issues this god ah ask so sucks some is music
oh wait i... -> oh wait i y


Generating text after epoch: 34
you mustve had your hands full... -> you mustve had your hands full so think adoin here yeah not stores candy then you
oh wait i... -> oh wait i right and she was 23 isnt stick them jill if
oh wait i... -> oh wait i cares went great is cheap know wait really paper her
sorry about that... -> sorry about that dont been hip wanna justif doesnt it uhhmm okay photographer
Epoch 36/100

Generating text after epoch: 35
you mustve had your hands full... -> you mustve had your hands full dime down lesbian fights no baby that her here why
oh wait i... -> oh wait i on yeah wedding jake is how kinda tell it haha
oh wait i... -> oh wait i for pheebs it gonna why mail it to out out
sorry about that... -> sorry about that hard andand in pheebs anything you yet it where you
Epoch 37/100

Generating text after epoch: 36
you mustve had your hands full... -> you mustve had your hands full so there what bamboozled i me yes really funny black
oh wait i... -> oh wait i here 

sorry about that... -> sorry about that him sweater stevens die yeah doing your together girl wedding
Epoch 52/100

Generating text after epoch: 51
you mustve had your hands full... -> you mustve had your hands full what back see rule jokes monica love that uhoh with
oh wait i... -> oh wait i that yeah son yeah cant oh unbelievable anyway it relax
oh wait i... -> oh wait i problem box and secret worry thats pack sets thinking everybody
sorry about that... -> sorry about that one the party hey go yeah us here look cooking
Epoch 53/100

Generating text after epoch: 52
you mustve had your hands full... -> you mustve had your hands full huh you tonight no chip about anyway sorry rip stevens
oh wait i... -> oh wait i mom sorry for you moustache what diego that unbelievable basis
oh wait i... -> oh wait i we over inspector did again duties say please tell ross
sorry about that... -> sorry about that way be gonna chandler youyouyou robe word we barcelona something
Epoch 54/100

Generating tex

oh wait i... -> oh wait i guy careful said me and scared feeling people friend phonetically
sorry about that... -> sorry about that you doing impossible god again gift having watch is choice
Epoch 69/100

Generating text after epoch: 68
you mustve had your hands full... -> you mustve had your hands full hey movie jockstrap up means him go commercial yknow beautiful
oh wait i... -> oh wait i blast off okay appalachia two you are about bonnie joey
oh wait i... -> oh wait i go damnit no ohhh work off really expression she water
sorry about that... -> sorry about that nothin my phoebe her no cards right top on yeah
Epoch 70/100

Generating text after epoch: 69
you mustve had your hands full... -> you mustve had your hands full dismissed to do oh right why great shot ya what
oh wait i... -> oh wait i she down game please i has bathroom word party peeking
oh wait i... -> oh wait i asteroid her me underwear lemonade man joey green as emily
sorry about that... -> sorry about that this here cha

oh wait i... -> oh wait i ever know 15 sweater em congratulationsohwow im baby underwear on
oh wait i... -> oh wait i please it okay jew then radiator me ring toast about
sorry about that... -> sorry about that about donuts i face yeah night later strength mean of
Epoch 86/100

Generating text after epoch: 85
you mustve had your hands full... -> you mustve had your hands full oh there right to crazy yes please long wedding bye
oh wait i... -> oh wait i seven market hang down sides what on anyway story friends
oh wait i... -> oh wait i today her lotion goodacre are baby about me really gay
sorry about that... -> sorry about that what um happened any okay stuff yes once okay off
Epoch 87/100

Generating text after epoch: 86
you mustve had your hands full... -> you mustve had your hands full chelsea what gift suzie asleep partner how come on me
oh wait i... -> oh wait i song know anyone detail see passed you park burt you
oh wait i... -> oh wait i preapproved you dont drag not phoebe dont

<keras.callbacks.callbacks.History at 0x7fa88e6095d0>

In [167]:
filtered_utterances.sample(5)

Unnamed: 0,Utterance,Reply,Season,Episode,Dialogue_ID,Generated_Reply,bleu,Utterance_ID,tokens
8180,"oh wait, i think i hear him.",oh - oh my god!,9,12,953,"oh my god! chad, is that you?",6.608447e-78,dia953_utt13,oh wait i think i hear him oh oh my god
5525,"oh yeah, it must be tough to keep your hands o...","yeah, but im pretty sure hes gay.",8,12,649,just hang in,1.254338e-154,dia649_utt21,oh yeah it must be tough to keep your hands of...
5692,"yeah, tell that to my uncle lenny.",why? what happened to him?,3,15,664,"so how did it go? ah, we did",0.5410823,dia664_utt6,yeah tell that to my uncle lenny why what happ...
5738,sorry about that. so. what have you been up to?,"oh, not much. i-i got a job.",1,2,669,"oh my god, ross,",1.027019e-77,dia669_utt0,sorry about that so what have you been up to o...
5481,"first of all um, i love you both so much and y...","okay, bla-bla-bla-bla!! who is it?!",7,6,646,,0.0,dia646_utt4,first of all um i love you both so much and yo...


In [222]:
model.save_weights('lstm_weights.h5')

In [223]:
model.save('lstm.h5')

In [203]:
sample(model.predict(test_x[0])[-1], 0).shape

()

In [220]:
np.argmax(model.predict(test_x[0]))

273934

In [233]:
sample(model.predict(test_x[0])[-1], temperature=0.7)

80081