# 10th June Notebook (chat bot)

In [123]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import os
import re
import csv
import unicodedata
import random
import codecs
import itertools

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Data Preprocessing

In [3]:
movie_lines_path = os.path.join("Data", "movie_lines.txt")
movie_conv_path = os.path.join("Data", "movie_conversations.txt")

In [4]:
# Checking text format
with open(movie_lines_path, "r") as f:
    txt = f.readlines()
    for i in txt[:5]:
        print(i, end="")

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.


In [5]:
line_fields = ["line_id", "character_id", "movie_id", "character_name", "logs"]
dialogues = {}

with open(movie_lines_path, "r") as f:
    for line in f:
        string = line.split(" +++$+++ ")
        current_line = {}
        for i, col in enumerate(line_fields):
            current_line[col] = string[i]
        dialogues[string[0]] = current_line
f.close()
print("Completed!")

Completed!


In [6]:
len(dialogues)

304713

In [7]:
dialogues["L1044"]

{'line_id': 'L1044',
 'character_id': 'u2',
 'movie_id': 'm0',
 'character_name': 'CAMERON',
 'logs': 'They do to!\n'}

In [8]:
with open(movie_conv_path, "r") as f:
    string = f.readlines()
    for i in string[:5]:
        print(i, end="")
f.close()

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']


In [9]:
conv_fields = ["character1_id", "character2_id", "movie_id", "utterance_ids"]
conversations = []

with open(movie_conv_path, "r") as f:
    for line in f:
        string = line.split(" +++$+++ ")
        current_conv = {}
        for i, col in enumerate(conv_fields):
            current_conv[col] = string[i]
        line_ids = eval(current_conv["utterance_ids"])
        current_conv["lines"] = []
        for line_id in line_ids:
            current_conv["lines"].append(dialogues[line_id])
        conversations.append(current_conv)
f.close()

In [10]:
conversations[0]

{'character1_id': 'u0',
 'character2_id': 'u2',
 'movie_id': 'm0',
 'utterance_ids': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'line_id': 'L194',
   'character_id': 'u0',
   'movie_id': 'm0',
   'character_name': 'BIANCA',
   'logs': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'line_id': 'L195',
   'character_id': 'u2',
   'movie_id': 'm0',
   'character_name': 'CAMERON',
   'logs': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'line_id': 'L196',
   'character_id': 'u0',
   'movie_id': 'm0',
   'character_name': 'BIANCA',
   'logs': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'line_id': 'L197',
   'character_id': 'u2',
   'movie_id': 'm0',
   'character_name': 'CAMERON',
   'logs': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [11]:
qa_pairs = []
for c in conversations:
    for i in range(len(c["lines"]) - 1):
        inputs_text = c["lines"][i]["logs"].strip()
        target_text = c["lines"][i + 1]["logs"].strip()
        if inputs_text and target_text:
            qa_pairs.append([inputs_text, target_text])
print("Execution Finished!")

Execution Finished!


In [12]:
qa_pairs[:5]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.']]

In [None]:
save_file = os.path.join("Data", "processed_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
with open(save_file, "w", encoding="utf-8") as of:
    write = csv.writer(of, delimiter=delimiter)
    for i in qa_pairs:
        write.writerow(i)
of.close()

In [3]:
data_path = os.path.join("Data", "processed_lines.txt")
with open(data_path, "rb") as f:
    string = f.readlines()
    for i in string[:5]:
        print(i)
f.close()

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"


In [24]:
padding_token = 0
start_sen_token = 1
end_sen_token = 2

In [75]:
class TextProcessing:
    def __init__(self, name):
        self.name = name
        self.word_to_index = {}
        self.word_to_count = {}
        self.index_to_word = {padding_token: "PAD", start_sen_token: "SOS", end_sen_token: "EOS"}
        self.index = 3

    def add_text(self, text):
        for i in text.split():
            self.__word__(i)

    def __word__(self, word):
        if word not in self.word_to_index:
            self.word_to_index[word] = self.index
            self.word_to_count[word] = 1
            self.index_to_word[self.index] = word
            self.index += 1
        else:
            self.word_to_count[word] += 1

    def clean(self, min_counts, debug=False):
        text = []
        if debug:
            print(self.word_to_index)
        for key, val in self.word_to_count.items():
            if val >= min_counts:
                text.append(key)
        print("After removing text size reduced from {} to {}".format(len(self.word_to_index), len(text)))
        self.word_to_index.clear()
        self.word_to_count.clear()
        self.index_to_word = {padding_token: "PAD", start_sen_token: "SOS", end_sen_token: "EOS"}
        self.index = 3
        for i in text:
            self.__word__(i)

In [76]:
def get_ascii(text):
    return "".join(i for i in unicodedata.normalize("NFD", text) if unicodedata.category(i) != "Mn")

In [77]:
def normalize_text(text: str):
    text = get_ascii(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    text = re.sub(r"\s+", r" ", text).strip()
    return text

In [78]:
texts = open(data_path, "r", encoding="utf-8").read().strip().split("\n")
qa_pairs = [[normalize_text(i) for i in pair.split("\t")] for pair in texts]
print("Finished!")

Finished!


In [79]:
qa_pairs[:5]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 [''],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 [''],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?']]

In [80]:
text_process = TextProcessing("Movie dialogues")

In [81]:
len_threshold = 10

def comp(x: str, y: str):
    return len(x.split()) < len_threshold and len(y.split()) < len_threshold

def filter_qa(text):
    return [t for t in text if comp(t[0], t[1])]

In [82]:
qa_pairs = [t for t in qa_pairs if len(t) > 1]

In [83]:
len(qa_pairs)

221282

In [84]:
qa_pairs[:5]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .']]

In [85]:
cleaned_text = filter_qa(qa_pairs)

In [86]:
cleaned_text[:5]

[['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['you know chastity ?', 'i believe we share an art instructor'],
 ['have fun tonight ?', 'tons']]

In [87]:
for i in cleaned_text:
    text_process.add_text(i[0])
    text_process.add_text(i[1])

In [88]:
text_process.index

18007

In [89]:
for i in cleaned_text:
    text_process.add_text(i[0])
    text_process.add_text(i[1])

In [90]:
text_process.index

18007

In [91]:
def remove_min(voc, pairs, min_count=3):
    voc.clean(min_count, False)
    new_pairs = []

    for pair in pairs:
        inputs = pair[0]
        outputs = pair[1]
        left, right = True, True
        for word in inputs.split():
            if word not in voc.word_to_index:
                left = False
                break
        for word in outputs.split():
            if word not in voc.word_to_index:
                right = False
                break
        if left and right:
            new_pairs.append(pair)
    print("Original size: {}, current size: {}".format(len(pairs), len(new_pairs)))
    return new_pairs

In [92]:
new_qa = remove_min(text_process, cleaned_text)

After removing text size reduced from 18004 to 11353
Original size: 64271, current size: 58499


In [93]:
new_qa[:5]

[['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['have fun tonight ?', 'tons'],
 ['well no . . .', 'then that s all you had to say .']]

## Data Prepration

In [94]:
def ind_from_sen(voc, sen):
    return [voc.word_to_index[word] for word in sen.split()] + [end_sen_token]

In [97]:
ind_from_sen(text_process, new_qa[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [100]:
new_qa[1][0] + " <eos>"

'you have my word . as a gentleman <eos>'

In [113]:
inputs, outputs = [], []

for i in new_qa[:10]:
    inputs.append(i[0])
    outputs.append(i[1])
print(len(inputs))
print(inputs)

10
['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']


In [114]:
testing = [ind_from_sen(text_process, i) for i in inputs]
testing

[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 32, 22, 6, 2],
 [34, 35, 4, 4, 4, 2],
 [36, 37, 38, 39, 7, 40, 41, 42, 4, 2],
 [43, 2],
 [48, 7, 49, 41, 46, 50, 6, 2],
 [51, 52, 53, 6, 2],
 [59, 2]]

In [115]:
def zero_padding(lst, fill_val=0):
    return list(itertools.zip_longest(*lst, fillvalue=fill_val))

In [116]:
testing_2 = [len(i) for i in testing]
max(testing_2)

10

In [117]:
testing_3 = zero_padding(testing)
print(len(testing_3))
testing_3

10


[(3, 7, 16, 8, 34, 36, 43, 48, 51, 59),
 (4, 8, 4, 32, 35, 37, 2, 7, 52, 2),
 (2, 9, 2, 22, 4, 38, 0, 49, 53, 0),
 (0, 10, 0, 6, 4, 39, 0, 41, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 46, 2, 0),
 (0, 11, 0, 0, 2, 40, 0, 50, 0, 0),
 (0, 12, 0, 0, 0, 41, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 42, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [118]:
def bin_matt(lst, val=90):
    hot = []
    for k, seq, in enumerate(lst):
        hot.append([])
        for j in seq:
            if j == padding_token:
                hot[k].append(0)
            else:
                hot[k].append(1)
    return hot

In [119]:
testing_4 = bin_matt(testing_3)
testing_4

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [120]:
def input_var(lst, voc):
    ind_batch = [ind_from_sen(voc, i) for i in lst]
    ln = torch.tensor([len(i) for i in ind_batch])
    pad_list = zero_padding(ind_batch)
    pad_var = torch.LongTensor(pad_list)
    return pad_var, ln

In [125]:
def out_var(lst, voc):
    ind_batch = [ind_from_sen(voc, i) for i in lst]
    max_target = max([len(i) for i in ind_batch])
    pad_list = zero_padding(ind_batch)
    mask = bin_matt(pad_list)
    mask = torch.ByteTensor(mask)
    pad_var = torch.LongTensor(pad_list)
    return pad_var, mask, max_target

In [126]:
def train_data(voc, qa_batch):
    qa_batch.sort(key=lambda x: len(x[0].split()), reverse=True)
    input_data, output_data = [], []
    for i in qa_batch:
        input_data.append(i[0])
        output_data.append(i[1])
    ins, ln = input_var(input_data, voc)
    outs, mask, max_tar = out_var(output_data, voc)
    return ins, ln, outs, mask, max_tar

In [127]:
# Testing
train_data(text_process, [random.choice(new_qa) for i in range(5)])

(tensor([[  89,  820,  933, 7784, 1619],
         [1958,   41,    4, 6118,    6],
         [   4, 1086, 8814,    4,    2],
         [ 160, 2930,    4,    2,    0],
         [  12,    4,    2,    0,    0],
         [4467,    2,    0,    0,    0],
         [  57,    0,    0,    0,    0],
         [  46,    0,    0,    0,    0],
         [   4,    0,    0,    0,    0],
         [   2,    0,    0,    0,    0]]),
 tensor([10,  6,  5,  4,  3]),
 tensor([[  51,   60,   69, 6243, 2805],
         [   6,   38,  102,   51,    4],
         [   2,  793,   24,  232,    2],
         [   0,   21,    6,  116,    0],
         [   0,   67,    2,   77,    0],
         [   0,    2,    0,    6,    0],
         [   0,    0,    0,    2,    0]]),
 tensor([[1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1],
         [0, 1, 1, 1, 0],
         [0, 1, 1, 1, 0],
         [0, 1, 0, 1, 0],
         [0, 0, 0, 1, 0]], dtype=torch.uint8),
 7)