# 10th June Notebook (chat bot)

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import os
import re
import csv
import unicodedata
import codecs
import itertools

In [30]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Data Preprocessing

In [3]:
movie_lines_path = os.path.join("Data", "movie_lines.txt")
movie_conv_path = os.path.join("Data", "movie_conversations.txt")

In [4]:
# Checking text format
with open(movie_lines_path, "r") as f:
    txt = f.readlines()
    for i in txt[:5]:
        print(i, end="")

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.


In [5]:
line_fields = ["line_id", "character_id", "movie_id", "character_name", "logs"]
dialogues = {}

with open(movie_lines_path, "r") as f:
    for line in f:
        string = line.split(" +++$+++ ")
        current_line = {}
        for i, col in enumerate(line_fields):
            current_line[col] = string[i]
        dialogues[string[0]] = current_line
f.close()
print("Completed!")

Completed!


In [6]:
len(dialogues)

304713

In [7]:
dialogues["L1044"]

{'line_id': 'L1044',
 'character_id': 'u2',
 'movie_id': 'm0',
 'character_name': 'CAMERON',
 'logs': 'They do to!\n'}

In [8]:
with open(movie_conv_path, "r") as f:
    string = f.readlines()
    for i in string[:5]:
        print(i, end="")
f.close()

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']


In [9]:
conv_fields = ["character1_id", "character2_id", "movie_id", "utterance_ids"]
conversations = []

with open(movie_conv_path, "r") as f:
    for line in f:
        string = line.split(" +++$+++ ")
        current_conv = {}
        for i, col in enumerate(conv_fields):
            current_conv[col] = string[i]
        line_ids = eval(current_conv["utterance_ids"])
        current_conv["lines"] = []
        for line_id in line_ids:
            current_conv["lines"].append(dialogues[line_id])
        conversations.append(current_conv)
f.close()

In [10]:
conversations[0]

{'character1_id': 'u0',
 'character2_id': 'u2',
 'movie_id': 'm0',
 'utterance_ids': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'line_id': 'L194',
   'character_id': 'u0',
   'movie_id': 'm0',
   'character_name': 'BIANCA',
   'logs': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'line_id': 'L195',
   'character_id': 'u2',
   'movie_id': 'm0',
   'character_name': 'CAMERON',
   'logs': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'line_id': 'L196',
   'character_id': 'u0',
   'movie_id': 'm0',
   'character_name': 'BIANCA',
   'logs': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'line_id': 'L197',
   'character_id': 'u2',
   'movie_id': 'm0',
   'character_name': 'CAMERON',
   'logs': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [11]:
qa_pairs = []
for c in conversations:
    for i in range(len(c["lines"]) - 1):
        inputs_text = c["lines"][i]["logs"].strip()
        target_text = c["lines"][i + 1]["logs"].strip()
        if inputs_text and target_text:
            qa_pairs.append([inputs_text, target_text])
print("Execution Finished!")

Execution Finished!


In [12]:
qa_pairs[:5]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.']]

In [None]:
save_file = os.path.join("Data", "processed_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
with open(save_file, "w", encoding="utf-8") as of:
    write = csv.writer(of, delimiter=delimiter)
    for i in qa_pairs:
        write.writerow(i)
of.close()

In [31]:
data_path = os.path.join("Data", "processed_lines.txt")
with open(data_path, "rb") as f:
    string = f.readlines()
    for i in string[:5]:
        print(i)
f.close()

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"


In [32]:
padding_token = 0
start_sen_token = 1
end_sen_token = 2

In [45]:
class TextProcessing:
    def __init__(self, name):
        self.name = name
        self.word_to_index = {}
        self.word_to_count = {}
        self.index_to_word = {padding_token: "PAD", start_sen_token: "SOS", end_sen_token: "EOS"}
        self.index = 3

    def add_text(self, text):
        for i in text.split():
            self.__word__(i)

    def __word__(self, word):
        if word not in self.word_to_index:
            self.word_to_index[word] = self.index
            self.word_to_count[word] = 1
            self.index_to_word[self.index] = word
            self.index += 1
        else:
            self.word_to_count[word] += 1

    def clean(self, min_counts):
        text = []
        for key, val in self.word_to_count:
            if val >= min_counts:
                text.append(key)
        print("After removing text size reduced from {} to {}".format(len(self.word_to_index), len(text)))
        self.word_to_index.clear()
        self.word_to_count.clear()
        self.index_to_word = {padding_token: "PAD", start_sen_token: "SOS", end_sen_token: "EOS"}
        self.index = 3
        for i in text:
            self.__word__(i)

In [34]:
def get_ascii(text):
    return "".join(i for i in unicodedata.normalize("NFD", text) if unicodedata.category(i) != "Mn")

In [35]:
def normalize_text(text: str):
    text = get_ascii(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    text = re.sub(r"\s+", r" ", text).strip()
    return text

In [36]:
texts = open(data_path, "r", encoding="utf-8").read().strip().split("\n")
qa_pairs = [[normalize_text(i) for i in pair.split("\t")] for pair in texts]
print("Finished!")

Finished!


In [37]:
qa_pairs[:5]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 [''],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 [''],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?']]

In [46]:
text_process = TextProcessing("Movie dialogues")

In [39]:
len_threshold = 10

def comp(x: str, y: str):
    return len(x.split()) < len_threshold and len(y.split()) < len_threshold

def filter_qa(text):
    return [t for t in text if comp(t[0], t[1])]

In [40]:
qa_pairs = [t for t in qa_pairs if len(t) > 1]

In [51]:
len(qa_pairs)

221282

In [41]:
qa_pairs[:5]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .']]

In [42]:
cleaned_text = filter_qa(qa_pairs)

In [43]:
cleaned_text[:5]

[['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['you know chastity ?', 'i believe we share an art instructor'],
 ['have fun tonight ?', 'tons']]

In [49]:
for i in cleaned_text:
    text_process.add_text(i[0])
    text_process.add_text(i[1])

In [50]:
text_process.index

18007