## BMO Script Chatbot
Uses the dialogue from Adventure Time episodes to fine-tune a chat transformer model.

#### Imports

In [30]:
import requests
import time
import re
import os
import string
import sys
from bs4 import BeautifulSoup
import numpy as np
from urllib.parse import urljoin
from tqdm import tqdm
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import spacy


In [28]:
### CODE EXECUTION FLAGS
# SCRAPE_TRANSCRIPTS = True  -> the guarded cells below fetch the transcript pages
#                               and (re)write ../data/bmo_lines.txt
# SCRAPE_TRANSCRIPTS = False -> scraping is skipped and the lines are read back
#                               from the existing ../data/bmo_lines.txt

SCRAPE_TRANSCRIPTS = False

#### Scrape transcript files from web

In [4]:
# SETUP GENERAL SCRAPER FUNCTIONS

# shared selenium browser handle; created on the first getLinkSoup() call
driver = None

def getLinkSoup(link,wait_time=1):
    """Load a page in the shared Chrome browser and return it as parsed HTML.

    Args:
        link: URL of the page to open.
        wait_time: seconds to sleep after navigating, giving the page time
            to finish loading before its source is read.

    Returns:
        A BeautifulSoup document built from the browser's page source, or
        None when validLink() rejects the link.
    """
    global driver

    # start the browser lazily and keep it around for later calls
    if driver is None:
        driver = webdriver.Chrome(ChromeDriverManager().install())

    # only navigate when the link answers successfully
    if validLink(link):
        driver.get(link)
        time.sleep(wait_time)
        return BeautifulSoup(driver.page_source, 'html.parser')

    return None

#retrieve the body content from a link without using selenium
def quickLinkSoup(link, timeout=10):
    """Fetch a page with a plain HTTP GET and return it as parsed HTML.

    Args:
        link: URL of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        A BeautifulSoup document built from the response body, or None when
        the request fails or the status code is not 200 (a message is printed
        in both cases).
    """
    try:
        response = requests.get(
            url=link,
            headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # connection errors / timeouts are reported the same way as a bad status
        print(f"> ERROR: Link [{link}] could not be fetched... ({err})")
        return None

    if response.status_code != 200:
        print(f"> ERROR: Link [{link}] not found... (Response: {response.status_code})")
        return None
    return BeautifulSoup(response.content, 'html.parser')

#check if the link is valid
def validLink(link, timeout=10):
    """Report whether a GET request to the link succeeds with status 200.

    Args:
        link: URL to probe.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        True when the response status is 200. False otherwise, including when
        the request itself fails (a message is printed in both failure cases).
    """
    try:
        response = requests.get(
            url=link,
            headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # an unreachable host is simply an invalid link for our purposes
        print(f"> ERROR: Link [{link}] could not be fetched... ({err})")
        return False

    if response.status_code != 200:
        print(f"> ERROR: Link [{link}] not found... (Response: {response.status_code})")
        return False
    return True


In [5]:
# SCRAPE FROM THE ADVENTURE TIME WIKI WEBSITE
# SAVE TO A DICTIONARY LABELED BY EPISODES (S#E#)

def getTranscriptLinkSet():
    """Collect the transcript links listed on the wiki's transcript index page.

    Each <h2> heading on the index page is paired with the next <table> that
    follows it. Every link in that table becomes one entry keyed "S<season>E<n>",
    where <n> is the 1-based position of the link inside the table.

    Returns:
        dict mapping "S#E#" keys to absolute URLs, or None when the index page
        could not be fetched.
    """
    main_link = "https://adventuretime.fandom.com/wiki/Category_talk:Transcripts"
    soup = quickLinkSoup(main_link)
    if soup is None:
        print("No soup for you!")
        return None

    transcripts = {}
    for h in soup.findAll("h2"):
        table = h.findNext("table")
        if table is None:
            continue

        # heading text without a trailing " (...)" note, e.g. "Season 1 (2010)" -> "Season 1"
        label = re.sub(r'\s\(.*\)', '', h.findChildren()[1].text)

        # "Season N" -> "N"; any other heading is filed under season "0"
        # NOTE(review): several non-season headings would share the S0 prefix and
        # overwrite each other's entries - confirm the index page has only one.
        parts = label.split(" ")
        snum = parts[1] if parts[0] == "Season" and len(parts) > 1 else "0"

        # href=True skips anchors without a target instead of raising KeyError
        anchors = table.findAll("a", href=True)
        for ep_num, a in enumerate(anchors, start=1):
            transcripts[f"S{snum}E{ep_num}"] = urljoin(main_link, a['href'])

    return transcripts
    
# Only contact the wiki when scraping is enabled; with the flag off this cell
# does not assign `transcripts` at all.
if SCRAPE_TRANSCRIPTS:
    transcripts = getTranscriptLinkSet()

dict_keys(['S0E1', 'S1E1', 'S1E2', 'S1E3', 'S1E4', 'S1E5', 'S1E6', 'S1E7', 'S1E8', 'S1E9', 'S1E10', 'S1E11', 'S1E12', 'S1E13', 'S1E14', 'S1E15', 'S1E16', 'S1E17', 'S1E18', 'S1E19', 'S1E20', 'S1E21', 'S1E22', 'S1E23', 'S1E24', 'S1E25', 'S1E26', 'S2E1', 'S2E2', 'S2E3', 'S2E4', 'S2E5', 'S2E6', 'S2E7', 'S2E8', 'S2E9', 'S2E10', 'S2E11', 'S2E12', 'S2E13', 'S2E14', 'S2E15', 'S2E16', 'S2E17', 'S2E18', 'S2E19', 'S2E20', 'S2E21', 'S2E22', 'S2E23', 'S2E24', 'S2E25', 'S2E26', 'S3E1', 'S3E2', 'S3E3', 'S3E4', 'S3E5', 'S3E6', 'S3E7', 'S3E8', 'S3E9', 'S3E10', 'S3E11', 'S3E12', 'S3E13', 'S3E14', 'S3E15', 'S3E16', 'S3E17', 'S3E18', 'S3E19', 'S3E20', 'S3E21', 'S3E22', 'S3E23', 'S3E24', 'S3E25', 'S3E26', 'S4E1', 'S4E2', 'S4E3', 'S4E4', 'S4E5', 'S4E6', 'S4E7', 'S4E8', 'S4E9', 'S4E10', 'S4E11', 'S4E12', 'S4E13', 'S4E14', 'S4E15', 'S4E16', 'S4E17', 'S4E18', 'S4E19', 'S4E20', 'S4E21', 'S4E22', 'S4E23', 'S4E24', 'S4E25', 'S4E26', 'S5E1', 'S5E2', 'S5E3', 'S5E4', 'S5E5', 'S5E6', 'S5E7', 'S5E8', 'S5E9', 'S5E10', 

In [43]:
# get the dialogue from a link
def scrapeTranscript(link,only_char='BMO'):
    """Scrape one transcript page and return its cleaned dialogue lines.

    Every <dd> element on the page is treated as one line. Bracketed
    descriptions ("[...]") are removed, runs of spaces are collapsed, and lines
    that are empty or consist of a speaker label only are dropped.

    Args:
        link: URL of the transcript page.
        only_char: speaker whose lines keep their label. Every other line has
            its speaker label replaced with "X:". Pass None to return the
            cleaned lines without any relabelling.

    Returns:
        A tuple (lines, other_chars): the list of cleaned lines and the set of
        speaker names that were replaced with "X". Returns None (not a tuple)
        when the page could not be fetched.
    """
    soup = quickLinkSoup(link)
    if soup is None:
        print("No soup for you!")
        return None

    # The speaker label is everything up to the FIRST colon. A greedy match
    # would run to the last colon and swallow dialogue such as "It's 3:00".
    speaker_re = re.compile(r'^(.*?):')

    dialogues = []
    for dd in soup.findAll("dd"):
        # non-greedy so that text between two separate [descriptions] survives
        text = re.sub(r'\[.*?\]', '', dd.text)
        text = re.sub(r' ( )+', r' ', text).strip()
        # skip blank lines and lines with nothing after the speaker label
        if text != "" and speaker_re.sub('', text).strip() != "":
            dialogues.append(text)

    if only_char is None:
        return dialogues, set()

    # NOTE(review): a label such as "BMO (singing):" does not start with "BMO:"
    # and is therefore relabelled as X - confirm whether that is intended.
    dial_edits = []
    other_chars = set()
    for d in dialogues:
        if d.startswith(f"{only_char}:"):
            dial_edits.append(d)
            continue

        speaker = speaker_re.search(d)
        if speaker is not None:
            # drop parentheticals like "(whispering)" and the space they leave behind
            name = re.sub(r'\(.*?\)', '', speaker.group(1)).strip()
            if name:
                other_chars.add(name)
        dial_edits.append(speaker_re.sub('X:', d))

    return dial_edits, other_chars

if SCRAPE_TRANSCRIPTS:
    # get all the BMO lines from all the episodes
    at_lines = {}
    all_chars = set()
    for k, link in tqdm(transcripts.items(), total=len(transcripts)):
        result = scrapeTranscript(link)
        # scrapeTranscript returns None when a page cannot be fetched; skip that
        # episode instead of failing on the tuple unpacking and losing the whole run
        if result is None:
            continue
        at_lines[k], chars = result
        all_chars.update(chars)

100%|██████████| 280/280 [01:25<00:00,  3.27it/s]


In [44]:
# export BMO lines to a text file
if SCRAPE_TRANSCRIPTS:
    exported_scripts = []
    with open("../data/bmo_lines.txt", "w+", encoding="utf-8") as out_file:
        # header section: one "+ <name>" entry per known character
        out_file.writelines(f"+ {name}\n" for name in all_chars)
        out_file.write("\n\n")

        # one "=== <episode> ===" section per episode that has a BMO line
        for ep_key, ep_lines in at_lines.items():
            if "BMO: " not in " ".join(ep_lines):
                continue
            exported_scripts.append(ep_key)

            out_file.write(f"=== {ep_key} ===\n")
            out_file.writelines(f"{line}\n" for line in ep_lines)
            out_file.write("\n\n")

    print(f"Exported {len(exported_scripts)} / {len(at_lines)} scripts to file.")

    # keep only the episodes that were written out
    bmo_lines = {ep_key: at_lines[ep_key] for ep_key in exported_scripts}

# import BMO lines from a text file (thank you Copilot!)
else:
    bmo_lines = {}
    all_chars = []
    with open("../data/bmo_lines.txt", "r", encoding="utf-8") as in_file:
        lines = in_file.readlines()
        episode = None
        for raw in lines:
            if raw.startswith("+"):
                # character entry from the header section
                all_chars.append(raw.replace("+","").strip())
                continue
            if raw.startswith("==="):
                # section marker: start collecting lines for a new episode
                episode = raw.replace("===","").strip()
                bmo_lines[episode] = []
                continue
            if raw.strip() != "":
                bmo_lines[episode].append(raw.strip())


Exported 78 / 280 scripts to file.


In [57]:
# remove the character name from the line
def replaceCharName(script,chars,exclude='BMO'):
    """Replace whole-word mentions of character names with "X".

    Args:
        script: iterable of dialogue lines (strings).
        chars: iterable of character names to anonymise.
        exclude: name that is never replaced.

    Returns:
        A new list with one line per input line, in the same order.
    """
    # Build each pattern once. re.escape keeps names that contain regex
    # metacharacters (".", "?", "(" ...) literal instead of raising re.error
    # or matching unrelated text. Empty names are skipped because r"\b\b"
    # would insert an "X" at every word boundary.
    patterns = [
        (c, re.compile(rf"\b{re.escape(c)}\b"))
        for c in chars
        if c and c != exclude
    ]

    anon_script = []
    for l in script:
        nl = l
        for c, pattern in patterns:
            # membership is tested on the original line, not the partly rewritten one
            if c in l:
                nl = pattern.sub("X", nl)
        anon_script.append(nl)
    return anon_script


# get the BMO lines from the episodes
# anonymised copy of every episode script, keyed by the same episode labels
bmo_lines_anon = {
    episode: replaceCharName(script, all_chars)
    for episode, script in bmo_lines.items()
}

In [56]:
# print(bmo_lines_anon['S5E28'])

['BMO: Hello?', 'X: BMO?', 'X: BMO?', 'X: BMO, are you okay?', 'X: What did you do?', "BMO: It wasn't me. I didn't do noth- do noth- do nothing.", 'X: Come on BMO. We got to get you to the hospital or whatevs.', 'BMO: No, no. I am fine. Really. Oh, yes. Okay. Please take me to get fixed. I need- need- need- need to get new core system drivers installed. We can get them at the X  Factory in the Bad Lands, where I was born.', 'X: The X  Factory?', "BMO: Yeah. I am programmed with emergency instructions to get there. Come on! There's no- no- no- no- no time to lo- lo- lo- lo- lose.", "BMO: Okay. Now straight down this place. Okay, now take a right- right- right- right- left up here. The instructions also say that damaged X  get a full memoryX wipe before repairs. Don't let on what happened... or all the years we've been spent together will vanish like tears in the oven!", 'X: Tears in the oven? !', "BMO: Yeah, no fooling. Oh wait, there's more! It says here only X  are allowed in the fact