## Collect data

In [1]:
# Needed libraries
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# IMSDb script pages whose dialogues will be extracted (one entry per movie)
urls = [
    "https://imsdb.com/scripts/A-Most-Violent-Year.html",
    "https://imsdb.com/scripts/Absolute-Power.html",
    "https://imsdb.com/scripts/Reservoir-Dogs.html",
    "https://imsdb.com/scripts/Natural-Born-Killers.html",
    "https://imsdb.com/scripts/Jackie-Brown.html",
    "https://imsdb.com/scripts/Four-Rooms.html",
    "https://imsdb.com/scripts/Catch-Me-If-You-Can.html",
    "https://imsdb.com/scripts/Ex-Machina.html",
    "https://imsdb.com/scripts/Heist.html",
    "https://imsdb.com/scripts/Invictus.html",
    "https://imsdb.com/scripts/Only-God-Forgives.html",
    "https://imsdb.com/scripts/Passengers.html",
    "https://imsdb.com/scripts/Quantum-Project.html",
]

* Auxiliary functions and Data Structures to deal with the text

In [3]:
def clean_line(input_text):
    """
    Cleans a given line of unnecessary characters.

    Parameters
    ----------
    input_text : object with a ``text`` attribute
        The element whose ``text`` is cleaned.

    Returns
    -------
    str
        The text up to the first blank line ("\\r\\n\\r\\n"), with every
        parenthesised aside removed and all whitespace runs collapsed to a
        single space. May be the empty string.
    """
    text = input_text.text

    # Keep only what comes before the first blank line
    blank_at = text.find("\r\n\r\n")
    if blank_at != -1:
        text = text[:blank_at]

    # Remove every "(...)" aside. Innermost pairs go first and the pass is
    # repeated until nothing changes, so nested asides disappear as well.
    previous = None
    while previous != text:
        previous = text
        text = re.sub(r"\([^()]*\)", "", text)

    # Any "(" still present has no matching ")": drop it and what follows
    # instead of slicing with a -1 index, which used to duplicate the line.
    dangling = text.find("(")
    if dangling != -1:
        text = text[:dangling]

    # Normalise whitespace: carriage returns vanish, newlines become spaces
    text = text.strip()
    text = text.replace("\r", "")
    text = text.replace("\n", " ")
    return re.sub(r"\s+", " ", text)

In [4]:
def appropiate_sent(text):
    """
    Tell whether a line is usable: it must be non-empty and free of digits.

    Returns True for a usable line, False otherwise.
    """
    if text == "":
        return False

    for character in text:
        if character.isdigit():
            return False

    return True

In [5]:
class Sentence:
    """One line of dialogue: the speaker (``who``) and the speech (``what``)."""

    def __init__(self, who, what):
        self.who, self.what = who, what

    def print_sentence(self):
        """Return the sentence formatted as "who: what" (nothing is printed)."""
        return ": ".join((self.who, self.what))
    
class Dialogue:
    """A run of sentences together with counts of its sentences and speakers."""

    def __init__(self, sentences):
        self.num_sentences = len(sentences)
        self.sentences = sentences
        # Distinct speaker names; a set is used, so their order is arbitrary
        distinct_speakers = {sentence.who for sentence in sentences}
        self.speakers = list(distinct_speakers)
        self.num_speakers = len(self.speakers)

    def print_dialogue(self):
        """Print each sentence on its own line, followed by one blank line."""
        rendered = [sentence.print_sentence() + "\n" for sentence in self.sentences]
        print("".join(rendered))



In [6]:
def check_phrase(phrase, max_words=20):
    """
    Checks if the phrase is smaller than ``max_words`` words and trims it if not.

    A phrase under the limit is returned untouched. Otherwise it is split
    into sentences at "? ", ".", "!" and ";", and the longest run of whole
    trailing sentences whose total stays under the limit is returned.

    Parameters
    ----------
    phrase : str
        The text to check.
    max_words : int, optional
        Exclusive upper bound on the number of words (default 20).

    Returns
    -------
    str
        ``phrase`` itself, the trimmed tail of it, or "" when not even the
        last sentence fits under the limit.
    """
    if len(phrase.split()) < max_words:
        return phrase

    # The capture group keeps the delimiters, so the list alternates
    # text, delimiter, text, ..., text (always an odd number of items).
    pieces = re.split(r"(\? |\.|!|;)", phrase)

    sentences = []
    for idx in range(0, len(pieces), 2):
        body = pieces[idx].strip()
        mark = pieces[idx + 1].strip() if idx + 1 < len(pieces) else ""
        if body:
            sentences.append(body + mark)
        elif mark and sentences:
            # Punctuation with no text before it (e.g. "...") belongs to
            # the sentence that precedes it.
            sentences[-1] += mark

    # Walk backwards from the end, stopping at the first sentence that
    # would reach the limit.
    kept = []
    used = 0
    for sentence in reversed(sentences):
        size = len(sentence.split())
        if used + size >= max_words:
            break
        kept.append(sentence)
        used += size

    kept.reverse()
    return " ".join(kept)

* Function that combines the helper functions above and writes a text file containing the dialogues of the movie script found at a given URL

In [7]:
# Marker inserted wherever a non-dialogue line (a scene change) was found
_SCENE_BREAK = "-------"


def _fetch_script_lines(url):
    """Download the page and return the children of its script <pre>, or None on failure."""
    try:
        # Without a timeout a stalled server would block the notebook forever
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException:
        print("Error al abrir la URL")
        return None

    soup = BeautifulSoup(page.text, 'html.parser')
    # Look for the <pre> tag that holds the script and take its content
    content = soup.find('pre')
    if content is None:
        return None

    children = list(content)
    # A single child is checked for a nested <pre> that holds the real script
    if len(children) == 1:
        nested = content.find('pre')
        if nested is not None:
            children = list(nested)
    return children


def _mark_scene_breaks(children):
    """Clean the lines that start with a space; replace the rest by a scene-break marker."""
    parsed = []
    for node in children:
        # Lines that start with " " were found to be the dialogue lines
        if node.text.startswith(" "):
            parsed.append(clean_line(node))
        else:
            parsed.append(_SCENE_BREAK)
    return parsed


def _drop_noise(parsed):
    """Collapse consecutive scene breaks and discard empty or digit-bearing lines."""
    cleaned = [_SCENE_BREAK]
    for line in parsed:
        # Compare against the marker itself: a substring test on "-" would
        # mistake hyphenated speech for a scene break.
        if line == _SCENE_BREAK:
            if cleaned[-1] != _SCENE_BREAK:
                cleaned.append(line)
        elif appropiate_sent(line):
            cleaned.append(line)
    return cleaned


def _group_dialogues(cleaned):
    """Pair each upper-case speaker line with the line after it and group the pairs."""
    dialogues = []
    pending = []
    i = 0
    while i < len(cleaned) - 1:
        speaker, speech = cleaned[i], cleaned[i + 1]
        if speaker.isupper() and not speech.isupper() and "--" not in speech:
            pending.append(Sentence(who=speaker, what=speech))
            i += 2
        else:
            # Anything else ends the dialogue that was being collected
            if pending:
                dialogues.append(Dialogue(pending))
                pending = []
            i += 1

    # Flush the dialogue still open when the script ends
    if pending:
        dialogues.append(Dialogue(pending))
    return dialogues


def _write_pairs(url, dialogues):
    """Write tab-separated (line, reply) pairs of two-speaker dialogues to '<script>.txt'."""
    name_file = re.findall(r"scripts/(.*)\.html", url)[0] + ".txt"
    with open(name_file, "w", encoding="utf-8") as f:
        for dialogue in dialogues:
            if dialogue.num_speakers != 2 or dialogue.num_sentences <= 1:
                continue

            question = check_phrase(dialogue.sentences[0].what)
            # Every sentence after the first is a reply, the last one included
            for sentence in dialogue.sentences[1:]:
                answer = check_phrase(sentence.what)
                # check_phrase returns "" for a phrase it could not shorten
                if question != "" and answer != "":
                    f.write(question + "\t" + answer + "\n")
                question = answer


def extract_dialogue(url):
    """
    Extract the dialogues of the movie script at ``url`` and save them to disk.

    The page is downloaded, its script lines are cleaned and grouped into
    ``Dialogue`` objects, and for every dialogue with exactly two speakers
    each consecutive pair of lines is written as "line<TAB>reply" to a text
    file named after the script (the part of the URL between "scripts/" and
    ".html", plus ".txt"). No file is written when no dialogue is found.

    Parameters
    ----------
    url : str
        Address of the script page.

    Returns
    -------
    list
        The ``Dialogue`` objects found, or an empty list when the page
        cannot be downloaded or contains no <pre> tag.

    Raises
    ------
    IndexError
        If dialogues were found but ``url`` does not match
        "scripts/<name>.html", so no file name can be derived.
    """
    children = _fetch_script_lines(url)
    if children is None:
        return []

    parsed_doc = _mark_scene_breaks(children)
    cleaned_doc = _drop_noise(parsed_doc)
    structured_doc = _group_dialogues(cleaned_doc)

    if structured_doc:
        _write_pairs(url, structured_doc)

    return structured_doc

* Extracting dialogues

In [10]:
# Process every script URL in turn. The returned dialogue list is discarded:
# what matters here is the text file each call writes when it finds dialogues.
for url in urls:
    extract_dialogue(url)