<h2><font color = "orange">Preparing Cornell Dialogs Data</font></h2>

In [149]:
import os
import sys
import zipfile

import requests

from chatbot import logger

<h3><font color = "green">Get Data</font></he>

In [174]:
cornell_movie_dialogs_data_url = "https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip"
local_compressed_data_path = "./data/cornell_movie_dialogs/raw/dialogs.zip"
local_extracted_data_path = "./data/cornell_movie_dialogs/raw/"

In [175]:
def download_data(source: str, destination: str) -> str:
    """Downloads a dataset, i.e. a csv or a zipped file from a source
    url to a local destination path.

    If the download fails, the code exists with an error message.

    Args:
        source (str): A url to the data file.
        destination (str): A local filepath to save
            the downloaded data.

    Returns:
        str: The path to the downloaded data.
    """
    if not os.path.isdir(os.path.dirname(destination)):
        os.makedirs(os.path.dirname(destination), exist_ok=True)

    response = requests.get(url=source)

    if response.status_code == 200:
        logger.info(f"Downloading data from {source} to {destination}")
        with open(file=destination, mode="wb") as f:
            f.write(response.content)
    else:
        logger.error(f"Failed to download data. Status code: {response.status_code}")
        sys.exit(1)

    return destination

def unzip_data(compressed_file: str, destination_dir: str) -> str:
    """Extracts the content of a compressed (`.zip`) file
    to a local directory and removes the compressed file afterwards.

    Note:
        If the destination directory and/or its subdirectories 
        do not exist, they will be created.
    
    Args:
        compressed_file (str): The path to the compressed file.
        destination (str): Local directory path to extract the 
            content of the compressed file.
    
    Returns:
        str: The destination directory path.
    
    Raises:
        FileNotFoundError: If the compressed file does not exist.
    """
    if not os.path.exists(path=compressed_file):
        raise FileNotFoundError(
            f"The compressed file '{compressed_file}' does not exist."
        )

    if not os.path.isdir(s=destination_dir):
        os.makedirs(destination_dir, exist_ok=True)

    logger.info(f"Unzipping data {compressed_file} to {destination_dir}")
    with zipfile.ZipFile(file=compressed_file, mode="r") as zip_ref:
        zip_ref.extractall(path=destination_dir)
    os.remove(path=compressed_file)
        
    return destination_dir

In [176]:
# Download the data
data_path = download_data(source=cornell_movie_dialogs_data_url, destination=local_compressed_data_path)

# Extract data
extracted_data = unzip_data(compressed_file=data_path, destination_dir=local_extracted_data_path)

[2;36m2024-03-20 21:53:45,201: [0m [37mDEBUG   [0m [37m Starting new HTTPS connection (1): zissou.infosci.cornell.edu:443[0m
[2;36m2024-03-20 21:53:45,881: [0m [37mDEBUG   [0m [37m https://zissou.infosci.cornell.edu:443 "GET /convokit/datasets/movie-corpus/movie-corpus.zip HTTP/1.1" 200 40854701[0m
[2;36m2024-03-20 21:54:07,531: [0m [32mINFO    [0m [32m Downloading data from https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip to ./data/cornell_movie_dialogs/raw/dialogs.zip[0m
[2;36m2024-03-20 21:54:07,557: [0m [32mINFO    [0m [32m Unzipping data ./data/cornell_movie_dialogs/raw/dialogs.zip to ./data/cornell_movie_dialogs/raw/[0m


<h3><font color = "green">Prepare Data</font></h3>

In [177]:
import codecs
import csv
import json
from typing import Tuple, Dict, Any, List

In [178]:
movie_corpus_path = os.path.join(local_extracted_data_path, "movie-corpus")
utterances_path = os.path.join(movie_corpus_path, "utterances.jsonl")
processed_data_path = "./data/cornell_movie_dialogs/processed/formatted_dialogs.txt"

<h4><font color = "cyan">Create Formatted File</font></h4>

In [179]:
def load_lines_and_conversations(filepath: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Splits each line of the target file to create 
    lines and conversations.
    
    Each line in the created lines consists of a single 
    sentece, and each conversation consists of multiple 
    lines (sentences).

    Args:
        filepath (str): The path to the target file.
    
    Returns:
        Tuple[Dict[str, Any], Dict[str, Any]]:
            The individual lines and conversations.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(
            f"The provided file '{filepath}' does not exist."
        )

    lines = {}
    conversations = {}
    with open(file=filepath, mode="r", encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            line_json = json.loads(s=line)
            
            # Extract fields for line object
            line_obj = {}
            line_obj["line_id"] = line_json["id"]
            line_obj["character_id"] = line_json["speaker"]
            line_obj["text"] = line_json["text"]        
            lines[line_obj["line_id"]] = line_obj

            # Extract fields for conversation object
            if line_json["conversation_id"] not in conversations:
                conv_obj = {}
                conv_obj["conversation_id"] = line_json["conversation_id"]
                conv_obj["movie_id"] = line_json["meta"]["movie_id"]
                conv_obj["lines"] = [line_obj]
            else:
                conv_obj = conversations[line_json["conversation_id"]]
                conv_obj["lines"].append(line_obj)
                conv_obj["lines"] = sorted(
                    conv_obj["lines"], key=lambda item: item["line_id"]
                )
            conversations[conv_obj["conversation_id"]] = conv_obj

    return lines, conversations

def extract_sentence_pairs(conversations: Dict[str, Any]) -> List[List[str]]:
    """Extracts pairs of query-answer sentences from 
    the conversations dictionary.
    
    Args:
        conversations (Dict[str, Any]): The conversations
            dictionary.
    
    Returns:
        List[List[str]]: All the query-answer sentence 
            pairs.
    """
    qa_pairs = []
    for conversation in conversations.values():
        for i in range(len(conversation["lines"]) - 1):
            query_sentence = conversation["lines"][i]["text"].strip()
            target_sentence = conversation["lines"][i+1]["text"].strip()

            # Filter wrong samples (if one of the lists is empty)
            if query_sentence and target_sentence:
                qa_pairs.append([query_sentence, target_sentence])
    return qa_pairs

def create_formatted_file(
        conversations: Dict[str, Any], 
        destination: str,
        delimiter: str = "\t"
) -> None:
    """Creates a `.txt` file in the target destination
    from the provided conversations dictionary.

    Each line of the file contains a tab-seperated query
    and a response sentence pair.

    Args:
        conversations (Dict[str, Any]): The conversations 
            dictionary.
        destination (str): Local filepath to create and 
            save the formatted file.
        delimiter (str): The string to seperate the query
            sentence from the response sentence. 
            Defaults to ``'\t'``.
    """
    if not os.path.isdir(os.path.dirname(destination)):
        os.makedirs(name=os.path.dirname(destination), exist_ok=True)

    delimiter = str(codecs.decode(delimiter, "unicode_escape"))

    logger.info(f"Creating formatted file at '{destination}'\n")
    with open(file=destination, mode="w", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=delimiter, lineterminator="\n")
        for pair in extract_sentence_pairs(conversations=conversations):
            writer.writerow(pair)

def print_lines(file: str, n: int = 10) -> None:
    """Prints a `n` number of lines from a file.
    
    Args:
        file (str): Path to the file.
        n (int, optional): Number of lines
            to print (default=10).
    """
    with open(file=file, mode="rb") as f:
        lines = f.readlines()
    for line in lines[:n]:
        print(line)

In [180]:
lines, conversations = load_lines_and_conversations(filepath=utterances_path)

In [181]:
sentence_pairs = extract_sentence_pairs(conversations=conversations)
sentence_pairs

[['They do to!', 'They do not!'],
 ['She okay?', 'I hope so.'],
 ['Wow', "Let's go."],
 ['I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
  'No'],
 ['No', "Okay -- you're gonna need to learn how to lie."],
 ["I figured you'd get to the good stuff eventually.", 'What good stuff?'],
 ['What good stuff?', 'The "real you".'],
 ['The "real you".', 'Like my fear of wearing pastels?'],
 ['do you listen to this crap?', 'What crap?'],
 ['What crap?',
  "Me.  This endless ...blonde babble. I'm like, boring myself."],
 ["Me.  This endless ...blonde babble. I'm like, boring myself.",
  'Thank God!  If I had to hear one more story about your coiffure...'],
 ['Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
  'No...'],
 ['Well, no...', "Then that's all you had to say."],
 ["Then that's all you had to say.", 'But'],
 ['But', 'You always been this selfish?'],
 ['I looked for you back at the party, but 

In [182]:
create_formatted_file(
    conversations=conversations,
    destination=processed_data_path,
    delimiter="\t"
)

[2;36m2024-03-20 21:54:10,675: [0m [32mINFO    [0m [32m Creating formatted file at './data/cornell_movie_dialogs/processed/formatted_dialogs.txt'
[0m
