## Imports

In [1]:
from datasets import load_dataset
import numpy as np
import json
from tqdm import tqdm

## Dialog templates
Templates for converting dialogs to prompts

In [2]:
DIALOG_TEMPLATES = {
    ### template for 4+ line dialogs
    "four_more_lines": [
        """
Here's a {template} between {char1} and {char2} in a scene from a {genre} movie
    {dialogue1}
User : Can you continue the {template}
Assistant : Sure, the next dialogue for this scene could be
    {dialogue2}
 """,
        """
    {dialogue1}
User : Can you provide more dialog assuming {genre} movie
    {dialogue2}
""",
        """
I'm trying to complete the dialog for my characters {char1} and {char2}. Here's the {template}, Please help me complete it
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
        """
User : Assume {char1} and {char2} are characters from a {genre} movie, continue the conversation between them
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
    ],
    ## template for 4 line dialogs
    "four_lines": [
        """
    {dialogue1}
User : provide a response assuming you're {char2}
Assistant : Sure
    {dialogue2}
""",
        """
    {dialogue1}
User : respond as {char2} to complete the conversation
Assistant : Sure
    {dialogue2}
""",
    ],
}

## Code

- download movies_lines.txt from [here](https://www.kaggle.com/datasets/rajathmc/cornell-moviedialog-corpus)

In [6]:
def get_movie_dialogs():

    with open("/home/shahul/Data/movie_lines.txt", "rb") as f:
        movie_lines_data = [x.decode("latin").split("+++$+++") for x in f.readlines()]
    movie_dialog_dict = {}
    for dialog in movie_lines_data:
        movie_dialog_dict[dialog[0].strip()] = {
            "characterID": dialog[1].strip(),
            "characterName": dialog[3].strip(),
            "text": dialog[-1].strip(),
        }

    return movie_dialog_dict

In [34]:
def get_dialogs(dialog_dict, start, end):

    dialog_list = []
    for idx in range(start, end + 1):
        dialog_list.append(dialog_dict[f"L{idx}"]["characterName"] + ": " + dialog_dict[f"L{idx}"]["text"])
    num_lines = len(dialog_list)

    assert num_lines >= 1, "Number of lines should be greater than one"

    if (num_lines < 6):
        dialog1 = "\n    ".join(dialog_list[:-1])
        dialog2 = dialog_list[-1]
    else:
        dialog_len = np.random.randint(3, (num_lines // 2) + 1)
        dialog1 = "\n    ".join(dialog_list[:dialog_len])
        dialog2 = "\n    ".join(dialog_list[dialog_len:])

    return dialog1, dialog2


def choose_prompt(num_lines):

    assert num_lines >= 1, "Number of lines should be greater than one"

    if num_lines < 6:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_lines"])

    else:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_more_lines"])

    return prompt


def convert_to_prompts(dataset, movie_dialog_dict, output_dir=".", split="train"):

    with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:

        i = 0
        while i < len(dataset["train"]):

            max_lines = np.random.randint(7, 12)
            data = dataset[split][i]
            lineids = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
            num_lines = len(lineids)
            char_ids = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
            while num_lines < max_lines:
                i += 1
                data = dataset[split][i]
                char_id_new = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
                ## make sure that characters are the same
                if char_id_new == char_ids:
                    lineids_new = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
                    if lineids_new[0] == (lineids[-1] + 1):  ##ensure continuety
                        lineids.extend(lineids_new)
                    else:
                        break
                else:
                    break
                num_lines = len(lineids)

            genre = "-".join(data["movieGenres"][:2])
            template = np.random.choice(["dialog", "script", "play"])
            char1 = movie_dialog_dict[f"L{lineids[0]}"]["characterName"]
            
            if num_lines < 6 :
                if num_lines % 2 == 0:
                    char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]
                else:
                    char2 = char1
            else:
                char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]
            
            
            dialogue1, dialogue2 = get_dialogs(movie_dialog_dict, lineids[0], lineids[-1])
            prompt = choose_prompt(num_lines)
            
            prompt = prompt.format(
                char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template
            )
            i+=1
            output.write(f"{json.dumps({'conversation': prompt})}\n")

In [8]:
movie_dialog_dict = get_movie_dialogs()
dataset = load_dataset("cornell_movie_dialog")

Found cached dataset cornell_movie_dialog (/home/shahul/.cache/huggingface/datasets/cornell_movie_dialog/default/0.1.0/b67b3433cf894b551cddcd82efdff0826f39b39a11d5c149e746a546a8dc85f3)


  0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
convert_to_prompts(dataset, movie_dialog_dict)

## Upload as HF Dataset

In [36]:
dataset_ = load_dataset("json",data_files={"train":"./train.jsonl"})

Using custom data configuration default-dcc8e5aadbb9b2bf


Downloading and preparing dataset json/default to /home/shahul/.cache/huggingface/datasets/json/default-dcc8e5aadbb9b2bf/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/shahul/.cache/huggingface/datasets/json/default-dcc8e5aadbb9b2bf/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [41]:
dataset_.push_to_hub("shahules786/OA-cornell-movies-dialog")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

## Load Dataset from HF

In [11]:
dataset_ = load_dataset("shahules786/OA-cornell-movies-dialog")

Downloading readme:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Using custom data configuration shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e


Downloading and preparing dataset None/None to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/39877 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [42]:
for i in range(0,5):
    print("##")
    print(dataset_["train"][i]["conversation"])

##

Here's a dialog between BIANCA and CAMERON in a scene from a comedy-romance movie
    BIANCA: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
    CAMERON: Well, I thought we'd start with pronunciation, if that's okay with you.
    BIANCA: Not the hacking and gagging and spitting part.  Please.
    CAMERON: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
    BIANCA: You're asking me out.  That's so cute. What's your name again?
User : Can you continue the dialog
Assistant : Sure, the next dialogue for this scene could be
    CAMERON: Forget it.
    BIANCA: No, no, it's my fault -- we didn't have a proper introduction ---
    CAMERON: Cameron.
    BIANCA: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
    CAMERON: Seems like she could get a date easy enough...
 
##

    CAMERON: Why?
    BIANCA: 