## Imports

In [5]:
from datasets import load_dataset
import numpy as np
import json
from tqdm import tqdm

## Dialog templates
Templates for converting dialogs to prompts

In [6]:
DIALOG_TEMPLATES = {
    ### template for 4+ line dialogs
    "four_more_lines": [
        """
Here's a {template} between {char1} and {char2} in a scene from a {genre} movie
    {dialogue1}
User : Can you continue the {template}
Assistant : Sure, the next dialogue for this scene could be
    {dialogue2}
 """,
        """
    {dialogue1}
User : Can you provide more dialog assuming {genre} movie
    {dialogue2}
""",
        """
I'm trying to complete the dialog for my characters {char1} and {char2}.Here's the {template}, Please help me complete it
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
        """
User : Assume {char1} and {char2} are characters from a {genre} movie, continue the conversation between them
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
    ],
    ## template for 4 line dialogs
    "four_lines": [
        """
    {dialogue1}
User : provide a response assuming you're {char1}
Assistant : Sure
    {dialogue2}
""",
        """
    {dialogue1}
User : respond as {char1} to complete the conversation
Assistant : Sure
    {dialogue2}
""",
    ],
}

## Code

- download movies_lines.txt from [here](https://www.kaggle.com/datasets/rajathmc/cornell-moviedialog-corpus)

In [7]:
def get_movie_dialogs():

    with open("./movie_lines.txt", "rb") as f:
        movie_lines_data = [x.decode("latin").split("+++$+++") for x in f.readlines()]
    movie_dialog_dict = {}
    for dialog in movie_lines_data:
        movie_dialog_dict[dialog[0].strip()] = {
            "characterID": dialog[1].strip(),
            "characterName": dialog[3].strip(),
            "text": dialog[-1].strip(),
        }

    return movie_dialog_dict

In [8]:
def get_dialogs(dialog_dict, start, end):

    dialog_list = []
    for idx in range(start, end + 1):
        dialog_list.append(dialog_dict[f"L{idx}"]["characterName"] + ": " + dialog_dict[f"L{idx}"]["text"])
    num_lines = len(dialog_list)

    assert num_lines >= 1, "Number of lines should be greater than one"

    if num_lines < 6:
        dialog1 = "\n    ".join(dialog_list[:-1])
        dialog2 = dialog_list[-1]
    else:
        dialog_len = np.random.randint(3, (num_lines // 2) + 1)
        dialog1 = "\n    ".join(dialog_list[:dialog_len])
        dialog2 = "\n    ".join(dialog_list[dialog_len:])

    return dialog1, dialog2


def choose_prompt(num_lines):

    assert num_lines >= 1, "Number of lines should be greater than one"

    if num_lines < 6:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_lines"])

    else:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_more_lines"])

    return prompt


def convert_to_prompts(dataset, movie_dialog_dict, output_dir=".", split="train"):

    with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:

        i = 0
        while i < len(dataset["train"]):

            max_lines = np.random.randint(7, 10)
            data = dataset[split][i]
            lineids = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
            num_lines = len(lineids)
            char_ids = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
            while num_lines < max_lines:
                i += 1
                data = dataset[split][i]
                char_id_new = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
                ## make sure that characters are the same
                if char_id_new == char_ids:
                    lineids_new = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
                    if lineids_new[0] == (lineids[-1] + 1):  ##ensure continuety
                        lineids.extend(lineids_new)
                    else:
                        break
                else:
                    break
                num_lines = len(lineids)

            genre = "-".join(data["movieGenres"][:2])
            template = np.random.choice(["dialog", "script", "play"])
            char1 = movie_dialog_dict[f"L{lineids[0]}"]["characterName"]
            char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]
            dialogue1, dialogue2 = get_dialogs(movie_dialog_dict, lineids[0], lineids[-1])
            prompt = choose_prompt(num_lines)
            prompt = prompt.format(
                char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template
            )

            output.write(f"{json.dumps({'conversation': prompt})}\n")

In [9]:
movie_dialog_dict = get_movie_dialogs()
dataset = load_dataset("cornell_movie_dialog")

Found cached dataset cornell_movie_dialog (/home/shahul/.cache/huggingface/datasets/cornell_movie_dialog/default/0.1.0/b67b3433cf894b551cddcd82efdff0826f39b39a11d5c149e746a546a8dc85f3)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
convert_to_prompts(dataset, movie_dialog_dict)


    EMIL: Listen to me.  I don't want sex.  Just give me the address and then you go.
    HONEY: Look, man, I don't give a shit if you want sex or not, but you're payin' for my time.
    EMIL: Give me the address!!
User : provide a response assuming you're EMIL
Assistant : Sure
    HONEY: Alright, alright - don't hurt me! Please, it's in my book, in my purse!


User : Assume RICK and ALICE are characters from a fantasy-horror movie, continue the conversation between them
    RICK: I heard you screaming.  Was it a bad one?
    ALICE: It was bad.
    RICK: Doesn't the dream master work for you anymore?
Assistant : Sure
    ALICE: I can't find him.
    RICK: Hey, since when do you play Thomas Edison?  This looks like Sheila's.
    ALICE: It is...was. It's a zapper, it might help me stay awake.
    RICK: Yeah, or turn you into toast.


    RANDY: Yes?
    MRS. HAMMEN: Oh, Stewardess. My husband is very sick. Can you do something, please?
    RANDY: Well, the doctor will be with you in jus


    BOZO: SHIT.
User : provide a response assuming you're BOZO
Assistant : Sure
    HEROINE: MOVE YOUR ASS!


    SCOTT: Why did they block the door again?
User : respond as SCOTT to complete the conversation
Assistant : Sure
    JACOB: To keep the daylight out! This is where they sleep! Get to the door!


    NICK: Somebody must have seen it.
User : provide a response assuming you're NICK
Assistant : Sure
    MAJOR HICKS: It happened so fast no one knew what hit them 'til is was over.


    COMPUTER: Systems register functional.
User : respond as COMPUTER to complete the conversation
Assistant : Sure
    GWEN: All systems are working, Commander.


    MATHESAR: A thousand apologies. We have failed you.
    JASON: You what?.. What are you talking about?
User : provide a response assuming you're MATHESAR
Assistant : Sure
    MATHESAR: We have seen you victorious in many more desperate situations. The fault must lie with us, with the ship...


    JOSEPH REED: Congress is bitterly oppos


    MAC: Did you say you were thinking about adoption?
User : provide a response assuming you're MAC
Assistant : Sure
    JUNO: Yeah, well, there's this couple who've been trying to have a baby for five years.


    CASPER: Yo, you got any weed around here?
    TELLY: Naw. But we should run by the park and get a dime. Maybe Darcy will be at the park.
    CASPER: Yo. I'm gonna get buff dude.
    TELLY: You are?
User : respond as CASPER to complete the conversation
Assistant : Sure
    CASPER: Yeah. The other day, some sort of Chinese bitch told me I'd look good with muscles.


    COLWYN: Take the western passage.
User : respond as COLWYN to complete the conversation
Assistant : Sure
    LYSSA: All directions are the same here.


    YNYR: We seek the Fortress of Krull.
    SEER: Such a vision will be opposed. Who seeks it?
    YNYR: The new King.
    SEER: With an old voice?
    YNYR: You know the voice.
User : Can you provide more dialog assuming fantasy-action movie
    SEER: Yes. Y


Here's a play between PEARL HARPER and JOHN HARPER in a scene from a drama-film-noir movie
    PEARL HARPER: Now can I tell?
    JOHN HARPER: Hm?
    PEARL HARPER: When Mr. Powell's our Daddy then I can tell him about --
User : Can you continue the play
Assistant : Sure, the next dialogue for this scene could be
    JOHN HARPER: You swore, Pearl!
    PEARL HARPER: John! Don't!
    JOHN HARPER: You promised Dad you wouldn't never tell!
 

    BULJANOFF: Yes! We could stay with Leon!
User : provide a response assuming you're BULJANOFF
Assistant : Sure
    IRANOFF: Leon, how would you like to have three lifelong friends?


    KOPALSKI: ...and Kopalski.
User : provide a response assuming you're KOPALSKI
Assistant : Sure
    NINOTCHKA: Don't make it difficult for me. This is no more a pleasure trip for me than it is for you.


Here's a play between TAXI DRIVER and NINOTCHKA in a scene from a comedy-romance movie
    TAXI DRIVER: Where to, madame?
    NINOTCHKA: Can you recommend a restaur


    AGNES: Because I'm getting fat.
    MOTHER MIRIAM: Oh, for Heaven's sake.
    AGNES: I am, there's too much flesh on me.
    MOTHER MIRIAM: Agnes...
    AGNES: I'm a blimp.
User : Can you provide more dialog assuming drama-mystery movie
    MOTHER MIRIAM: Why does it matter whether you're fat or not...
    AGNES: Because...
    MOTHER MIRIAM: ... You needn't worry about being attractive here.
    AGNES: I do, I have to be attractive to God.
    MOTHER MIRIAM: He loves you the way you are.
    AGNES: No he doesn't. He hates fat people.
    MOTHER MIRIAM: Who told you this?
    AGNES: It's a sin to be fat.
    MOTHER MIRIAM: Why?
    AGNES: Look at the statues, they're thin.
    MOTHER MIRIAM: Agnes...
    AGNES: That's because they're suffering... suffering is beautiful, I want to be beautiful.
    MOTHER MIRIAM: Who tells you these things?
    AGNES: Christ said it in the Bible, he said - suffer the little children, I want to suffer like a little child.
    MOTHER MIRIAM: That's n



    ALBERT: Damn right!
User : respond as ALBERT to complete the conversation
Assistant : Sure
    VINCE: What do you think, Sal? Jesus, you think we'd miss this?


I'm trying to complete the dialog for my characters NICK and SAL.Here's the script, Please help me complete it
    NICK: Oh cards, maybe. Poker... It's getting cold, Sal. I'm going to take you in.  We'll call Angela. The guys can help her bring you home... Did I tell you I was going on a trip?
    SAL: Trip? What do you mean, Nick? You said you'd be --
    NICK: It's okay. Hey, it's okay! Just a week. Just to see Phantom Mary.
    SAL: Phantom Mary?
Assistant : Sure
    NICK: Didn't I ever tell you about Phantom Mary?
    SAL: No.
    NICK: Well... Phantom Mary's on my mother's side. Naturally no one there admits it because Phantom Mary's pretty weird... You want to hear the whole story?
    SAL: Yeah!
    NICK: Like I say, Phantom Mary's pretty weird... Lives alone, lives way out in the middle of nowhere with a cat calle



    GRANT: If you had a transistor about this size and power output, and a thin enough wire --  -- could you piece it together?
    CORA: No, it requires such absolute precision --
    GRANT: A surgeon might...
    CORA: Oh yes, I'm sure Dr. Duval could. If we had the parts.
User : provide a response assuming you're GRANT
Assistant : Sure
    GRANT: I've got a source. All I have to do is tap it.


User : Assume JACK and TYLER are characters from a crime-drama movie, continue the conversation between them
    JACK: You see, when you travel, everything is small, self-contained--
    TYLER: The spork.  I get it.  You're very clever.
    JACK: Thank you.
    TYLER: How's that working out for you?
Assistant : Sure
    JACK: What?
    TYLER: Being clever.
    JACK: Well, uh... great.
    TYLER: Keep it up, then.  Keep it right up.


    RAYMOND: S-S-Stuff.
User : respond as RAYMOND to complete the conversation
Assistant : Sure
    TYLER: "Stuff."  Were the mid-terms hard?


    NORA: You'r


    HERBERT: No arguments there, Dr. Hulme! All that time inside working on those novels of theirs. They don't get fresh air or exercise!
User : respond as HERBERT to complete the conversation
Assistant : Sure
    HONORA: frowns at Henry.


    KIRSTY: You win! Okay pull over.
    TREVOR: But... I thought...
User : respond as KIRSTY to complete the conversation
Assistant : Sure
    KIRSTY: If what I've heard is true this could be the last time for a long long time. Besides we've got a whole seven minutes before the next one. Clock's ticking. Tick-tock...


    LIEUTENANT: Anderson, Arthurs, Boulier. What is this?
    PAUL: The guest list. It hasn't been updated since the murder of the president.
    LIEUTENANT: Are you trying to make a fool of me? There are no Europeans left in that hotel. Get me the names of all the cockroaches in there.
    PAUL: That will take time.
User : respond as LIEUTENANT to complete the conversation
Assistant : Sure
    LIEUTENANT: You don't have time. If I 


    EDWARD: Love this car!  Is it new?
User : respond as EDWARD to complete the conversation
Assistant : Sure
    STUCKEY: Yes!  And you don't even know where you're going!


    VIZZINI: Once the horse reaches the castle, the fabric will make the Prince suspect the Guilderians have abducted his love. When he finds her body dead on the Guilder frontier, his suspicions will be totally confirmed.
    FEZZIK: You never said anything about killing anyone.
    VIZZINI: I've hired you to help me start a war. That's a prestigious line of work with a long and glorious tradition.
    FEZZIK: I just don't think it's right, killing an innocent girl.
User : provide a response assuming you're VIZZINI
Assistant : Sure
    VIZZINI: Am I going mad or did the word "think" escape your lips? You were not hired for your brains, you hippopotamic land mass.


    FLORENCE: Make it snappy, Colonel.
    SERGEANT: There's an autopsy ordered at Bellevue immediately. You better skip over there.
    FLORENCE: Do


    NICHOLAS: The last few days, um... I've been thinking... had a lot of spare time.  I want to tell you... I'm starting to understand why you left me.  I've been resenting you for it, maybe, but ... I want to apologize, for all of it.  For shutting you out, for not being there.  I...  Anyway, I hope you can forgive me.
    ELIZABETH: There's nothing to forgive.
    NICHOLAS: It would mean a lot to me... if you and I could be friends.  If I could have you back in my life... in some small way.  It would be important.
User : provide a response assuming you're NICHOLAS
Assistant : Sure
    ELIZABETH: Of course, Nick...


    MARROW: What is that tune?
User : provide a response assuming you're MARROW
Assistant : Sure
    NELL: I don't know.  A lullaby I guess. My mother used to hum it to me. And her mother before that, and so on.  Hugh Crain, would you care to dance?


    JONATHAN: Jesus, what is this?
User : provide a response assuming you're JONATHAN
Assistant : Sure
    PENDERGAST: A

## Load Dataset from HF

In [11]:
dataset_ = load_dataset("shahules786/OA-cornell-movies-dialog")

Downloading readme:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Using custom data configuration shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e


Downloading and preparing dataset None/None to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/39877 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-bb4490d57ad0b94e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
indices = [int(x) for x in np.random.randint(0, 100, 10)]
for i in indices:
    print("##")
    print(dataset_["train"][i]["conversation"])

52
##

Here's a play between MICHAEL and JOEY in a scene from a comedy-romance movie
    MICHAEL: Hey.
    JOEY: Are you lost?
    MICHAEL: Nope - just came by to chat
User : Can you continue the play
Assistant : Sure, the next dialogue for this scene could be
    JOEY: We don't chat.
    MICHAEL: Well, actually, I thought I'd run an idea by you.  You know, just to see if you're interested.
    JOEY: We're not.
 
1
##

    CAMERON: Why?
    BIANCA: Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.
    CAMERON: That's a shame.
    BIANCA: Gosh, if only we could find Kat a boyfriend...
User : provide a response assuming you're CAMERON
Assistant : Sure
    CAMERON: Let me see what I can do.

52
##

Here's a play between MICHAEL and JOEY in a scene from a comedy-romance movie
    MICHAEL: Hey.
    JOEY: Are you lost?
    MICHAEL: Nope - just came by to chat
User : Can you continue the play
Assistant : Sure,