## Imports

In [15]:
from datasets import load_dataset
import numpy as np
import json

## Dialog templates
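Prompt templates for converting movie dialogs into User/Assistant-style conversations, grouped by the number of lines in the dialog (two, three, or four or more).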

In [5]:
# Prompt templates keyed by dialog length. choose_prompt() picks one template at
# random from the list that matches the number of lines, and convert_to_prompts()
# fills the placeholders with str.format():
#   {char1}, {char2}           - names of the first and second speaker
#   {genre}                    - up to two movie genres joined with "-"
#   {template}                 - one of "dialog", "script" or "play"
#   {dialogue1}, {dialogue2}   - opening part of the dialog and its continuation
# The four-space indent in front of {dialogue1}/{dialogue2} matches the
# "\n    " separator get_dialogs() uses when joining several lines.
# Whitespace inside the triple-quoted strings is part of the emitted prompt.
DIALOG_TEMPLATES = {

    # Templates for dialogs with four or more lines.
    "four_more_lines" : [
"""
Here's a {template} between {char1} and {char2} in a scene from a {genre} movie
    {dialogue1}
User : Can you continue the {template}
Assistant : Sure, the next dialogue for this scene could be
    {dialogue2}
 """,

"""
    {dialogue1}
User : Can you provide more dialog assuming {genre} movie
    {dialogue2}
""",

"""
I'm trying to complete the dialog for my characters {char1} and {char2}.Here's the {template}, Please help me complete it
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
    ],

    # Template for two-line dialogs: {dialogue1} is the first speaker's line and
    # {dialogue2} is the reply.

    "two_lines" : [
"""
User : Assume {char1} and {char2} are characters from a {genre} movie, what would {char2} say if 
    {dialogue1}
Assistant : {char2} could say
    {dialogue2}
""",
],

    # Templates for three-line dialogs: {dialogue1} holds the first two lines and
    # {dialogue2} the third.
    "three_lines" : [
"""
    {dialogue1}
User : provide a response assuming you're {char1}
Assistant : Sure
    {dialogue2}
""",

"""
    {dialogue1}
User : respond as {char1} to complete the conversation
Assistant : Sure
    {dialogue2}
""",
    ]



}

## Code

In [6]:
def get_movie_dialogs(path="/home/shahul/Data/movie_lines.txt"):
    """Load a movie-lines file into a dict keyed by line ID.

    Every non-blank line of the file is decoded as Latin-1 and split on the
    ``+++$+++`` separator. Field 0 becomes the key (the other functions in
    this notebook look lines up as ``"L<number>"``), field 1 the character ID,
    field 3 the character name and the last field the spoken text.

    Args:
        path: Location of the movie-lines file. Defaults to the path the
            notebook was originally written against, so existing
            zero-argument calls keep working.

    Returns:
        dict mapping line ID to a dict with the keys ``"characterID"``,
        ``"characterName"`` and ``"text"``; every value has surrounding
        whitespace stripped.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than four fields.
    """
    movie_dialog_dict = {}

    # Binary mode + explicit decode keeps line splitting on "\n" only, and
    # iterating the handle avoids materialising the whole file in memory.
    with open(path, 'rb') as f:
        for raw_line in f:
            # A blank (e.g. trailing) line has no fields to index; skip it
            # instead of failing with IndexError.
            if not raw_line.strip():
                continue

            fields = raw_line.decode("latin").split("+++$+++")
            movie_dialog_dict[fields[0].strip()] = {
                "characterID": fields[1].strip(),
                "characterName": fields[3].strip(),
                'text': fields[-1].strip(),
            }

    return movie_dialog_dict

In [24]:
def get_dialogs(dialog_dict,start,end):
    """Split the lines ``L<start>`` .. ``L<end>`` into an opening and a continuation.

    Each line is rendered as ``"<characterName>: <text>"``. Lines that end up
    in the same part are joined with a newline followed by four spaces.

    Args:
        dialog_dict: mapping from ``"L<number>"`` to a dict holding at least
            ``"characterName"`` and ``"text"``.
        start: number of the first line (inclusive).
        end: number of the last line (inclusive).

    Returns:
        ``(dialog1, dialog2)``: the opening part and the remaining part.

    Raises:
        KeyError: if a line number in the range is missing from ``dialog_dict``.
        AssertionError: if the range covers fewer than two lines.
    """
    separator = "\n    "

    entries = (dialog_dict[f'L{line_no}'] for line_no in range(start, end + 1))
    utterances = [entry['characterName'] + ": " + entry['text'] for entry in entries]
    total = len(utterances)

    assert total > 1, "Number of lines should be greater than one"

    # Decide how many leading lines form the opening part.
    if total == 2:
        cut = 1
    elif total in (3, 4):
        cut = 2
    else:
        # Random cut of at least two lines; the upper bound is exclusive.
        cut = np.random.randint(2, (total + 1) // 2)

    opening = separator.join(utterances[:cut])
    continuation = separator.join(utterances[cut:])
    return opening, continuation
    
    
def choose_prompt(num_lines):
    """Pick a random prompt template suited to a dialog of ``num_lines`` lines.

    Two-line and three-line dialogs have dedicated template groups; every
    other length uses the ``"four_more_lines"`` group.

    Args:
        num_lines: number of lines in the dialog; must be greater than one.

    Returns:
        One template drawn with ``np.random.choice`` from the matching
        ``DIALOG_TEMPLATES`` group.

    Raises:
        AssertionError: if ``num_lines`` is not greater than one.
    """
    assert num_lines > 1, "Number of lines should be greater than one"

    group_by_length = {2: "two_lines", 3: "three_lines"}
    group = group_by_length.get(num_lines, "four_more_lines")
    return np.random.choice(DIALOG_TEMPLATES[group])
    

def convert_to_prompts(dataset,movie_dialog_dict,output_dir=".",split = "train",
                       num_samples=10,num_preview=5):
    """Render conversations from ``dataset[split]`` as prompts in a JSONL file.

    For each selected record the conversation's line IDs, genres and speaker
    names are combined with a randomly chosen template, and the formatted
    prompt is written as one ``{"conversation": <prompt>}`` JSON object per
    line to ``<output_dir>/<split>.jsonl``.

    Args:
        dataset: mapping from split name to an indexable, sized collection of
            records. Each record is read via ``record['utterance']['LineID']``
            (IDs of the form ``"L<number>"``) and ``record['movieGenres']``.
        movie_dialog_dict: line lookup as returned by ``get_movie_dialogs``.
        output_dir: directory the JSONL file is written to.
        split: name of the split to convert; also used as the file stem.
        num_samples: maximum number of records to convert, counted from the
            start of the split. ``None`` converts the whole split. Defaults
            to 10, the number the function previously had hard-coded.
        num_preview: how many of the first prompts are also printed.

    Raises:
        KeyError: if ``split`` is missing or a line ID is not in
            ``movie_dialog_dict``.
        IndexError: if a conversation has fewer than two line IDs.
    """
    records = dataset[split]
    available = len(records)
    # Clamp to the split size so short splits no longer raise IndexError.
    limit = available if num_samples is None else min(num_samples, available)

    with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:

        for i in range(limit):
            data = records[i]

            # "L194" -> 194
            lineids = [int(lineid[1:]) for lineid in data['utterance']['LineID']]
            num_lines = len(lineids)

            genre = "-".join(data['movieGenres'][:2])
            template = np.random.choice(['dialog','script','play'])
            char1 = movie_dialog_dict[f'L{lineids[0]}']['characterName']
            char2 = movie_dialog_dict[f'L{lineids[1]}']['characterName']

            # NOTE(review): the dialog text is built from every line number
            # between the first and last ID, while the template is chosen from
            # len(lineids); the two agree only when the IDs are consecutive
            # and ascending - confirm against the dataset.
            dialogue1,dialogue2 = get_dialogs(movie_dialog_dict,lineids[0],lineids[-1])
            prompt = choose_prompt(num_lines)
            prompt = prompt.format(char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template)

            if i < num_preview:
                print("##")
                print(prompt)

            output.write(f"{json.dumps({'conversation': prompt})}\n")


        

In [19]:
# Build the line-ID -> {characterID, characterName, text} lookup from the local
# movie-lines file.
movie_dialog_dict = get_movie_dialogs()
# Load the "cornell_movie_dialog" dataset through the `datasets` library.
dataset = load_dataset("cornell_movie_dialog")

Found cached dataset cornell_movie_dialog (/home/shahul/.cache/huggingface/datasets/cornell_movie_dialog/default/0.1.0/b67b3433cf894b551cddcd82efdff0826f39b39a11d5c149e746a546a8dc85f3)


  0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Convert the first records of the default "train" split, write them to
# ./train.jsonl and print the first few prompts for inspection.
convert_to_prompts(dataset,movie_dialog_dict)

##

Here's a play between BIANCA and CAMERON in a scene from a comedy-romance movie
    BIANCA: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
    CAMERON: Well, I thought we'd start with pronunciation, if that's okay with you.
User : Can you continue the play
Assistant : Sure, the next dialogue for this scene could be
    BIANCA: Not the hacking and gagging and spitting part.  Please.
    CAMERON: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
 
##

User : Assume BIANCA and CAMERON are characters from a comedy-romance movie, what would CAMERON say if 
    BIANCA: You're asking me out.  That's so cute. What's your name again?
Assistant : CAMERON could say
    CAMERON: Forget it.

##

I'm trying to complete the dialog for my characters BIANCA and CAMERON.Here's the play, Please help me complete it
    BIANCA: No, no, it's my fault -- we didn't have a proper introduction ---
