# Process the scripts of the Friends TV show to create a dialogue dataset

In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

## Read episode scripts

In [9]:
# Parse every file in episodes/ into one row per spoken line:
#   character - lower-cased speaker label (text before the first ":")
#   text      - the utterance with parenthetical stage directions removed
#   episode   - file name of the script without its extension
#   scene     - running scene counter shared across all episodes
dialogues = []

# Matches ONE "(...)" stage direction at a time. The previous greedy pattern
# r"\(.*\)" ran from the first "(" to the last ")" on the line and therefore
# also deleted any real dialogue between two stage directions.
paren_note = re.compile(r"\([^)]*\)")

# Starts at -1 so that the first "[...]" scene header yields scene 0; lines
# that appear before any scene header are stored with scene -1.
scene_id = -1
for ep_path in sorted(Path("episodes/").iterdir()):
    with open(ep_path, 'r') as file:
        for line in file:
            # Credits ("Written by ..."), stage directions and notes carry no dialogue.
            if line.casefold().startswith(("written", "(", "{")):
                continue
            # A line starting with "[" is treated as a scene header.
            if line.startswith("["):
                scene_id += 1
                continue
            character, colon, text = line.partition(":")
            if not colon:
                # Not a "Speaker: utterance" line (blank line, narration, ...).
                continue
            # Strip so that "Monica (to Ross):" and "Monica:" give the same label.
            character = paren_note.sub("", character).strip().lower()
            text = paren_note.sub("", text).strip()
            dialogues.append({
                "character": character,
                "text": text,
                "episode": ep_path.stem,
                "scene": scene_id,
            })

In [10]:
# Collect the parsed rows into a DataFrame and save a CSV snapshot of it.
df = pd.DataFrame(dialogues)
df.to_csv(path_or_buf="friends_dialog.csv", index=False)
# Alternative: reload the saved snapshot instead of re-parsing the scripts.
# df = pd.read_csv("friends_dialog.csv",)

## Create dialog pairs for one character in each scene

In [11]:
# Build (utterance, reply) pairs around a single "anchor" speaker.
# A row becomes the new anchor when nothing has been seen yet, when the anchor
# speaker talks again, or when the scene changes. Every other row is a reply
# from a different speaker in the same scene and is paired with the anchor's
# most recent utterance.
dialog_pairs = []
curr_char = None
base_ut = None
cur_scene = 0
for char, scene, text in zip(df["character"], df["scene"], df["text"]):
    is_reply = (
        curr_char is not None
        and not (char == curr_char)
        and not (scene != cur_scene)
    )
    if is_reply:
        dialog_pairs.append((base_ut, text))
    else:
        curr_char, base_ut, cur_scene = char, text, scene

## Create dialog pairs for each character in each scene (many utterances are repeated across the dataset)

In [114]:
# Build (utterance, reply) pairs from the point of view of EVERY character in
# every scene: each utterance of a character is paired with all rows that
# follow it in the same scene, up to that character's next utterance.
# The positional arithmetic below relies on df having integer index labels
# that increase by one per row (as produced by pd.DataFrame / pd.read_csv).

# First row of each scene: rows whose scene id differs from the previous row's.
scenes_start_df = df[df["scene"].ne(df["scene"].shift())]
# scene id -> index label of the first row of the NEXT scene (exclusive end).
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
scenes_end = {
    int(scene): i
    for i, scene in scenes_start_df["scene"].shift().iloc[1:].items()
}
# The last scene has no following scene start, so it ends after the final row.
# Without this entry its lookup raised KeyError and the whole scene was dropped.
if len(df):
    scenes_end[int(df["scene"].iloc[-1])] = df.index[-1] + 1

dialog_pairs = []
for (char, scene), group in df.groupby(["character", "scene"], sort=False):
    char_indices = group.index.tolist()
    char_index_set = set(char_indices)  # O(1) membership tests in the loop below
    # indices_groups[k] holds the rows spoken by others after the k-th
    # utterance of this character and before its (k+1)-th one.
    indices_groups = []
    for i in range(char_indices[0], scenes_end[scene]):
        if i in char_index_set:
            indices_groups.append([])
        else:
            indices_groups[-1].append(i)

    for char_idx, idx_group in zip(char_indices, indices_groups):
        base_ut = df.loc[[char_idx] * len(idx_group)]["text"].tolist()
        answers = df.loc[idx_group]["text"].tolist()
        dialog_pairs.extend(zip(base_ut, answers))

In [20]:
# Keep only the first pair of every run of consecutive pairs that share the
# same base utterance, so each utterance contributes a single reply.
new_dialog_pairs = [dialog_pairs[0]]
for candidate in dialog_pairs[1:]:
    last_kept_ut = new_dialog_pairs[-1][0]
    if candidate[0] != last_kept_ut:
        new_dialog_pairs.append(candidate)
# base_ut ends up as the base utterance of the last pair that was kept.
base_ut = new_dialog_pairs[-1][0]

In [21]:
# 80/20 split without shuffling: the pairs keep their original order, so the
# held-out 20% is the tail of the list.
train_d, test_d = train_test_split(
    new_dialog_pairs,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [22]:
# Halve the held-out pairs (shuffled with a fixed seed) into the final test
# and validation sets.
test_d, val_d = train_test_split(test_d, random_state=42, test_size=0.5)

In [23]:
def write_file(dialog_pairs,split):
    """Write dialog pairs to ``<split>.txt``, one ``question|answer`` per line.

    Args:
        dialog_pairs: iterable of two-item (question, answer) entries.
        split: name of the split; used as the output file name without ".txt".
    """
    out_name = f"{split}.txt"
    with open(out_name, "w") as out:
        out.writelines(
            f"{question}|{answer}\n" for question, answer in dialog_pairs
        )
# Write each split to its own file: train.txt, test.txt and val.txt.
for split_name, split_pairs in (("train", train_d), ("test", test_d), ("val", val_d)):
    write_file(split_pairs, split_name)