# Libraries

In [59]:
import pandas as pd
import random
import json

# Load dataset - movie_lines.txt

In [60]:
# Define the path to the movie_lines.txt file
file_path = 'movie_lines.txt'
# Initialize empty lists to store the data
lineID = []
characterID = []
movieID = []
character_name = []
text_of_utterance = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        # lineID.append(line[0])
        # characterID.append(line[1])
        movieID.append(line[2])
        character_name.append(line[3])
        text_of_utterance.append(line[4])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df1 = pd.DataFrame({'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})


# Load dataset - movie_characters_metadata.txt

In [61]:
# Define the path to the movie_lines.txt file
file_path = 'movie_characters_metadata.txt'
# Initialize empty lists to store the data
#characterID = []
character_name = []
movieID = []
movie_title = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        # lineID.append(line[0])
        # characterID.append(line[1])
        character_name.append(line[1])
        movieID.append(line[2])
        movie_title.append(line[3])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df2 = pd.DataFrame({'Movie ID': movieID, 'Character Name': character_name, 'Movie Title': movie_title})


# Preprocessing

In [62]:
# Display the first 5 rows of the dataframe
df1.head()

Unnamed: 0,Movie ID,Character Name,Text of Utterance
0,m0,BIANCA,They do not!\n
1,m0,CAMERON,They do to!\n
2,m0,BIANCA,I hope so.\n
3,m0,CAMERON,She okay?\n
4,m0,BIANCA,Let's go.\n


In [63]:
df2.head()

Unnamed: 0,Movie ID,Character Name,Movie Title
0,m0,BIANCA,10 things i hate about you
1,m0,BRUCE,10 things i hate about you
2,m0,CAMERON,10 things i hate about you
3,m0,CHASTITY,10 things i hate about you
4,m0,JOEY,10 things i hate about you


In [64]:
# Number of rows in the dataframe
print('Number of rows in the dataframe: {}'.format(df1.shape[0]))
# Number of unique character names
print('Number of unique character names: {}'.format(df1['Character Name'].nunique()))
# Number of unique movies
print('Number of unique movies: {}'.format(df1['Movie ID'].nunique()))
# Number of unique character IDs
# print('Number of unique character IDs: {}'.format(df['Character ID'].nunique()))

Number of rows in the dataframe: 304713
Number of unique character names: 5356
Number of unique movies: 617


In [65]:
# Merge the two DataFrames on 'Movie ID' and 'Character Name'
combined_df = pd.merge(df1, df2, on=['Movie ID', 'Character Name'], how='outer')

In [66]:
combined_df.head()

Unnamed: 0,Movie ID,Character Name,Text of Utterance,Movie Title
0,m0,BIANCA,They do not!\n,10 things i hate about you
1,m0,BIANCA,I hope so.\n,10 things i hate about you
2,m0,BIANCA,Let's go.\n,10 things i hate about you
3,m0,BIANCA,Okay -- you're gonna need to learn how to lie.\n,10 things i hate about you
4,m0,BIANCA,I'm kidding. You know how sometimes you just ...,10 things i hate about you


In [67]:
# Number of rows in the dataframe
print('Number of rows in the dataframe: {}'.format(combined_df.shape[0]))

Number of rows in the dataframe: 304713


# Prepare dataset for fine-tuning

In [68]:
# Convert the Character Name, Movie Title, and Text columns to JSON format to use with GPT-3.5
subset_df = combined_df[['Character Name', 'Movie Title', 'Text of Utterance']]
# json_data = subset_df.to_json(orient='records')

subset_df.head()

Unnamed: 0,Character Name,Movie Title,Text of Utterance
0,BIANCA,10 things i hate about you,They do not!\n
1,BIANCA,10 things i hate about you,I hope so.\n
2,BIANCA,10 things i hate about you,Let's go.\n
3,BIANCA,10 things i hate about you,Okay -- you're gonna need to learn how to lie.\n
4,BIANCA,10 things i hate about you,I'm kidding. You know how sometimes you just ...


In [69]:
# Randomly pick 50 rows from the subset_df dataframe

random.seed(123)
random_rows = random.sample(range(0, len(subset_df)), 50)
subset_df2 = subset_df.iloc[random_rows, :]
subset_df2.head()

Unnamed: 0,Character Name,Movie Title,Text of Utterance
27453,MIERZWIAK,eternal sunshine of the spotless mind,Is that Nietzsche?\n
140339,NICK,book of shadows: blair witch 2,Let's just leave it at: it was one hell of a s...
45710,BURNS,his girl friday,The insurance business?\n
213511,JD,"lock, stock and two smoking barrels",Yes?\n
139750,MIRACIA,blade,There are other ways to see. Sit.\n


In [70]:
# For the GPT-3.5 chatbot, we need to create a JSON file with the following format:
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
messages = []
# For every row in the subset_df2 dataframe, create a dictionary and append it to the messages list
for i in range(len(subset_df2)):
    messages.append({'role': 'system', 'content': 'Marv is a factual chatbot that is also a movie buff.'})
    messages.append({'role': 'user', 'content': f'What is a movie line from {subset_df2.iloc[i, 1]}'})
    messages.append({'role': 'assistant', 'content': subset_df2.iloc[i, 2]})

In [71]:
# View the messages list
# Number of items per line
items_per_line = 3
for i in range(0, len(messages), items_per_line):
    items = messages[i:i+items_per_line]
    for item in items:
        print(item)
    print() # Print a blank line

{'role': 'system', 'content': 'Marv is a factual chatbot that is also a movie buff.'}
{'role': 'user', 'content': 'What is a movie line from eternal sunshine of the spotless mind'}
{'role': 'assistant', 'content': 'Is that Nietzsche?\n'}

{'role': 'system', 'content': 'Marv is a factual chatbot that is also a movie buff.'}
{'role': 'user', 'content': 'What is a movie line from book of shadows: blair witch 2'}
{'role': 'assistant', 'content': "Let's just leave it at: it was one hell of a surprise.\n"}

{'role': 'system', 'content': 'Marv is a factual chatbot that is also a movie buff.'}
{'role': 'user', 'content': 'What is a movie line from his girl friday'}
{'role': 'assistant', 'content': 'The insurance business?\n'}

{'role': 'system', 'content': 'Marv is a factual chatbot that is also a movie buff.'}
{'role': 'user', 'content': 'What is a movie line from lock, stock and two smoking barrels'}
{'role': 'assistant', 'content': 'Yes?\n'}

{'role': 'system', 'content': 'Marv is a factual

In [72]:
with open('fine-tuning.json', 'w') as f:
    json.dump(messages, f)
    f.close()