# Libraries

In [51]:
#!pip install langchain
! pip install --upgrade openai


Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/1e/9f/385c25502f437686e4aa715969e5eaf5c2cb5e5ffa7c5cdd52f3c6ae967a/openai-0.28.1-py3-none-any.whl.metadata
  Downloading openai-0.28.1-py3-none-any.whl.metadata (11 kB)
Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.27.6
    Uninstalling openai-0.27.6:
      Successfully uninstalled openai-0.27.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandasai 0.2.11 requires openai<0.28.0,>=0.27.5, but you have openai 0.28.1 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.1


In [52]:
import pandas as pd
import json
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")


# Load and Preprocess Data

Our selection of data was narrowed down by our approach to correlate movie actors and the movie lines that they spoke. We used the Cornell Movie-Dialogs Corpus, which is a collection of metadata-rich conversations extracted from raw movie scripts. 

### Load dataset - movie_lines.txt

In [3]:
# Define the path to the movie_lines.txt file
file_path = 'nlp_group_movie_dataset/movie_lines.txt'
# Initialize empty lists to store the data
lineID = []
characterID = []
movieID = []
character_name = []
text_of_utterance = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        lineID.append(line[0])
        characterID.append(line[1])
        movieID.append(line[2])
        character_name.append(line[3])
        text_of_utterance.append(line[4])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,Line ID,Character ID,Movie ID,Character Name,Text of Utterance
0,L1045,u0,m0,BIANCA,They do not!\n
1,L1044,u2,m0,CAMERON,They do to!\n
2,L985,u0,m0,BIANCA,I hope so.\n
3,L984,u2,m0,CAMERON,She okay?\n
4,L925,u0,m0,BIANCA,Let's go.\n


### Load dataset - movie_characters_metadata.txt

In [4]:
# Define the path to the movie_lines.txt file
file_path = 'nlp_group_movie_dataset/movie_characters_metadata.txt'
# Initialize empty lists to store the data
#characterID = []
character_name = []
movieID = []
movie_title = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        # lineID.append(line[0])
        # characterID.append(line[1])
        character_name.append(line[1])
        movieID.append(line[2])
        movie_title.append(line[3])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df2 = pd.DataFrame({'Movie ID': movieID, 'Character Name': character_name, 'Movie Title': movie_title})

### Combine Dataset

In [5]:
# Merge the two DataFrames on 'Movie ID' and 'Character Name'
combined_df = pd.merge(df, df2, on=['Movie ID', 'Character Name'], how='outer')
combined_df.head()

Unnamed: 0,Line ID,Character ID,Movie ID,Character Name,Text of Utterance,Movie Title
0,L1045,u0,m0,BIANCA,They do not!\n,10 things i hate about you
1,L985,u0,m0,BIANCA,I hope so.\n,10 things i hate about you
2,L925,u0,m0,BIANCA,Let's go.\n,10 things i hate about you
3,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.\n,10 things i hate about you
4,L870,u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,10 things i hate about you


### Preprocess Data

In [19]:
def preprocess_text(text):
    # Convert words to lowercase
    text = text.lower()

    # Remove new lines
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = text.replace('\t', '')
    text = text.replace('...', '')
    text = text.replace('--', '')

    return text


# Remove unprocessed text
combined_df['process_utterance'] = combined_df['Text of Utterance'].apply(preprocess_text)
combined_df.head()

Unnamed: 0,Line ID,Character ID,Movie ID,Character Name,Text of Utterance,Movie Title,process_utterance
0,L1045,u0,m0,BIANCA,They do not!\n,10 things i hate about you,they do not!
1,L985,u0,m0,BIANCA,I hope so.\n,10 things i hate about you,i hope so.
2,L925,u0,m0,BIANCA,Let's go.\n,10 things i hate about you,let's go.
3,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.\n,10 things i hate about you,okay you're gonna need to learn how to lie.
4,L870,u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,10 things i hate about you,i'm kidding. you know how sometimes you just ...


# EDA

In [20]:
def analyze_dataframe(df, name ):
    print("\nEDA on: {}".format(name))
    # interested columns
    columns = ['Character Name', 'Movie ID', 'Character ID']
    for column in columns:
        # check if dataframe contains a column named 'Character Name'
        if column in df.columns:        
            if column == 'Character Name':        
                # Number of unique character names
                print('Number of unique character names: {}'.format(df[column].nunique()))
            if column == 'Movie ID':
                # Number of unique movies
                print('Number of unique movies: {}'.format(df[column].nunique()))
            if column == 'Character ID':
                # Number of unique character IDs
                print('Number of unique character IDs: {}'.format(df[column].nunique()))
            
analyze_dataframe(df, "movie_lines.txt df")
analyze_dataframe(df2, "movie_characters_metadata.txt df")
analyze_dataframe(combined_df, "Merged dataframe")


EDA on: movie_lines.txt df
Number of unique character names: 5356
Number of unique movies: 617
Number of unique character IDs: 9035

EDA on: movie_characters_metadata.txt df
Number of unique character names: 5356
Number of unique movies: 617

EDA on: Merged dataframe
Number of unique character names: 5356
Number of unique movies: 617
Number of unique character IDs: 9035


# Fine Tunning

### Format input data

In [21]:
def create_training_ds(df, count= 10):
    ds = []
    
    # give me 10 random samples in the processed utterance
    # dynamically insert the movie line, character name, and movie title into the format template
    sample = df.sample(count)
    for index, row in sample.iterrows():
        movie_title = row['Movie Title']
        movie_line = row['process_utterance']
        character_name = row['Character Name']

        sys_cont = "You are an chatbot that is an expert in movie line quotes. You are limited to only answer movie related questions but can pull historical, or recent data to gain more context of the movie or what possible lines were said by characters. Inputs from an example of a raw script are: movie line, character, and movie title in that order. Ex: {movie_line}, {character_name}, {movie_title}".format(movie_line=movie_line, character_name=character_name, movie_title=movie_title)
        user_cont = "Which character said this line {movie_line}, from the movie {movie_title}?".format(movie_line=movie_line, movie_title=movie_title)
        
        format_template = {"messages": [
            {"role": "system", "content": sys_cont}, 
            {"role": "user", "content": user_cont}
        ]}
        ds.append(format_template.copy())
    return ds

data = create_training_ds(combined_df, 3)
print(data) 


[{'messages': [{'role': 'system', 'content': 'You are an chatbot that is an expert in movie line quotes. You are limited to only answer movie related questions but can pull historical, or recent data to gain more context of the movie or what possible lines were said by characters. Inputs from an example of a raw script are: movie line, character, and movie title in that order. Ex: sack him?  david, what else can i do?  this business is not, repeat, not breaking even. and david  notice anything this morning?, GRIERSON, bean'}, {'role': 'user', 'content': 'Which character said this line sack him?  david, what else can i do?  this business is not, repeat, not breaking even. and david  notice anything this morning?, from the movie bean?'}]}, {'messages': [{'role': 'system', 'content': "You are an chatbot that is an expert in movie line quotes. You are limited to only answer movie related questions but can pull historical, or recent data to gain more context of the movie or what possible li

### 2

In [54]:
def create_dataset(df, count= 10):
    training_data = {
        "prompt": [],
        "completion": []
    }
    
    # give me 10 random samples in the processed utterance
    # dynamically insert the movie line, character name, and movie title into the format template
    sample = df.sample(count)
    for index, row in sample.iterrows():
        movie_title = row['Movie Title']
        movie_line = row['process_utterance']
        character_name = row['Character Name']

        sys_cont = "You are an chatbot that is an expert in movie line quotes. You are limited to only answer movie related questions but can pull historical, or recent data to gain more context of the movie or what possible lines were said by characters. Inputs from an example of a raw script are: movie line, character, and movie title in that order. Ex: {movie_line}, {character_name}, {movie_title}".format(movie_line=movie_line, character_name=character_name, movie_title=movie_title)
        user_cont = "Which character said this line {movie_line}, from the movie {movie_title}?".format(movie_line=movie_line, movie_title=movie_title)
        
        training_data["prompt"].append(sys_cont)
        training_data["completion"].append(user_cont)


    return training_data


ds2 = create_dataset(combined_df, 10)
print("training data size: ", len(ds2["prompt"]))

def dict_to_jsonl(dictionary, output_file):
    with open(output_file, 'w') as file:
        for prompt, completion in zip(dictionary["prompt"], dictionary["completion"]):
            json_obj = {"prompt": prompt, "completion": completion}
            json_line = json.dumps(json_obj)  # Convert dict to JSON string
            file.write(json_line + '\n')  # Write JSON string to file with newline
    file.close()
    return output_file


file = dict_to_jsonl(ds2, "training_data.jsonl")


training data size:  10


You can use file from the previous cell to invote the fine tune job. 

In [50]:
response = openai.File.create(
    file=open(file),
    purpose='fine-tune'
)

print(response)

{
  "bytes": 5770,
  "created_at": 1697769077,
  "filename": "file",
  "id": "file-IJWH39SoQnC9b2q24PYr7fSW",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}


Not working yet to view status, says an email will be sent to Angel probably

In [53]:
# View the status of the file
# List 10 fine-tuning jobs
openai.FineTuningJob.list(limit=10)

# Retrieve the state of a fine-tune
openai.FineTuningJob.retrieve("file-DbZPjtA8PIj6LnL33X1LAi4D")


AttributeError: module 'openai' has no attribute 'FineTuningJob'