In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [21]:
# Define the path to the movie_lines.txt file
file_path = 'movie_lines.txt'
# Literal separator placed between the fields of every record
field_delimiter = ' +++$+++ '
# Fields per record: line ID, character ID, movie ID, character name, utterance
expected_field_count = 5

# Initialize empty lists to store the data (one list per dataframe column)
lineID = []
characterID = []
movieID = []
character_name = []
text_of_utterance = []

# Read every record in the file. The `with` statement closes the file when the
# block exits, so no explicit close() call is needed. Bytes that are not valid
# UTF-8 are dropped because of errors='ignore'.
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line_number, raw_line in enumerate(f, start=1):
        # Tolerate blank lines (e.g. an empty line at the end of the file)
        if not raw_line.strip():
            continue
        # Split at most (expected_field_count - 1) times so that an utterance
        # which itself contains the delimiter stays intact in the last field
        line = raw_line.split(field_delimiter, expected_field_count - 1)
        if len(line) != expected_field_count:
            # Fail with the offending line number instead of a bare IndexError
            raise ValueError(
                '{}: line {} has {} field(s), expected {}'.format(
                    file_path, line_number, len(line), expected_field_count
                )
            )
        # Extract the fields; the utterance keeps the line's trailing newline
        lineID.append(line[0])
        characterID.append(line[1])
        movieID.append(line[2])
        character_name.append(line[3])
        text_of_utterance.append(line[4])

# Create a dataframe from the lists
df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})


In [3]:
# Display the first 5 rows of the dataframe as a quick sanity check of the
# parsed columns; as the last expression in the cell it is rendered as output
df.head()

Unnamed: 0,Line ID,Character ID,Movie ID,Character Name,Text of Utterance
0,L1045,u0,m0,BIANCA,They do not!\n
1,L1044,u2,m0,CAMERON,They do to!\n
2,L985,u0,m0,BIANCA,I hope so.\n
3,L984,u2,m0,CAMERON,She okay?\n
4,L925,u0,m0,BIANCA,Let's go.\n


In [4]:
# Dataset overview: total row count, then the number of distinct values in
# each identifying column (computed in a single nunique() pass)
distinct_counts = df[['Character Name', 'Movie ID', 'Character ID']].nunique()

# Number of rows in the dataframe
print('Number of rows in the dataframe: {}'.format(len(df)))
# Number of unique character names
print('Number of unique character names: {}'.format(distinct_counts['Character Name']))
# Number of unique movies
print('Number of unique movies: {}'.format(distinct_counts['Movie ID']))
# Number of unique character IDs
print('Number of unique character IDs: {}'.format(distinct_counts['Character ID']))


Number of rows in the dataframe: 304713
Number of unique character names: 5356
Number of unique movies: 617
Number of unique character IDs: 9035


In [5]:
# Load the pre-trained BERT tokenizer for the "bert-base-uncased" checkpoint.
# NOTE(review): only the tokenizer is created here; no model is loaded in this cell.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define tokenization function
def tokenize(batch):
    """Tokenize every text in ``batch`` individually with the global ``tokenizer``.

    Args:
        batch: Iterable of texts; each item is passed to the tokenizer on its own.

    Returns:
        A tuple ``(input_ids_list, attention_mask_list)`` of two lists holding
        one entry per text, in input order.
    """
    # One tokenizer call per text, requesting PyTorch tensors and truncation
    # at 512 tokens
    encodings = [
        tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        for text in batch
    ]
    # squeeze(0) drops dimension 0 of each result -- presumably the size-1
    # batch dimension the tokenizer adds for a single text (TODO confirm)
    input_ids_list = [encoded["input_ids"].squeeze(0) for encoded in encodings]  # Token ids
    attention_mask_list = [encoded["attention_mask"].squeeze(0) for encoded in encodings]  # Attention masks
    return input_ids_list, attention_mask_list



In [6]:
# Preview the first 10 utterances (the same slice that is tokenized in the next cell)
df['Text of Utterance'][:10]


0                                       They do not!\n
1                                        They do to!\n
2                                         I hope so.\n
3                                          She okay?\n
4                                          Let's go.\n
5                                                Wow\n
6     Okay -- you're gonna need to learn how to lie.\n
7                                                 No\n
8    I'm kidding.  You know how sometimes you just ...
9                   Like my fear of wearing pastels?\n
Name: Text of Utterance, dtype: object

In [7]:
# Tokenize the first 10 utterances; each returned list has one entry per utterance
input_ids, attention_mask = tokenize(df['Text of Utterance'][:10])

In [1]:
import openai

import os
from getpass import getpass

# Set the API key without storing it in the notebook: read it from the
# OPENAI_API_KEY environment variable and, if that is unset or empty, prompt
# for it with hidden input so the secret never appears in the saved file
openai.api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

def get_assistant_reply(user_input, model="gpt-3.5-turbo", system_prompt="You are a helpful assistant."):
    """Send a single-turn chat request and return the assistant's reply text.

    Each call builds a fresh two-message conversation (system + user); no
    earlier questions or replies are included in the request.

    Args:
        user_input: Text sent as the "user" message.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo".
        system_prompt: Text sent as the "system" message. Defaults to
            "You are a helpful assistant.".

    Returns:
        The "content" field of the first choice's message in the response.
    """
    # Create a conversation with the model using the user's question
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
    )

    # Extract the assistant's reply from the response
    return response['choices'][0]['message']['content']

# Continuous interaction loop: keeps asking for questions until the user
# types 'exit', the input stream ends, or the cell is interrupted
while True:
    # Prompt the user for a question; surrounding whitespace is discarded
    try:
        user_question = input("\nPlease ask a question (or type 'exit' to stop): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt at the prompt is treated as a request to stop
        print("Goodbye!")
        break

    # Exit condition (letter case and surrounding whitespace are ignored)
    if user_question.lower() == 'exit':
        print("Goodbye!")
        break

    # Do not spend an API call on an empty question; just prompt again
    if not user_question:
        continue

    assistant_reply = get_assistant_reply(user_question)
    print(f"Assistant: {assistant_reply}")