# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load and Preprocess Data

Our selection of data was narrowed down by our approach to correlate movie actors and the movie lines that they spoke. We used the Cornell Movie-Dialogs Corpus, which is a collection of metadata-rich conversations extracted from raw movie scripts. 

### Load dataset - movie_lines.txt

In [6]:
# Define the path to the movie_lines.txt file
file_path = 'nlp_group_movie_dataset/movie_lines.txt'
# Initialize empty lists to store the data
lineID = []
characterID = []
movieID = []
character_name = []
text_of_utterance = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        # lineID.append(line[0])
        # characterID.append(line[1])
        movieID.append(line[2])
        character_name.append(line[3])
        text_of_utterance.append(line[4])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df1 = pd.DataFrame({'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
# Display the first 5 rows of the dataframe
df1.head()

Unnamed: 0,Movie ID,Character Name,Text of Utterance
0,m0,BIANCA,They do not!\n
1,m0,CAMERON,They do to!\n
2,m0,BIANCA,I hope so.\n
3,m0,CAMERON,She okay?\n
4,m0,BIANCA,Let's go.\n


### Load dataset - movie_characters_metadata.txt

In [7]:
# Define the path to the movie_lines.txt file
file_path = 'nlp_group_movie_dataset/movie_characters_metadata.txt'
# Initialize empty lists to store the data
#characterID = []
character_name = []
movieID = []
movie_title = []
# Read first line in the file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Split each line using ' +++$+++ ' as the delimiter
        line = line.split(' +++$+++ ')
        # Extract the fields
        # lineID.append(line[0])
        # characterID.append(line[1])
        character_name.append(line[1])
        movieID.append(line[2])
        movie_title.append(line[3])
    f.close()

# Create a dataframe from the lists
# df = pd.DataFrame({'Line ID': lineID, 'Character ID': characterID, 'Movie ID': movieID, 'Character Name': character_name, 'Text of Utterance': text_of_utterance})
df2 = pd.DataFrame({'Movie ID': movieID, 'Character Name': character_name, 'Movie Title': movie_title})
df2.head()

Unnamed: 0,Movie ID,Character Name,Movie Title
0,m0,BIANCA,10 things i hate about you
1,m0,BRUCE,10 things i hate about you
2,m0,CAMERON,10 things i hate about you
3,m0,CHASTITY,10 things i hate about you
4,m0,JOEY,10 things i hate about you


### Combine Dataset

In [16]:
# Merge the two DataFrames on 'Movie ID' and 'Character Name'
combined_df = pd.merge(df1, df2, on=['Movie ID', 'Character Name'], how='outer')
combined_df.head()

Unnamed: 0,Movie ID,Character Name,Text of Utterance,Movie Title
0,m0,BIANCA,They do not!\n,10 things i hate about you
1,m0,BIANCA,I hope so.\n,10 things i hate about you
2,m0,BIANCA,Let's go.\n,10 things i hate about you
3,m0,BIANCA,Okay -- you're gonna need to learn how to lie.\n,10 things i hate about you
4,m0,BIANCA,I'm kidding. You know how sometimes you just ...,10 things i hate about you


# EDA

In [32]:
def analyze_dataframe(df, name ):
    print("\nEDA on: {}".format(name))
    # interested columns
    columns = ['Character Name', 'Movie ID', 'Character ID']
    for column in columns:
        # check if dataframe contains a column named 'Character Name'
        if column in df.columns:        
            if column == 'Character Name':        
                # Number of unique character names
                print('Number of unique character names: {}'.format(df[column].nunique()))
            if column == 'Movie ID':
                # Number of unique movies
                print('Number of unique movies: {}'.format(df[column].nunique()))
            if column == 'Character ID':
                # Number of unique character IDs
                print('Number of unique character IDs: {}'.format(df[column].nunique()))
            
    # print('Number of rows in the dataframe: {}'.format(df.shape[0]))
    #     print('Number of unique character names: {}'.format(df['Character Name'].nunique()))
    # if df['Movie ID'].bool():    
    #     print('Number of unique movies: {}'.format(df['Movie ID'].nunique()))
    # if df['Character ID'].bool():
    #     print('Number of unique character IDs: {}'.format(df['Character ID'].nunique()))

analyze_dataframe(df1, "movie_lines.txt df")
analyze_dataframe(df2, "movie_characters_metadata.txt df")
analyze_dataframe(combined_df, "Merged dataframe")


EDA on: movie_lines.txt df
Number of unique character names: 5356
Number of unique movies: 617

EDA on: movie_characters_metadata.txt df
Number of unique character names: 5356
Number of unique movies: 617

EDA on: Merged dataframe
Number of unique character names: 5356
Number of unique movies: 617
