In [12]:
import json
import csv

# Input files of the movie corpus and the CSV produced by this cell.
# Paths are relative to the current working directory; adjust as necessary.
utterances_file = 'movie/utterances.jsonl'
speakers_file = 'movie/speakers.json'
conversations_file = 'movie/conversations.json'
output_csv_file = 'new_movie_dialogs.csv'

# Only the first MAX_UTTERANCES lines of the utterances file are exported.
MAX_UTTERANCES = 25000

# Load the speaker and conversation data into memory for quick lookup
with open(speakers_file, 'r', encoding='utf-8') as file:
    speakers_data = json.load(file)

with open(conversations_file, 'r', encoding='utf-8') as file:
    conversations_data = json.load(file)

# Open the CSV file for writing
with open(output_csv_file, mode='w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['utterance_id', 'conversation_id', 'text', 'speaker_id', 'character_name', 'gender', 'movie_id', 'movie_name', 'release_year', 'rating', 'votes', 'genre']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Stream the utterances file: one JSON object per line
    with open(utterances_file, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i >= MAX_UTTERANCES:
                break
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not valid JSON; skip it
                continue
            utterance_data = json.loads(line)

            # Look up the speaker's metadata via the speaker ID of the utterance
            speaker_id = utterance_data.get('speaker')
            speaker_info = speakers_data.get(speaker_id, {}).get('meta', {})

            # Look up the conversation/movie metadata via the conversation ID
            conversation_id = utterance_data.get('conversation_id')
            conversation_info = conversations_data.get(conversation_id, {}).get('meta', {})

            # A list of genres is joined into one comma-separated string.
            # NOTE(review): if the corpus stores genre as a plain string, joining
            # would interleave ", " between its characters, so strings are
            # passed through unchanged — confirm the actual schema.
            genre = conversation_info.get('genre') or []
            if isinstance(genre, str):
                genre_text = genre
            else:
                genre_text = ', '.join(genre)

            # Write the relevant information to the CSV ('N/A' marks missing values)
            writer.writerow({
                'utterance_id': utterance_data.get('id', 'N/A'),
                'conversation_id': conversation_id or 'N/A',
                'text': utterance_data.get('text', 'N/A'),
                'speaker_id': speaker_id or 'N/A',
                'character_name': speaker_info.get('character_name', 'N/A'),
                'gender': speaker_info.get('gender', 'N/A'),
                'movie_id': speaker_info.get('movie_idx', 'N/A'),
                'movie_name': conversation_info.get('movie_name', 'N/A'),
                'release_year': conversation_info.get('release_year', 'N/A'),
                'rating': conversation_info.get('rating', 'N/A'),
                'votes': conversation_info.get('votes', 'N/A'),
                'genre': genre_text
            })


In [13]:
import spacy
import pandas as pd

# Read the exported dialogue CSV into a DataFrame
df = pd.read_csv('new_movie_dialogs.csv')

# Instantiate the spaCy pipeline used for entity recognition
nlp = spacy.load("en_core_web_sm")

# Define a function to perform NER
def extract_entities(text):
    """Return the named entities spaCy finds in ``text``.

    Args:
        text: A dialogue line, or a missing value (NaN/None) for rows
            that have no text.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples; an empty list
        when ``text`` is missing.
    """
    # Missing text cannot be processed by the spaCy pipeline; treat it as
    # "no entities" instead of raising
    if pd.isna(text):
        return []
    # Process the text with spaCy
    doc = nlp(text)
    # Collect the surface text and label of every recognized entity
    return [(ent.text, ent.label_) for ent in doc.ents]

# Run NER on just the first 50 dialogue lines to keep the runtime short;
# rows past that get no value in the new column
df['entities'] = df['text'].iloc[:50].map(extract_entities)

# Preview the text next to the entities recognized in it
print(df[['text', 'entities']].head())

# Persist the annotated frame
df.to_csv('movie_dialogs_with_entities.csv', index=False)


           text entities
0  They do not!       []
1   They do to!       []
2    I hope so.       []
3     She okay?       []
4     Let's go.       []


In [55]:
import pandas as pd



# Collect the speakers of every conversation, in the order their rows appear in `df`.
# NOTE(review): this cell does not load `df` itself — it uses whichever frame an
# earlier cell left in scope; confirm that is the intended dataset.
interactions = df.groupby('conversation_id')['speaker_id'].apply(list)

# Nodes: every distinct speaker, in order of first appearance.
# dict.fromkeys de-duplicates with hash lookups instead of scanning a growing list.
nodes = list(dict.fromkeys(
    speaker for conversation in interactions for speaker in conversation
))

# Links: one edge from each speaker to the next speaker in the same conversation
links = [
    {'source': current, 'target': following}
    for conversation in interactions
    for current, following in zip(conversation, conversation[1:])
]

# Create DataFrames for nodes and links
nodes_df = pd.DataFrame({'Character_ID': nodes})
links_df = pd.DataFrame(links)

# Create a DataFrame with one row per conversation and its list of speakers
interactions_df = pd.DataFrame({'conversation_id': interactions.index, 'speakers': interactions})

# Save interactions to a CSV file
interactions_df.to_csv('interactions.csv', index=False)


In [51]:
# Display the per-conversation speaker lists built by the interactions cell
interactions

conversation_id
L1007                           [u0, u5, u0, u5, u0]
L101486                                 [u592, u585]
L101494                                 [u585, u595]
L101502                     [u595, u585, u595, u585]
L101507               [u585, u592, u585, u592, u585]
                             ...                    
L982                                        [u5, u0]
L984                                        [u0, u2]
L986                                  [u11, u5, u11]
L989       [u11, u5, u11, u5, u11, u5, u11, u5, u11]
L998                                  [u11, u5, u11]
Name: speaker_id, Length: 7001, dtype: object

In [24]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the sentiment intensity analyzer on top of it
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Read the exported dialogue CSV
df = pd.read_csv('new_movie_dialogs.csv')

# Define a function to get the sentiment score
def get_sentiment(text):
    """Return the VADER compound score for ``text``.

    Missing values (NaN/None) are scored as the empty string; anything
    else is converted with ``str`` before scoring.
    """
    if pd.isnull(text):
        cleaned = ''
    else:
        cleaned = str(text)
    scores = sia.polarity_scores(cleaned)
    return scores['compound']

# Score every dialogue line
df['sentiment'] = df['text'].map(get_sentiment)

# Write the scored frame to its own CSV
output_file = 'movie_dialogs_with_sentiment.csv'
df.to_csv(output_file, index=False)

# Echo the output path as the cell result
output_file

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


'movie_dialogs_with_sentiment.csv'

In [26]:
import spacy
import pandas as pd

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the dialogues together with their sentiment scores. Later cells read
# 'movie_dialogs_with_entities.csv' and use its 'sentiment' column, so the
# entity column has to be added on top of the sentiment output rather than
# the raw export (which has no 'sentiment' column).
df = pd.read_csv('movie_dialogs_with_sentiment.csv')

# Define a function to extract entities
def extract_entities(text):
    """Return ``(entity_text, entity_label)`` pairs found by spaCy in ``text``.

    Missing text (NaN/None) yields an empty list.
    """
    if not pd.isna(text):
        return [(span.text, span.label_) for span in nlp(text).ents]
    return []


# Run entity extraction over every dialogue line
df['entities'] = df['text'].map(extract_entities)

# Write the annotated frame to disk
df.to_csv('movie_dialogs_with_entities.csv', index=False)


In [28]:

import ast

df = pd.read_csv('movie_dialogs_with_entities.csv')

# Calculate dialogue count and average sentiment per character.
# NOTE(review): this needs a 'sentiment' column in the CSV loaded above —
# confirm that the cell writing that file includes the sentiment scores.
character_metrics = df.groupby('speaker_id').agg({
    'text': 'count',
    'sentiment': 'mean'
}).rename(columns={'text': 'dialogue_count', 'sentiment': 'average_sentiment'}).reset_index()


def _parse_entities(value):
    """Turn the CSV string form of an entity list back into a list ([] if unparseable)."""
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return []
    return parsed if isinstance(parsed, list) else []


# Count the most common entities per character.
# After a CSV round trip the 'entities' column holds strings such as
# "[('Bob', 'PERSON')]"; explode() leaves strings untouched, so the lists must
# be parsed first or every whole list string is counted as a single "entity".
character_entities = (
    df.assign(entities=df['entities'].apply(_parse_entities))
    .explode('entities')
    .dropna(subset=['entities'])  # rows with an empty entity list explode to NaN
    .groupby(['speaker_id', 'entities'])
    .size()
    .reset_index(name='count')
    .sort_values(['speaker_id', 'count'], ascending=[True, False])
    .groupby('speaker_id')
    .head(5)  # Top 5 entities per character
)

# Merge the metrics and entities (characters without entities keep a single row)
character_profile = character_metrics.merge(character_entities, on='speaker_id', how='left')

# Save to CSV
character_profile.to_csv('character_profiles.csv', index=False)


In [53]:
import pandas as pd
import gensim
from gensim import corpora
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Word-character tokenizer and the English stopword set used for filtering
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

# Read the exported dialogue CSV
df = pd.read_csv('new_movie_dialogs.csv')

# Preprocess the text
def preprocess(text):
    """Tokenize a dialogue line and drop English stopwords.

    Args:
        text: The dialogue text, or a missing value (NaN/None).

    Returns:
        A list of lower-cased word tokens with stopwords removed; an empty
        list when ``text`` is missing.
    """
    # str(NaN) is the literal string 'nan', which would otherwise enter the
    # vocabulary as a bogus token for every row without text
    if pd.isna(text):
        return []
    # Tokenize the lower-cased text
    tokens = tokenizer.tokenize(str(text).lower())
    # Remove stopwords
    return [token for token in tokens if token not in stop_words]

# Apply preprocessing to each document
df['processed'] = df['text'].apply(preprocess)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['processed'])

# Create a document-term matrix (one bag-of-words per document)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['processed']]

# Perform LDA
NUM_TOPICS = 5
# Fixed seed so that the topic model is initialized the same way on every run.
# NOTE(review): multi-worker training may still introduce small run-to-run
# differences — verify if exact reproducibility is required.
RANDOM_SEED = 42
ldamodel = gensim.models.LdaMulticore(
    doc_term_matrix,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)

# Assign the most relevant topic to each document
def get_dominant_topic(doc_bow):
    """Return the id of the highest-weighted topic for ``doc_bow``.

    Returns ``None`` when the model reports no topics for the document.
    """
    topic_weights = ldamodel.get_document_topics(doc_bow)
    if not topic_weights:
        return None
    # max() keeps the first of several equally weighted topics
    strongest = max(topic_weights, key=lambda pair: pair[1])
    return strongest[0]

df['dominant_topic'] = list(map(get_dominant_topic, doc_term_matrix))

# Write the frame, now carrying a dominant topic per row, to a new CSV
output_file = 'movie_dialogs_with_topics.csv'
df.to_csv(output_file, index=False)

# Show the top five words of each topic for reference
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.041*"know" + 0.022*"yeah" + 0.014*"well" + 0.010*"way" + 0.009*"sorry"')
(1, '0.028*"like" + 0.020*"right" + 0.019*"go" + 0.016*"got" + 0.015*"okay"')
(2, '0.024*"yes" + 0.019*"one" + 0.012*"get" + 0.012*"tell" + 0.012*"got"')
(3, '0.014*"know" + 0.014*"think" + 0.012*"us" + 0.011*"see" + 0.010*"like"')
(4, '0.025*"want" + 0.020*"oh" + 0.013*"good" + 0.012*"time" + 0.007*"uh"')


In [54]:
import pandas as pd

# Load the dataset with entities and sentiment.
# NOTE(review): the filter later in this cell reads both an 'entities' and a
# 'sentiment' column from this file — confirm the cell that writes it includes both.
df = pd.read_csv('movie_dialogs_with_entities.csv')

# Function to safely evaluate the string representation of lists
def safe_eval_list(s):
    """Parse the string representation of a list back into a list.

    Uses ``ast.literal_eval`` so that only Python literals are accepted;
    arbitrary expressions in the input are never executed.

    Args:
        s: A string such as ``"[('Bob', 'PERSON')]"``, or any other value
            (e.g. NaN for an empty CSV field).

    Returns:
        The parsed list, or ``[]`` when ``s`` is not a string holding a
        valid list literal.
    """
    import ast  # local import keeps this helper self-contained

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # Callers treat the result as a list; anything else counts as "no entities"
    return parsed if isinstance(parsed, list) else []

# Turn the stringified entity lists back into Python objects
df['entities'] = df['entities'].map(safe_eval_list)

# Keep only rows that have at least one entity and a non-zero sentiment score
has_entities = df['entities'].str.len() > 0
nonzero_sentiment = df['sentiment'] != 0
df = df[has_entities & nonzero_sentiment]

# Write the filtered frame to a new CSV file
cleaned_output_file = 'cleaned_movie_dialogs.csv'
df.to_csv(cleaned_output_file, index=False)
