In [46]:
#DATA CLEANING and MERGING

In [47]:
import pandas as pd

In [48]:
# Load the dataset
movies_metadata = pd.read_csv('./data/movies_metadata.csv')

# Convert budget, popularity, revenue, runtime, vote_average, and vote_count to numeric
numeric_columns = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
for col in numeric_columns:
    movies_metadata[col] = pd.to_numeric(movies_metadata[col], errors='coerce').fillna(0)

# Convert release_date to datetime
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

# Drop unneeded columns
columns_to_drop = ['adult', 'homepage', 'poster_path', 'video']
movies_metadata.drop(columns=columns_to_drop, inplace=True)

  movies_metadata = pd.read_csv('./data/movies_metadata.csv')


In [49]:
# load credits.csv into a DataFrame
credits = pd.read_csv('./data/credits.csv')

# Convert 'id' in credits to integer
credits['id'] = pd.to_numeric(credits['id'], errors='coerce').astype('Int64')

In [50]:
# load keywords.csv into a DataFrame
keywords = pd.read_csv('./data/keywords.csv')

# Convert 'id' in keywords to integer
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce').fillna(0).astype('Int64')

In [51]:
# Convert 'id' in movies_metadata to integer
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce').fillna(0).astype('Int64')

In [52]:
# Merge
movies_metadata = movies_metadata.merge(credits, on='id', how='left')
movies_metadata = movies_metadata.merge(keywords, on='id', how='left')

In [53]:
# Save the merged DataFrame to a new CSV file
movies_metadata.to_csv('./data/cleaned_merged_movies_data.csv', index=False)

In [None]:
# GRAPHS AND INSIGHTS

In [None]:
# MORE CLEANING AND PARSING JSON-LIKE STRING COLUMNS IN A DATAFRAME

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# Corrected data dictionary variable name
KG_data = {
    'Movie': ['Corpse Bride', 'The Lord of the Rings: The Fellowship of the Ring', 'Edward Scissorhands'],
    'Director': ['Tim Burton', 'Peter Jackson', 'Tim Burton'],
    'Cast': [['Johnny Depp', 'Christopher Lee'], ['Christopher Lee', 'Elijah Wood'], ['Johnny Depp', 'Dianne Wiest']],
    'Genre': ['Romance', 'Fantasy', 'Fantasy']
}

# Convert the corrected data into a DataFrame
df = pd.DataFrame(KG_data)

# Create a graph
G = nx.Graph()

# Add nodes with the node attribute "type" and edges
for index, row in df.iterrows():
    G.add_node(row['Movie'], type='Movie')
    G.add_node(row['Director'], type='Director')
    for actor in row['Cast']:
        G.add_node(actor, type='Actor')
    G.add_node(row['Genre'], type='Genre')

    # Add edges
    G.add_edge(row['Movie'], row['Director'])
    for actor in row['Cast']:
        G.add_edge(row['Movie'], actor)
    G.add_edge(row['Movie'], row['Genre'])

# Visualization
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  
nx.draw(G, pos, with_labels=True, node_size=2000, node_color="lightblue", font_size=10, font_weight="bold")
plt.title("Movie Knowledge Graph")
plt.show()


In [None]:
# Assuming the CSV file is located in the './data/' directory
file_path = './data/cleaned_merged_movies_data.csv'
# Read the CSV file into a DataFrame
cleaned_merged_movies_data = pd.read_csv(file_path)
# Display the DataFrame
cleaned_merged_movies_data


In [None]:
import pandas as pd

# Assuming 'release_date' is in the format 'YYYY-MM-DD' and your DataFrame is cleaned_merged_movies_data
# Convert 'release_date' to datetime format
cleaned_merged_movies_data['release_date'] = pd.to_datetime(cleaned_merged_movies_data['release_date'])

# Extract year from 'release_date'
cleaned_merged_movies_data['year'] = cleaned_merged_movies_data['release_date'].dt.year

# Convert year to decade
cleaned_merged_movies_data['decade'] = (cleaned_merged_movies_data['year'] // 15) * 15


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure this DataFrame exists and has the columns 'decade' and 'vote_average'
# This is just an example; replace 'decade' and 'vote_average' with your actual column names
plt.figure(figsize=(10, 6))
sns.violinplot(x='decade', y='vote_average', data=cleaned_merged_movies_data)

plt.title('Violin plot of Average Vote by Decade')
plt.xlabel('Decade')
plt.ylabel('Average Vote')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets

In [None]:
# Load the CSV file into a DataFrame
tmdb = pd.read_csv('./data/tmdb_5000_movies.csv')

# Display the first few rows of the DataFrame to verify it's loaded correctly
tmdb.head()

In [None]:
# Load the CSV file into a DataFrame
df = pd.read_csv('data/tmdb_5000_movies.csv')

# Define a function to clean up the "genres" column
def clean_genres(genres_str):
    genres_list = genres_str.strip('[]').replace('"', '').split(', ')
    return [genre.strip() for genre in genres_list]

# Apply the clean_genres function to the "genres" column
df['genres'] = df['genres'].apply(clean_genres)

# Convert 'release_date' to datetime data type
df['release_date'] = pd.to_datetime(df['release_date'])

# Group by release date and genre, and calculate aggregate revenue
genre_revenue_over_time = df.explode('genres').groupby(['release_date', 'genres'])['revenue'].sum().reset_index()

# Sort DataFrame by release date
genre_revenue_over_time = genre_revenue_over_time.sort_values(by='release_date')

# Create line chart using Plotly
fig = px.line(
    genre_revenue_over_time,
    x='release_date',
    y='revenue',
    color='genres',
    title='Aggregate Genre Revenue Over Time',
    labels={'release_date': 'Release Date', 'revenue': 'Aggregate Revenue', 'genres': 'Genre'},
)

fig.show()

In [None]:
# Selecting columns containing text data
text_columns = ['genres']

# Concatenate text data from selected columns
text_data = tmdb[text_columns].apply(lambda x: ' '.join(x.dropna()), axis=1)

# Combine all text into a single string
text_combined = ' '.join(text_data)

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_combined)

# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Movie Data Word Cloud', fontsize=20)
plt.axis('off')
plt.show()


In [None]:
import pandas as pd
import plotly.express as px

# Load the CSV file into a DataFrame
df = pd.read_csv('data/tmdb_5000_movies.csv')

# Define a function to clean up the "genres" column
def clean_genres(genres_str):
    genres_list = genres_str.strip('[]').replace('"', '').split(', ')
    return [genre.strip() for genre in genres_list]

# Apply the clean_genres function to the "genres" column
df['genres'] = df['genres'].apply(clean_genres)

# Convert 'release_date' to datetime data type
df['release_date'] = pd.to_datetime(df['release_date'])

# Calculate profitability for each genre over time
profitability_by_genre_over_time = df.explode('genres').groupby(['release_date', 'genres'])[['revenue', 'budget']].sum().reset_index()
profitability_by_genre_over_time['profit'] = profitability_by_genre_over_time['revenue'] - profitability_by_genre_over_time['budget']

# Find the least profitable genre for each time period
least_profitable_genre_over_time = profitability_by_genre_over_time.loc[profitability_by_genre_over_time.groupby('release_date')['profit'].idxmin()]

# Sort DataFrame by release date
least_profitable_genre_over_time = least_profitable_genre_over_time.sort_values(by='release_date')

# Create line chart using Plotly
fig = px.line(
    least_profitable_genre_over_time,
    x='release_date',
    y='profit',
    color='genres',
    title='Least Profitable Genre Over Time',
    labels={'release_date': 'Release Date', 'profit': 'Profit', 'genres': 'Genre'},
)

fig.show()

In [None]:
# Assuming the CSV file is located in the './data/' directory
file_path = './data/cleaned_merged_movies_data.csv'

# Read the CSV file into a DataFrame
cleaned_merged_movies_data = pd.read_csv(file_path)



# Display the DataFrame
cleaned_merged_movies_data

In [None]:
# Selecting relevant columns
columns = ['genres', 'runtime', 'popularity']

# Creating a dropdown menu for movie selection
dropdown_options = [{'label': movie, 'value': movie} for movie in cleaned_merged_movies_data['original_title']]

# Define the layout of the radar chart
layout = go.Layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 10]  # Adjust the range as needed
        )),
    showlegend=True
)

# Function to generate radar chart based on selected movie
def generate_radar_chart(selected_movie):
    movie_data = cleaned_merged_movies_data[cleaned_merged_movies_data['original_title'] == selected_movie]
    fig = go.Figure(data=go.Scatterpolar(
        r=movie_data[columns].values.flatten().tolist(),
        theta=columns*2,  # Repeating columns to close the radar chart
        fill='toself',
        name=selected_movie
    ), layout=layout)
    fig.show()

# Creating the dropdown menu
dropdown = go.FigureWidget([
    go.Dropdown(
        id='movie-dropdown',
        options=dropdown_options,
        value=cleaned_merged_movies_data['original_title'].iloc[0],  # Default value
        description='Movie:',
        disabled=False
    )
])

# Display the radar chart based on the selected movie
def response(change):
    generate_radar_chart(change.new)

dropdown.data[0].on_change(response, 'value')
dropdown


In [None]:
# Handle missing values
cleaned_merged_movies_data.dropna(inplace=True)

# Selecting relevant quantitative variables
quantitative_variables = ['budget', 'revenue','runtime']

# Subsetting the DataFrame with only the quantitative variables
movies_subset = cleaned_merged_movies_data[quantitative_variables]

# Creating pairplot
sns.pairplot(movies_subset)
plt.show()

In [None]:
# Selecting relevant quantitative variables
quantitative_variables = ['budget', 'revenue', 'runtime', 'popularity', 'vote_count']

# Subsetting the DataFrame with only the quantitative variables
movies_subset = cleaned_merged_movies_data[quantitative_variables]

# Calculate mean values for each variable
mean_values = movies_subset.mean()

# Number of variables
num_vars = len(quantitative_variables)

# Create a DataFrame to store mean values
stats = pd.DataFrame({
    'mean': mean_values
})

# Create a list of variable names
labels = quantitative_variables

# Number of variables
num_vars = len(labels)

# Compute angle for each axis
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

# The plot is circular, so we need to "complete the loop" and append the start value to the end.
stats = np.concatenate((stats['mean'].values,[stats['mean'].values[0]]))
angles += angles[:1]

# Plot
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
ax.fill(angles, stats, color='red', alpha=0.25)
ax.plot(angles, stats, color='red', linewidth=2)

# Fill area with color
ax.fill(angles, stats, color='red', alpha=0.25)

# Labels for each point
ax.set_yticklabels([])

# Draw one axe per variable and add labels
plt.xticks(angles[:-1], labels)

# Show the plot
plt.show()


In [56]:
# MORE CLEANING AND PARSING JSON-LIKE STRING COLUMNS IN A DATAFRAME

In [57]:
import ast  # Import the Abstract Syntax Trees module

# Create DataFrame
df = pd.read_csv('./data/cleaned_merged_movies_data.csv')

# Parse the JSON-like strings in columns
def parse_column(text):
    try:
        # Convert the string to Python objects
        return ast.literal_eval(text)
    except ValueError:
        return []  # Return an empty list if there's any error
    except SyntaxError:
        return []  # Handle syntax error by returning an empty list

# Apply this function to the 'genres' column to convert from string to list
df['genres'] = df['genres'].apply(parse_column)

# Now, to extract just the names from the 'genres' column
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Preview the changes to the 'genres' column to confirm it's been cleaned
df['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [58]:
# Apply the parsing function to the 'production_companies' column
df['production_companies'] = df['production_companies'].apply(parse_column)

# Extract company names
df['production_companies'] = df['production_companies'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [59]:
# Apply the parsing function to the 'cast' column
df['cast'] = df['cast'].apply(parse_column)

# Extract names of the first few cast members, 5 for simplicity
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in x[:5]] if isinstance(x, list) else [])

In [60]:
# Apply the parsing function to the 'crew' column
df['crew'] = df['crew'].apply(parse_column)

# Extract names of directors
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in x if i['job'] == 'Director'] if isinstance(x, list) else [])

In [61]:
# Apply the parsing function to the 'keywords' column
df['keywords'] = df['keywords'].apply(parse_column)

# Extract keywords
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [62]:
# WORKFLOW FOR BUILDING A CONTENT-BASED RECOMMENDATION SYSTEM

In [18]:
#Aggregate Text Features, Combine text features into one string
df['combined_features'] = df['overview'] + " " + df['genres'].apply(" ".join) + " " + df['keywords'].apply(" ".join) + " " + df['cast'].apply(" ".join) + " " + df['crew'].apply(" ".join)

In [19]:
# Fill NaN values in 'combined_features' with an empty string
df['combined_features'] = df['combined_features'].fillna('')

#TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

In [21]:
# This one will take FOREVER, but that's ok...
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel

# Further reduce the number of components
svd = TruncatedSVD(n_components=50)  # Adjusting n_components further
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Compute similarity on the reduced matrix
cosine_sim = linear_kernel(tfidf_matrix_reduced, tfidf_matrix_reduced)

In [35]:
#Recommendation Function
# Function to get movie recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df.loc[df['title'] == title].index[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [37]:
# PROGRAMMATICALLY SELECT A DIVERSE SET OF TEST MOVIES, 
#ENSURE THEY EXIST IN THE DATASET, 
#AND GENERATE RECOMMENDATIONS FOR EACH OF THEM.
#EASY-PEASY #HATERSgonnaHATE #PLAYERSgonnaPLATE

In [38]:
# Example to programmatically select a diverse set of test movies
test_movies = [
    "Toy Story",  # Animation/Comedy
    "Pulp Fiction",  # Crime/Drama
    "The Shining",  # Horror
    "Interstellar",  # Sci-Fi
    "The Grand Budapest Hotel",  # Comedy/Drama
    "Hereditary",  # Horror/Thriller
    "Moonlight",  # Drama
    "Mad Max: Fury Road",  # Action
    # Add more titles as needed to cover a wide range of genres and years
]

# Ensure all test movies are in the dataset
test_movies = [movie for movie in test_movies if movie in df['title'].values]

In [39]:
for movie in test_movies:
    print(f"Recommendations for {movie}:")
    recommendations = get_recommendations(movie)
    print(recommendations)
    print("\n" + "-"*60 + "\n")

Recommendations for Toy Story:
13189                              Bolt
3055                      Stuart Little
2852     Home Alone 2: Lost in New York
22514        Mio in the Land of Faraway
34813                  Love the Coopers
350                     The Flintstones
24369                       The Captive
36753                 Norm of the North
36754                 Norm of the North
31438     Tom and Jerry: The Magic Ring
Name: title, dtype: object

------------------------------------------------------------

Recommendations for Pulp Fiction:
1856      The French Connection
26910             Out of Bounds
3924                    Traffic
9125                     Pusher
23133                     Chiko
27211    The Last of the Finest
9514                 Layer Cake
156                    Clockers
4667               Training Day
451                       Fresh
Name: title, dtype: object

------------------------------------------------------------

Recommendations for The Shining:
58

In [None]:
# Here is the fun part! Type in a movie to replace Troy. Be sure to 
# leave the quotation marks. If our database has your movie title, and you
# have spelled it exaclty as our database has it, buckle up for some 
# awesome recomendations!! 

In [75]:
test_movie_title = "Troy"
recommended_movies = get_recommendations(test_movie_title)
print(f"Recommendations for {test_movie_title}:")
recommended_movies

Recommendations for Troy:


14545                               Brothers
13320                               Defiance
12678                                Shelter
44949                       Het Bombardement
32120            Everything's Gonna Be Great
43949                         The Birch Wood
32290                        Brothers of War
1081                A Streetcar Named Desire
18372                  Don't Worry, I'm Fine
35305    How to Win at Checkers (Every Time)
Name: title, dtype: object

In [40]:
#FIN