In [1]:
#DATA CLEANING and MERGING

In [2]:
import pandas as pd

In [3]:
# Load the dataset
movies_metadata = pd.read_csv('./data/movies_metadata.csv')

# Convert budget, popularity, revenue, runtime, vote_average, and vote_count to numeric
numeric_columns = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
for col in numeric_columns:
    movies_metadata[col] = pd.to_numeric(movies_metadata[col], errors='coerce').fillna(0)

# Convert release_date to datetime
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

# Drop unneeded columns
columns_to_drop = ['adult', 'homepage', 'poster_path', 'video']
movies_metadata.drop(columns=columns_to_drop, inplace=True)

  movies_metadata = pd.read_csv('./data/movies_metadata.csv')


In [4]:
# load credits.csv into a DataFrame
credits = pd.read_csv('./data/credits.csv')

# Convert 'id' in credits to integer
credits['id'] = pd.to_numeric(credits['id'], errors='coerce').astype('Int64')

In [5]:
# load keywords.csv into a DataFrame
keywords = pd.read_csv('./data/keywords.csv')

# Convert 'id' in keywords to integer
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce').fillna(0).astype('Int64')

In [6]:
# Convert 'id' in movies_metadata to integer
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce').fillna(0).astype('Int64')

In [7]:
# Merge
movies_metadata = movies_metadata.merge(credits, on='id', how='left')
movies_metadata = movies_metadata.merge(keywords, on='id', how='left')

In [8]:
# Save the merged DataFrame to a new CSV file
movies_metadata.to_csv('./data/cleaned_merged_movies_data.csv', index=False)

In [None]:
# MORE CLEANING AND PARSING JSON-LIKE STRING COLUMNS IN A DATAFRAME

In [9]:
import ast  # Import the Abstract Syntax Trees module

# Create DataFrame
df = pd.read_csv('./data/cleaned_merged_movies_data.csv')

# Parse the JSON-like strings in columns
def parse_column(text):
    try:
        # Convert the string to Python objects
        return ast.literal_eval(text)
    except ValueError:
        return []  # Return an empty list if there's any error
    except SyntaxError:
        return []  # Handle syntax error by returning an empty list

# Apply this function to the 'genres' column to convert from string to list
df['genres'] = df['genres'].apply(parse_column)

# Now, to extract just the names from the 'genres' column
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Preview the changes to the 'genres' column to confirm it's been cleaned
df['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [10]:
# Apply the parsing function to the 'production_companies' column
df['production_companies'] = df['production_companies'].apply(parse_column)

# Extract company names
df['production_companies'] = df['production_companies'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [11]:
# Apply the parsing function to the 'cast' column
df['cast'] = df['cast'].apply(parse_column)

# Extract names of the first few cast members, 5 for simplicity
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in x[:5]] if isinstance(x, list) else [])

In [12]:
# Apply the parsing function to the 'crew' column
df['crew'] = df['crew'].apply(parse_column)

# Extract names of directors
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in x if i['job'] == 'Director'] if isinstance(x, list) else [])

In [13]:
# Apply the parsing function to the 'keywords' column
df['keywords'] = df['keywords'].apply(parse_column)

# Extract keywords
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# WORKFLOW FOR BUILDING A CONTENT-BASED RECOMMENDATION SYSTEM

In [14]:
#Aggregate Text Features, Combine text features into one string
df['combined_features'] = df['overview'] + " " + df['genres'].apply(" ".join) + " " + df['keywords'].apply(" ".join) + " " + df['cast'].apply(" ".join) + " " + df['crew'].apply(" ".join)

In [15]:
# Fill NaN values in 'combined_features' with an empty string
df['combined_features'] = df['combined_features'].fillna('')

#TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

In [17]:
# This one will take FOREVER, but that's ok...
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel

# Further reduce the number of components
svd = TruncatedSVD(n_components=50)  # Adjusting n_components further
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Compute similarity on the reduced matrix
cosine_sim = linear_kernel(tfidf_matrix_reduced, tfidf_matrix_reduced)

In [18]:
#Recommendation Function
# Function to get movie recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df.loc[df['title'] == title].index[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [19]:
test_movie_title = "Toy Story"
recommended_movies = get_recommendations(test_movie_title)
print(f"Recommendations for {test_movie_title}:")
recommended_movies

Recommendations for Toy Story:


13189                              Bolt
22514        Mio in the Land of Faraway
34813                  Love the Coopers
2852     Home Alone 2: Lost in New York
36753                 Norm of the North
36754                 Norm of the North
3055                      Stuart Little
31438     Tom and Jerry: The Magic Ring
15066                          New York
350                     The Flintstones
Name: title, dtype: object

In [20]:
# PROGRAMMATICALLY SELECT A DIVERSE SET OF TEST MOVIES, 
#ENSURE THEY EXIST IN THE DATASET, 
#AND GENERATE RECOMMENDATIONS FOR EACH OF THEM.
#EASY-PEASY #HATERSgonnaHATE #PLAYERSgonnaPLATE

In [21]:
# Example to programmatically select a diverse set of test movies
test_movies = [
    "Toy Story",  # Animation/Comedy
    "Pulp Fiction",  # Crime/Drama
    "The Shining",  # Horror
    "Interstellar",  # Sci-Fi
    "The Grand Budapest Hotel",  # Comedy/Drama
    "Hereditary",  # Horror/Thriller
    "Moonlight",  # Drama
    "Mad Max: Fury Road",  # Action
    # Add more titles as needed to cover a wide range of genres and years
]

# Ensure all test movies are in the dataset
test_movies = [movie for movie in test_movies if movie in df['title'].values]

In [22]:
for movie in test_movies:
    print(f"Recommendations for {movie}:")
    recommendations = get_recommendations(movie)
    print(recommendations)
    print("\n" + "-"*60 + "\n")

Recommendations for Toy Story:
13189                              Bolt
22514        Mio in the Land of Faraway
34813                  Love the Coopers
2852     Home Alone 2: Lost in New York
36753                 Norm of the North
36754                 Norm of the North
3055                      Stuart Little
31438     Tom and Jerry: The Magic Ring
15066                          New York
350                     The Flintstones
Name: title, dtype: object

------------------------------------------------------------

Recommendations for Pulp Fiction:
4424                   Red Heat
1856      The French Connection
9125                     Pusher
27211    The Last of the Finest
12225         American Gangster
3924                    Traffic
23133                     Chiko
26910             Out of Bounds
865                 Bulletproof
4918            Another 48 Hrs.
Name: title, dtype: object

------------------------------------------------------------

Recommendations for The Shining:
15

In [None]:
#FIN