**MOVIE RECOMMENDER SYSTEM**

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load Dataset
# Read the movie metadata from movies.csv; the path is relative to the
# notebook's working directory.
movies_df = pd.read_csv("movies.csv")

In [None]:
# Inspect Dataset
# Preview the first rows, then print the schema summary.
display(movies_df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
movies_df.info()

In [None]:
 # Check for duplicates: count the rows that exactly repeat an earlier row.
movies_df.duplicated().sum()

In [None]:
# Handle missing values and duplicates
# NOTE(review): dropna() without `subset` removes every row that has a missing
# value in ANY column - confirm that is intended rather than restricting the
# check to the columns used further down.
# The index is reset afterwards so that row labels are contiguous again and
# agree with row positions (the similarity matrix built later is positional).
movies_df = (
    movies_df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Sanity-check the cleaned frame: report its size and preview a few rows
# instead of echoing the entire DataFrame into the notebook output.
print("Rows, columns after cleaning:", movies_df.shape)
movies_df.head()

In [None]:
# Extracting features
# read_csv returns these columns as strings holding JSON-like lists of dicts,
# so every cell must be parsed before its "name" fields can be collected.
# (Checking isinstance(x, list) on the raw strings is always False and would
# leave every movie with empty genres/keywords/cast and an "Unknown" director.)
def _parse_records(cell):
    """Return the list encoded in a JSON-like cell, or [] if it cannot be parsed."""
    if isinstance(cell, list):
        return cell  # already parsed, e.g. when this cell is re-run
    if not isinstance(cell, str):
        return []  # missing value or unexpected type
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return []
    return parsed if isinstance(parsed, list) else []


def _extract_names(cell, limit=None):
    """Collect the names found in a cell, looking only at the first `limit` entries if given."""
    names = []
    for record in _parse_records(cell)[:limit]:
        if isinstance(record, dict) and "name" in record:
            names.append(record["name"])
        elif isinstance(record, str):
            # Entry is already a plain name (column was converted earlier).
            names.append(record)
    return names


def _first_director(cell):
    """Return the name of the first crew entry whose job is "Director", else "Unknown"."""
    for record in _parse_records(cell):
        if isinstance(record, dict) and record.get("job") == "Director":
            return record.get("name", "Unknown")
    return "Unknown"


movies_df["genres"] = movies_df["genres"].apply(_extract_names)
movies_df["keywords"] = movies_df["keywords"].apply(_extract_names)
# Only the first five cast entries are kept.
movies_df["cast"] = movies_df["cast"].apply(lambda cell: _extract_names(cell, limit=5))
# The director lookup needs the job field of each crew entry, so it must run
# while "crew" still holds the raw records (before it is reduced to names).
movies_df["director"] = movies_df["crew"].apply(_first_director)

In [None]:
# Function to convert a JSON-like column value to a list of names
def convert(obj):
    """Return the "name" of every record encoded in a JSON-like cell.

    Parameters
    ----------
    obj : str | list | tuple | None
        Normally the raw cell text, e.g. '[{"id": 28, "name": "Action"}]'.
        An already-parsed list is accepted too, which makes the function safe
        to apply again to a column it has converted before.

    Returns
    -------
    list
        The names in their original order. Missing values (None / NaN) and
        other non-sequence inputs give an empty list.

    Raises
    ------
    ValueError, SyntaxError
        If `obj` is a string that is not a valid Python literal.
    """
    # Only raw text needs parsing; parsed containers are used as they are.
    if isinstance(obj, str):
        obj = ast.literal_eval(obj)
    if not isinstance(obj, (list, tuple)):
        return []

    names = []
    for item in obj:
        if isinstance(item, dict):
            names.append(item['name'])
        else:
            # Entry is already a plain name from an earlier conversion.
            names.append(item)
    return names

In [None]:
# Reduce the remaining JSON-like columns to plain lists of names.
for json_column in ('spoken_languages', 'crew'):
    movies_df[json_column] = movies_df[json_column].apply(convert)

In [None]:
# Apply function on JSON-like columns
# Each column is converted exactly once: after the first pass the cells hold
# lists of names, not the raw strings that convert() is written to parse.
movies_df['production_companies'] = movies_df['production_companies'].apply(convert)
movies_df['production_countries'] = movies_df['production_countries'].apply(convert)

In [None]:
# Save cleaned dataset for Power BI
# index=False leaves the DataFrame index out of the file. The snapshot is taken
# before the combined_features and scaling cells further down, so (assuming
# top-to-bottom execution) it reflects only the cleaning cells above.
movies_df.to_csv("cleaned_data.csv", index=False)

In [None]:
# Feature Creation: Combined Features for Recommendations
# The list-valued columns are flattened to space-separated text first, then
# everything is glued together with single spaces into one string per movie.
keyword_text = movies_df["keywords"].apply(" ".join)
cast_text = movies_df["cast"].apply(" ".join)
genre_text = movies_df["genres"].apply(" ".join)
movies_df["combined_features"] = (
    movies_df["title"]
    + " " + movies_df["director"]
    + " " + keyword_text
    + " " + cast_text
    + " " + genre_text
)

In [None]:
# Normalize Numeric Features
# The 0-1 scaled values are written to separate "<column>_scaled" columns so the
# raw budget / popularity / revenue figures remain available for the
# descriptive statistics and the dollar-labelled plots that follow.
numeric_columns = ["budget", "popularity", "revenue"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(movies_df[numeric_columns])
for position, column in enumerate(numeric_columns):
    movies_df[f"{column}_scaled"] = scaled_values[:, position]

In [None]:
# Descriptive Analysis
# Means for the money columns first, then summary statistics for ratings and runtime.
for label, column in (("Average Budget:", "budget"), ("Average Revenue:", "revenue")):
    print(label, movies_df[column].mean())

for heading, column in (
    ("Vote Average Distribution:", "vote_average"),
    ("Runtime Distribution:", "runtime"),
):
    print(heading)
    print(movies_df[column].describe())

In [None]:
# Top 10 Highest-Grossing Movies
# Rank every movie by revenue (largest first) and keep the leading ten rows.
ranked_by_revenue = movies_df.sort_values(by="revenue", ascending=False)
top_grossing = ranked_by_revenue.head(10)
print("Top 10 Highest-Grossing Movies:")
print(top_grossing.loc[:, ["title", "revenue"]])

In [None]:
# Genre and Language Analysis
# explode() gives one row per (movie, genre) pair so each genre is counted individually.
genre_counts = movies_df["genres"].explode().value_counts()
language_counts = movies_df["original_language"].value_counts()

for heading, counts in (
    ("Most Common Genres:", genre_counts),
    ("Most Common Languages:", language_counts),
):
    print(heading)
    print(counts.head(10))

In [None]:
# Visualization of Budget Distribution
# Only movies with a budget above zero are plotted.
positive_budgets = movies_df.loc[movies_df["budget"] > 0, "budget"]
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(positive_budgets, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Movie Budgets")
ax.set_xlabel("Budget ($)")
plt.show()

In [None]:
# Scatter Plot of Budget vs Revenue (both axes on a log scale)
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=movies_df, x="budget", y="revenue", alpha=0.5, ax=ax)
ax.set_title("Budget vs. Revenue")
ax.set_xlabel("Budget ($)")
ax.set_ylabel("Revenue ($)")
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()

In [None]:
# Top 10 most common Genres Visualization
top_genres = genre_counts.head(10)
fig, ax = plt.subplots(figsize=(10, 5))
top_genres.plot.bar(ax=ax, color="skyblue")
ax.set_title("Top 10 Most Common Genres")
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
# Tilt the genre labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Movie Recommendation System
# Any missing feature text is replaced by a placeholder before vectorising.
movies_df["combined_features"] = movies_df["combined_features"].fillna("Unknown")

# TF-IDF representation of the combined text, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(movies_df["combined_features"])

# Called with one matrix, cosine_similarity compares its rows against each other.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Movie recommending function
def recommend_movies(title, df, cosine_sim, top_n=5):
    """Return the titles most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df["title"]``. If the title occurs more
        than once, the first occurrence is used.
    df : pandas.DataFrame
        Frame with a "title" column whose row order matches the rows and
        columns of `cosine_sim`. Its index labels are not used.
    cosine_sim : array-like
        Square similarity matrix; entry [i, j] is the similarity between the
        movies at row positions i and j of `df`.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first, never including the queried
        row itself.

    Raises
    ------
    KeyError
        If `title` does not occur in ``df["title"]``.
    """
    # Work with row positions throughout: the similarity matrix is positional,
    # while df.index may be non-contiguous after rows were dropped.
    hits = np.flatnonzero((df["title"] == title).to_numpy())
    if hits.size == 0:
        raise KeyError(f"Title not found in dataset: {title!r}")
    query_pos = int(hits[0])

    scores = np.asarray(cosine_sim[query_pos], dtype=float).ravel()
    # Stable sort on the negated scores = descending similarity, ties kept in
    # row order.
    ranking = np.argsort(-scores, kind="stable")
    # Drop the query row explicitly instead of assuming it is ranked first
    # (an identical-feature movie could tie with it at similarity 1.0).
    ranking = ranking[ranking != query_pos][:top_n]
    return df["title"].iloc[ranking]

In [None]:
# Example Usage
# Fetch the recommendations for one title, then show them.
dark_knight_recommendations = recommend_movies("The Dark Knight", movies_df, cosine_sim)
print(dark_knight_recommendations)