In [None]:
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

In [None]:
# Load the two TMDB 5000 CSV exports: per-movie credits and movie metadata.
# NOTE(review): DATA_DIR is an absolute path (looks like a Colab upload
# directory) — change it here when running elsewhere.
DATA_DIR = "/content"

df_credits = pd.read_csv(f"{DATA_DIR}/tmdb_5000_credits.csv")
df_movies = pd.read_csv(f"{DATA_DIR}/tmdb_5000_movies.csv")

# Only a cell's last expression is rendered, so preview one frame here; the
# shapes and dtypes of both frames are reported in the next cell.
df_movies.head()

In [None]:
# Structural overview of both raw tables: (rows, columns) and total cell count.
print('Credit dataset shape', df_credits.shape)
print('Movie dataset shape', df_movies.shape)
print('Credit dataset size', df_credits.size)
print('Movie dataset size', df_movies.size)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("* "*100)
df_credits.info()
print("* "*100)
df_movies.info()

In [None]:
# Join each movie with its credits on the movie id instead of on 'title'.
# Titles are not guaranteed to be unique, and a title join pairs every movie
# with the credits of every other movie sharing its title (extra, mismatched
# rows).
# NOTE(review): assumes df_movies['id'] and df_credits['movie_id'] carry the
# same identifier — confirm against the CSV headers.
df = df_movies.merge(
    df_credits,
    left_on='id',
    right_on='movie_id',
    suffixes=('', '_credits'),
).drop(columns='title_credits')  # keep a single 'title' column, as before
print(df)

# info() prints its own report and returns None; do not wrap it in print().
df.info()

print(df.describe())

# Missing values, cardinality, fully duplicated rows and the column list.
print(df.isnull().sum())
print(df.nunique())
print(df.duplicated().sum())
print(df.columns)

In [None]:
from collections import Counter
def count_values(column, frame=None):
    """Count how often each ``name`` occurs in a list-of-dicts column.

    Each cell of ``column`` is expected to be the string form of a list of
    dicts that all carry a ``'name'`` key (the same format ``convert`` parses).
    The cell is parsed and every entry contributes one count for its name.
    Splitting the raw text on commas instead would count fragments of the
    serialised dicts (ids, braces, quotes) rather than the names.

    Parameters
    ----------
    column : str
        Name of the column to tally.
    frame : pandas.DataFrame, optional
        Table to read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        Mapping ``name -> number of occurrences``; missing cells are skipped.
    """
    source = df if frame is None else frame
    tally = Counter()
    for raw in source[column].dropna():
        tally.update(entry['name'] for entry in ast.literal_eval(raw))
    return dict(tally)

# One frequency table per list-valued column, most frequent value first.
genres_count = count_values('genres')
genres_df = (
    pd.DataFrame(list(genres_count.items()), columns=['Genre', 'Count'])
    .sort_values('Count', ascending=False)
)
print(genres_df)

keywords_count = count_values('keywords')
keywords_df = (
    pd.DataFrame(list(keywords_count.items()), columns=['Keyword', 'Count'])
    .sort_values('Count', ascending=False)
)
print(keywords_df)

prod_companies_count = count_values('production_companies')
prod_companies_df = (
    pd.DataFrame(list(prod_companies_count.items()), columns=['Company', 'Count'])
    .sort_values('Count', ascending=False)
)
print(prod_companies_df)

spoken_languages_count = count_values('spoken_languages')
spoken_languages_df = (
    pd.DataFrame(list(spoken_languages_count.items()), columns=['Language', 'Count'])
    .sort_values('Count', ascending=False)
)
print(spoken_languages_df)

# Cast and crew tables are cut down to their ten most frequent entries.
cast_count = count_values('cast')
cast_df = (
    pd.DataFrame(list(cast_count.items()), columns=['Cast', 'Count'])
    .sort_values('Count', ascending=False)
    .head(10)
)
print(cast_df)

crew_count = count_values('crew')
crew_df = (
    pd.DataFrame(list(crew_count.items()), columns=['Crew', 'Count'])
    .sort_values('Count', ascending=False)
    .head(10)
)
print(crew_df)

In [None]:
# 3x2 grid of horizontal bar charts: the five leading rows of each frequency
# table, filled row by row.
fig, axes = plt.subplots(3, 2, figsize=(15, 15))

panels = [
    (genres_df, 'Genre', 'Top 5 Genres'),
    (keywords_df, 'Keyword', 'Top 5 Keywords'),
    (prod_companies_df, 'Company', 'Top 5 Production Companies'),
    (spoken_languages_df, 'Language', 'Top 5 Spoken Languages'),
    (cast_df, 'Cast', 'Top 5 Cast Members'),
    (crew_df, 'Crew', 'Top 5 Crew Members'),
]

# axes.flat walks the grid in row-major order, matching the panel order above.
for ax, (table, label_col, panel_title) in zip(axes.flat, panels):
    sns.barplot(x='Count', y=label_col, data=table.head(), ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of budget against revenue, one semi-transparent point per row of df.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='budget', y='revenue', alpha=0.6, ax=ax)
ax.set_title('Budget vs Revenue')
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')
plt.show()

In [None]:
# Histogram of revenue with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['revenue'], kde=True, color='skyblue', ax=ax)
ax.set_title('Revenue Distribution')
ax.set_xlabel('Revenue')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter of average rating against number of votes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='vote_average', y='vote_count', alpha=0.6, color='purple', ax=ax)
ax.set_title('Vote Average vs Vote Count')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Flag column: 1 where 'tagline' is present, 0 where it is missing.
df['has_tagline'] = df['tagline'].notnull().astype(int)

In [None]:
# Bar chart of how many rows do / do not have a tagline.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='has_tagline', palette='viridis', ax=ax)
ax.set_title('Tagline Presence')
ax.set_xlabel('Has Tagline (1 = Yes, 0 = No)')
ax.set_ylabel('Count')
# Replace the raw 0/1 tick labels with readable ones.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()

In [None]:
# Keep only the columns the recommender uses. The explicit .copy() makes df an
# independent frame, so the in-place cleaning done later does not operate on a
# selection of the merged table (the chained-assignment situation pandas warns
# about).
df = df[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

# Only the last expression of a cell is rendered, so earlier checks are printed.
print(df.shape)
print(df.isnull().sum())
df.duplicated().sum()

In [None]:
# Same column selection as the previous cell (re-running it is harmless: the
# selected columns are already the only ones left). The .copy() guarantees df
# is an independent frame before it is modified in place further down.
df = df[['movie_id','title','overview','genres','keywords','cast','crew']].copy()
print(df.shape)
print(df.isnull().sum())
print(df.duplicated().sum())

NameError: name 'df' is not defined

In [None]:
def convert(text):
    """Parse a literal list of dicts and return each entry's ``'name'`` value.

    ``text`` is evaluated with ``ast.literal_eval``; the names are returned in
    the order the entries appear.
    """
    names = []
    for entry in ast.literal_eval(text):
        names.append(entry['name'])
    return names

def get_director(text):
    """Return ``[name]`` of the first entry whose ``'job'`` is ``'Director'``.

    ``text`` is a literal list of dicts parsed with ``ast.literal_eval``.
    Entries are scanned in order and the scan stops at the first match; an
    empty list is returned when no entry has that job.
    """
    crew_entries = ast.literal_eval(text)
    # Lazy generator: later entries are not inspected once a director is found.
    director_hits = (
        [member['name']] for member in crew_entries if member['job'] == 'Director'
    )
    return next(director_hits, [])

def remove_spaces(words):
    """Return a new list with every space character removed from each string."""
    squeezed = []
    for token in words:
        squeezed.append(token.replace(" ", ""))
    return squeezed

# Drop rows with any missing field, then renumber the rows 0..n-1.
# The similarity matrix built later is addressed by row *position*; without
# the reset, index labels keep gaps where rows were dropped and a label no
# longer equals the row's position.
df = df.dropna().reset_index(drop=True)

# genres / keywords: serialised list of dicts -> list of names.
for col in ('genres', 'keywords'):
    df[col] = df[col].apply(convert)

# cast: keep only the first three listed names.
df['cast'] = df['cast'].apply(lambda x: convert(x)[:3])

# crew: keep only the first entry whose job is 'Director' (possibly none).
df['crew'] = df['crew'].apply(get_director)

# Collapse multi-word values into single tokens ("Sam Worthington" ->
# "SamWorthington") so each one stays a single term when the tags are
# vectorised.
for col in ('cast', 'crew', 'genres', 'keywords'):
    df[col] = df[col].apply(remove_spaces)

# overview: free text -> list of whitespace-separated words.
df['overview'] = df['overview'].apply(lambda x: x.split())
df.head()

In [None]:
# Concatenate the five token lists of each row into one 'tags' list, in the
# order: overview, genres, keywords, cast, crew.
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined = df[tag_columns[0]]
for tag_col in tag_columns[1:]:
    combined = combined + df[tag_col]
df['tags'] = combined

# new_df keeps everything except the source columns just merged into 'tags'.
new_df = df.drop(columns=tag_columns)

In [None]:
# Turn each row's list of tokens into one space-separated string.
new_df['tags'] = new_df['tags'].map(" ".join)
new_df.head(2)

In [None]:
# TF-IDF representation of the tag strings: English stop words removed,
# vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tag_matrix = tfidf.fit_transform(new_df['tags'])
# Dense array form, one row per row of new_df.
vector = tag_matrix.toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vector`;
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(X=vector)

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    The title is matched exactly against ``new_df['title']``; when several
    rows share the title, the first one is used. Rows are ranked by their
    entry in the notebook-level ``similarity`` matrix, highest first, and the
    queried row itself is always excluded.

    Parameters
    ----------
    movie : str
        Title to look up.
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        Results are printed, one tab-prefixed title per line. If the title is
        not present a message is printed instead of raising.
    """
    # Work with row *positions*: `similarity` is a plain array addressed by
    # position, while new_df's index labels may have gaps (rows dropped
    # earlier), so a label must not be used as a row number.
    positions = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if positions.size == 0:
        print("\t", "No movie titled", repr(movie), "was found.")
        return
    index = positions[0]

    scores = np.asarray(similarity[index])
    # Stable descending sort; ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query explicitly instead of assuming it sorts first (another
    # row with an identical score could otherwise take its place).
    ranked = ranked[ranked != index][:top_n]

    for pos in ranked:
        print("\t", new_df['title'].iloc[pos])

In [None]:
# Ask for a title and print its recommendations. The title is matched exactly,
# so surrounding whitespace from the prompt is stripped before the lookup.
fav=input("Enter a movie name:").strip()
print("\nMovies similar to ",fav,":\n")
recommend(fav)