In [None]:
!pip install numpy
!pip install pandas
!pip install kagglehub
!pip install requests

In [None]:
import numpy as np
import pandas as pd
import kagglehub

In [None]:
# Fetch the 'tmdb/tmdb-movie-metadata' dataset through kagglehub.
# The returned location is reused below to build the CSV file names.
path = kagglehub.dataset_download('tmdb/tmdb-movie-metadata')
print('Path to dataset files = ', path)

In [None]:
# df1 <- credits file, df2 <- movies file; both live under the download path.
df1 = pd.read_csv(f'{path}/tmdb_5000_credits.csv')
df2 = pd.read_csv(f'{path}/tmdb_5000_movies.csv')

In [None]:
# Preview the first three rows of the movies table loaded from tmdb_5000_movies.csv.
df2.head(3)

In [None]:
# Relabel the credits columns positionally (this presumes the credits file has
# exactly these four columns in this order - TODO confirm), then join the
# credits onto the movies using the shared 'id' column.
df1.columns = ['id', 'title_1', 'cast', 'crew']
df2 = pd.merge(df2, df1, on='id')
df2.head()

In [None]:
# C: mean of 'vote_average' over every movie in df2.
C = df2['vote_average'].mean()
print(C)

# m: 90th percentile of 'vote_count'; used below as the minimum number of votes.
m = df2['vote_count'].quantile(q=0.9)
print(m)

In [None]:
# Keep only the movies whose vote count reaches the threshold m, as an
# independent copy so that adding columns later does not touch df2.
q_movie = df2.loc[df2['vote_count'].ge(m)].copy()
q_movie.shape

In [None]:
def weithted_rating(x, m=m, C=C):
    """Blend a movie's own average rating with the global mean rating.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by 'vote_count' and 'vote_average' (for example a
        DataFrame row).
    m : number, optional
        Vote-count weight; defaults to the notebook-level ``m`` as it was
        when this function was defined.
    C : number, optional
        Prior rating; defaults to the notebook-level ``C`` as it was when
        this function was defined.

    Returns
    -------
    v/(v+m) * R + m/(m+v) * C, where v is the vote count and R the vote average.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_share = votes / (votes + m) * rating
    prior_share = m / (m + votes) * C
    return own_share + prior_share

# weithted_rating only indexes 'vote_count' / 'vote_average' and does plain
# arithmetic on them, so it can be evaluated on the whole frame in one
# vectorised call; this yields the same values as a row-by-row
# apply(axis=1) without the per-row Python overhead.
q_movie['score'] = weithted_rating(q_movie)

In [None]:
# Show the first five values of the newly computed 'score' column.
q_movie['score'].head()

In [None]:
# Order the qualified movies from highest to lowest score and list the top ten.
q_movie = q_movie.sort_values(by='score', ascending=False)
q_movie.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(10)

In [None]:
import matplotlib.pyplot as plt

# All movies ordered by the 'popularity' column, most popular first.
pop = df2.sort_values(by='popularity', ascending=False)

# Horizontal bar chart of the six most popular titles.
plt.figure(figsize=(12, 4))
plt.barh(
    pop['title'].iloc[:6],
    pop['popularity'].iloc[:6],
    align='center',
    color='skyblue',
)
# Flip the y axis so the first (most popular) title is drawn at the top.
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")

In [None]:
# Peek at the first five plot overviews; this text feeds the TF-IDF model below.
df2['overview'].head(5)

In [None]:
!pip install scikit-learn

In [None]:
# TF-IDF vectoriser from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so every row holds text
df2['overview'] = df2['overview'].fillna(value='')

# Vectoriser that discards English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term TF-IDF matrix in one step
tfidf_matrix = tfidf.fit_transform(raw_documents=df2['overview'])

# (number of overviews, number of distinct terms)
tfidf_matrix.shape

In [None]:
# linear_kernel computes plain dot products between the rows of its two inputs
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every overview with every other overview
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [None]:
# Reverse map: movie title -> row label in df2.
# Series.drop_duplicates() removes repeated *values* (the row labels, which
# are already unique), so it never removed repeated titles. Drop repeated
# index entries instead, keeping the first occurrence, so that
# indices[title] is always a single scalar.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the notebook-level ``indices`` map.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``df2``. Defaults to the ``cosine_sim`` matrix that existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles taken from ``df2['title']``, most similar first, never
        including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices:
        raise KeyError(f'No movie titled {title!r} in the dataset')

    idx = indices[title]
    # Titles are not guaranteed to be unique: a label lookup can then return
    # several positions. Use the first match instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending order: ties keep their ascending row order, exactly as
    # sorted(..., reverse=True) on the enumerated scores would.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried movie explicitly. Assuming it is always ranked first
    # breaks when another movie ties with it (or when its row is all zeros).
    ranked = ranked[ranked != idx][:top_n]

    # Titles of the most similar movies
    return df2['title'].iloc[ranked]

In [None]:
# Recommendations for 'Star Wars' using the default (overview TF-IDF) similarity matrix.
get_recommendations('Star Wars')

In [None]:
# Recommendations for 'The Avengers' using the default (overview TF-IDF) similarity matrix.
get_recommendations('The Avengers')

In [None]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # Only strings are parsed. Values that are already Python objects (for
    # example after this cell has been run once) or missing values are left
    # untouched, so re-running the cell does not raise.
    df2[feature] = df2[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
# Get the director's name from the crew feature. If no director is listed, return NaN
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : list or tuple of dict
        Crew entries; each entry is expected to carry 'job' and 'name' keys.

    Returns
    -------
    The director's name, or ``np.nan`` when ``x`` is not a list/tuple, when no
    entry has the job 'Director', or when the matching entry has no 'name'.
    Malformed input is tolerated in the same spirit as ``get_list``.
    """
    # Missing/malformed crew data (e.g. NaN) cannot be iterated; treat as "no director".
    if not isinstance(x, (list, tuple)):
        return np.nan
    return next(
        (
            member.get('name', np.nan)
            for member in x
            if isinstance(member, dict) and member.get('job') == 'Director'
        ),
        np.nan,
    )

In [None]:
# Returns the names of the first three elements, or of all elements when there are three or fewer.
def get_list(x):
    """Extract up to three 'name' values from a list of dicts.

    Returns an empty list when ``x`` is not a list (missing/malformed data).
    """
    # Anything that is not a list counts as missing/malformed data
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep at most the leading three
    return [entry['name'] for entry in x][:3]

In [None]:
# Put cast, keywords and genres into a suitable form (lists of at most three
# names), then derive the new 'director' column from the untouched 'crew' column.
features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

df2['director'] = df2['crew'].apply(get_director)

In [None]:
# Lower-case a string (or every string in a list) and remove its spaces
def clean_data(x):
    """Normalise names so that e.g. 'Tom Hanks' becomes 'tomhanks'.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (such as a missing director) yields an empty string.
    """
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither list nor string: nothing usable, e.g. the director is missing
    return ''

In [None]:
# Apply clean_data to each feature column, replacing the column with its
# lower-cased, space-free version.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [None]:
def create_soup(x):
    """Concatenate keywords, cast, director and genres into one space-separated string.

    ``x`` must provide list values under 'keywords', 'cast' and 'genres' and a
    string under 'director'.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the 'soup' string for every movie; axis=1 hands each row to create_soup.
df2['soup'] = df2.apply(create_soup, axis=1)

In [None]:
# CountVectorizer produces raw term counts
from sklearn.feature_extraction.text import CountVectorizer

# Count the terms of every movie's soup, ignoring English stop words
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(raw_documents=df2['soup'])

In [None]:
# Cosine similarity between every pair of rows of count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(X=count_matrix, Y=count_matrix)

In [None]:
# Reset index of our main DataFrame and construct the reverse mapping
# (movie title -> row label) again.
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])
# Titles can repeat; keep only the first occurrence of each so that
# indices[title] always yields a single scalar rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Recommendations for 'The Dark Knight Rises' using the metadata-based matrix
# cosine_sim2 (keywords, cast, director, genres) instead of the default.
get_recommendations('The Dark Knight Rises', cosine_sim2)

In [None]:
import requests
import json
import tkinter as tk

import os
from getpass import getpass

# Credentials must not be stored in the notebook: read the TMDB API key from
# the TMDB_API_KEY environment variable, and prompt for it (without echo)
# when the variable is not set.
api_key = os.environ.get('TMDB_API_KEY') or getpass('TMDB API key: ')

# Main window
root = tk.Tk()
root.title('The Movie DataBase')
root.geometry('500x500')

# Labelled entry in which the user types a movie ID
label_movie = tk.Label(root, text='Movie ID')
label_movie.pack(pady=(15, 5))
entry_movie = tk.Entry(root, width=25)
entry_movie.pack()

# Output area: a label bound to show_variable, so setting the variable
# updates the text shown in the window
show_variable = tk.StringVar()
show_area = tk.Label(
    root,
    textvariable=show_variable,
    font=('Helvetica', 12),
    bg='lightyellow',
    wraplength=380,
    justify='left',
)
show_area.pack(fill='both', expand=True, padx=10, pady=20)

def print_movie():
    """Write the current content of the Movie ID entry to stdout, followed by a 30-dash rule."""
    print('Moive ID = ', entry_movie.get())
    print('-' * 30)

# 'Print' button wired to print_movie
button_print = tk.Button(root, command=print_movie, text='Print')
button_print.pack(pady=15)

def print_recommendations():
    """Look up the entered movie ID on TMDB and show recommendations for its title.

    Reads the ID from ``entry_movie``, requests the movie from the TMDB API,
    prints the returned title to stdout and writes the result of
    ``get_recommendations(title, cosine_sim2)`` into ``show_variable``.
    Invalid input, network/HTTP failures and titles missing from the local
    dataset are reported in the output area instead of raising inside the
    Tk callback.
    """
    movie_id = entry_movie.get().strip()
    # The value is interpolated into the URL path, so accept digits only.
    if not movie_id.isdigit():
        show_variable.set('Please enter a numeric movie ID.')
        return

    try:
        response = requests.get(
            f'https://api.themoviedb.org/3/movie/{movie_id}',
            # Pass the key as a query parameter rather than building it into the URL string.
            params={'api_key': api_key},
            # Without a timeout a stalled connection would freeze the GUI indefinitely.
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # The exception text is deliberately not displayed: it can contain
        # the full request URL, including the API key.
        show_variable.set(f'Could not fetch movie {movie_id} from TMDB.')
        return

    title_name = data.get('title')
    print ('Title Name = ', title_name)
    print ('-' * 30)

    try:
        recommendations = get_recommendations(title_name, cosine_sim2)
    except KeyError:
        show_variable.set(f'{title_name!r} is not in the local dataset.')
        return

    show_variable.set(str(recommendations))

button_revenue = tk.Button(root, text='Recommendations', command=print_recommendations)
button_revenue.pack(pady=15)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()