In [None]:
# !pip install opendatasets
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install scikit-learn

## Importing libraries


In [3]:
# import opendatasets as op
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# op.download('https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata')

In [4]:
# Load the TMDB credits table (df) and the movies metadata table (df1).
df = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_credits.csv')
df1 = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
# Preview the first five rows of the credits table.
df.head(5)

In [None]:
# Rename the credits columns so the join key is called 'id', matching the movies table.
df.columns = ['id', 'title', 'cast', 'crew']
# Merge only once: re-running an unguarded merge would join the credits a second time and
# replace 'cast'/'crew' with suffixed columns, breaking the cells that read them.
# NOTE(review): assumes the raw movies table has no 'cast' column of its own — confirm.
if 'cast' not in df1.columns:
    df1 = df1.merge(df, on = 'id')

In [None]:
# Summary statistics for the credits frame.
df.describe()

In [None]:
# Summary statistics for the movies frame.
df1.describe()

In [None]:
# C: mean of `vote_average` across all rows of df1.
C = df1['vote_average'].mean()
C

In [None]:
# Column names, dtypes and non-null counts for df1.
df1.info()

In [None]:
# M: 90th percentile of `vote_count`, used as the vote-count threshold.
M = df1['vote_count'].quantile(0.9)
M

In [None]:
# Keep the rows whose vote count is strictly above M, as an independent copy of those rows.
qualify_movies = df1.loc[df1['vote_count'].gt(M)].copy()
qualify_movies.shape

In [None]:
## weighted rating
def weighted_rating(x, m=M, C=C):
    """Blend a movie's own rating with the overall mean rating.

    Parameters
    ----------
    x : mapping-like
        Must provide 'vote_count' and 'vote_average'.
    m : number, optional
        Vote-count weight given to the mean rating ``C``; defaults to ``M``
        as it was when this function was defined.
    C : number, optional
        Mean rating to shrink towards; defaults to the module-level ``C``
        as it was when this function was defined.

    Returns
    -------
    The weighted score ``v/(v+m) * R + m/(v+m) * C``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    # Both weights share the same denominator.
    total = votes + m
    return (votes / total * rating) + (m / total * C)

In [None]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# weighted_rating only does column arithmetic, so it is called once on the whole frame
# instead of once per row through apply(axis=1).
qualify_movies['score'] = weighted_rating(qualify_movies)
qualify_movies['score'].mean()

In [None]:
# Summary statistics for the qualifying movies.
qualify_movies.describe()

In [None]:
# Plot the five most popular movies as a bar chart.
pop = df1.sort_values('popularity', ascending=False)
# pop[['title', 'popularity']]
top_popular = pop.head(5)
fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(top_popular['title_x'], top_popular['popularity'])
# Titles are on the x axis and popularity on the y axis (the labels were previously swapped).
ax.set_xlabel('Movie')
ax.set_ylabel('Popularity')
ax.set_title('Popular Movie');


In [None]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval


def _parse_feature(value):
    """Parse a stringified Python literal; return any non-string value unchanged.

    Passing non-strings through makes this cell safe to re-run after the columns
    have already been parsed (literal_eval rejects list input).
    """
    return literal_eval(value) if isinstance(value, str) else value


features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df1[feature] = df1[feature].apply(_parse_feature)

In [None]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries, each expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list, contains
        no director, or the matching entry has no 'name'.
    """
    # Missing/malformed crew data (e.g. NaN) is treated like "no director",
    # mirroring how get_list treats non-list input.
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # Skip entries that are not dicts or lack a 'job' key instead of raising.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [None]:
def get_list(x):
    """Return the 'name' of at most the first three entries of ``x``.

    ``x`` is expected to be a list of dicts that each have a 'name' key.
    Any non-list input (missing or malformed data) yields an empty list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then truncate; slicing leaves short lists unchanged.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [None]:
# Pull the director's name out of each crew list into its own column.
df1['director'] = df1['crew'].apply(get_director)

# Replace cast, keywords and genres with the name lists returned by get_list.
features = ['cast', 'keywords', 'genres']
for feature in features:
    df1[feature] = df1[feature].apply(get_list)

In [None]:
# Preview the extracted features for the first three movies.
df1[['title_x', 'cast', 'director', 'keywords', 'genres']].head(3)

In [None]:
# Lower-case strings and remove the spaces inside them.
def clean_data(x):
    """Normalise ``x`` by removing spaces and lower-casing.

    A list is normalised element by element, a string is normalised directly,
    and any other value (e.g. a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [None]:
# Apply clean_data to every feature column, replacing each column in place.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df1[feature] = df1[feature].apply(clean_data)

In [None]:
def create_soup(x):
    """Join keywords, cast, director and genres of ``x`` into one space-separated string.

    'keywords', 'cast' and 'genres' are joined as iterables of strings;
    'director' is used as a single string.
    """
    parts = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(parts)
# Build the per-row 'soup' string.
# NOTE(review): no cell visible here reads df1['soup']; confirm whether the vectorizer was meant to use it.
df1['soup'] = df1.apply(create_soup, axis=1)

In [8]:
# Build bag-of-words count vectors from the movie overviews with scikit-learn.
# tfidf = TfidfVectorizer(stop_words='english')
count = CountVectorizer(stop_words='english')
# Missing overviews become empty strings so the vectorizer receives only strings.
df1['overview'] = df1['overview'].fillna('')
# tfidf_matrix = tfidf.fit_transform(df1['overview'])
count_matrix = count.fit_transform(df1['overview'])
# tfidf_matrix.shape
count_matrix.shape

(4803, 20978)

In [9]:
# Pairwise cosine similarity between the overview count vectors.
# Other measures (e.g. Euclidean distance or Pearson correlation) could be used instead.
# cosine_similarity_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_similarity_count = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reverse map: movie title -> row label in df1.
# Series.drop_duplicates() compares the values (the row labels, which are all distinct),
# so it never removed repeated titles. De-duplicate on the title index instead, keeping
# the first row per title, so that indices[title] is always a single label.
indices = pd.Series(df1.index, index=df1['title_x'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Preview the first rows of df1.
df1.head()

In [12]:
# Count Matrix
# df1 = df1.reset_index()
# indices = pd.Series(df1.index, index=df1['title_x'])

# Reverse map: movie title -> row label in df1.
# Merging in the credits table renames the movies' title column to 'title_x', so use
# whichever of the two columns is present instead of assuming 'title'.
indices = pd.Series(
    df1.index,
    index=df1['title'] if 'title' in df1.columns else df1['title_x'],
)
# Keep the first row for a repeated title so indices[title] is always a single label.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Preview the first two rows of the credits table.
df.head(2)

In [22]:
# Recommendation lookup (defaults to the count-vector cosine similarity matrix)
def get_recommendations(title, cosine_sim=cosine_similarity_count):
    """Return the title of the movie most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the similarity of
        movie ``i`` to every movie. Defaults to ``cosine_similarity_count``
        as it was when this function was defined.

    Returns
    -------
    str
        The title of the single best match, or an apology message when
        ``title`` is unknown or there is no other movie to recommend.
    """
    not_found = "Sorry, we don't have enough data about this movie title."

    # Guard only the title lookup, so a KeyError raised elsewhere (e.g. a missing
    # column in df1) is not misreported as an unknown title.
    try:
        idx = indices[title]
    except KeyError:
        return not_found

    # A title that occurs more than once yields several rows; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Score every other movie. The queried movie is excluded explicitly rather than
    # assuming it is sorted first (it is not when its similarity row is all zeros
    # or tied with another movie).
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    if not candidates:
        return not_found

    # max() returns the first of equally scored candidates, like a stable descending sort.
    best_position, _ = max(candidates, key=lambda pair: pair[1])

    # The title column is called 'title_x' once the credits table has been merged in.
    titles = df1['title'] if 'title' in df1.columns else df1['title_x']
    recommended_movies = titles.iloc[best_position]

    return recommended_movies

In [23]:
# Example query for a known title.
get_recommendations('The Dark Knight Rises')

'Batman Forever'

In [None]:
# Persist the similarity matrix to disk (assumes cosine_similarity_count is a NumPy array).
np.save('cosine_similarity_count.npy', cosine_similarity_count)


### Saving and reloading the similarity matrix

In [None]:
# Reload the saved similarity matrix and query with it.
# NOTE(review): "Noice" is presumably not a title in the dataset, which would exercise
# the fallback message — confirm.
cosine_similarity_count = np.load('cosine_similarity_count.npy')
recommendations = get_recommendations("Noice", cosine_sim=cosine_similarity_count)
recommendations


In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movies metadata table.
df1 = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_movies.csv')


# Bag-of-words count vectors from the overviews; missing overviews become empty strings.
count = CountVectorizer(stop_words='english')
df1['overview'] = df1['overview'].fillna('')
count_matrix = count.fit_transform(df1['overview'])


# Pairwise cosine similarity between the overview count vectors.
cosine_similarity_count = cosine_similarity(count_matrix, count_matrix)

# Reverse map: movie title -> row label in df1.
# Series.drop_duplicates() compares the values (the row labels, all distinct) and so never
# removed repeated titles. De-duplicate on the title index instead, keeping the first row.
indices = pd.Series(df1.index, index =df1['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Recommendation lookup (defaults to the count-vector cosine similarity matrix)
def get_recommendations(title, cosine_sim=cosine_similarity_count, top_n=1):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the similarity of
        movie ``i`` to every movie. Defaults to ``cosine_similarity_count``
        as it was when this function was defined.
    top_n : int, optional
        How many titles to return (default 1).

    Returns
    -------
    list of str, or str
        Up to ``top_n`` titles, most similar first. When ``title`` is unknown
        an apology message string is returned instead.
    """
    # Guard only the title lookup, so a KeyError raised elsewhere is not
    # misreported as an unknown title.
    try:
        idx = indices[title]
    except KeyError:
        return "Sorry, we don't have enough data about this movie title."

    # A title that occurs more than once yields several rows; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Score every other movie; the queried movie is excluded explicitly rather
    # than assuming it is sorted first.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep their row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]

    # Return a plain list so callers can index the result by position.
    movie_positions = [position for position, _ in sim_scores]
    recommended_movies = df1['title'].iloc[movie_positions].tolist()

    return recommended_movies
    

# cosine_similarity_count = np.load('cosine_similarity_count.npy')
recommendations = get_recommendations('The Dark Knight Rises')
# Print the first element of the result. For an unknown title the function returns its
# message string, so this would print that message's first character.
print(recommendations[0])


Batman Forever
