### Import Dependencies

In [1]:
import pandas as pd
import numpy as np  

In [2]:
# Load data set

movie_list = pd.read_csv('moviesData.csv')
movie_list

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
...,...,...,...,...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417
9997,13995,Captain America,"Action,Science Fiction,War",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",en,A man named Farmer sets out to rescue his kidn...,15.159,2007-11-29,4.7,668


### Data Cleaning

In [4]:
# Check for null values
movie_list.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [5]:
# Fill NaN values in 'Genre' with 0
movie_list['genre'] = movie_list['genre'].fillna(0)
# Convert 'genre' column to strings
movie_list['genre'] = movie_list['genre'].astype(str)
movie_list

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
...,...,...,...,...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417
9997,13995,Captain America,"Action,Science Fiction,War",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",en,A man named Farmer sets out to rescue his kidn...,15.159,2007-11-29,4.7,668


### Data Exploration

In [3]:
# Check movies info
movie_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


### Feature Engineering

In [6]:
# check columns
movie_list.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [7]:
# select features
movie_list = movie_list[['id', 'title',  'genre', 'vote_average']]
movie_list

Unnamed: 0,id,title,genre,vote_average
0,278,The Shawshank Redemption,"Drama,Crime",8.7
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",8.7
2,238,The Godfather,"Drama,Crime",8.7
3,424,Schindler's List,"Drama,History,War",8.6
4,240,The Godfather: Part II,"Drama,Crime",8.6
...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy",4.7
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",4.7
9997,13995,Captain America,"Action,Science Fiction,War",4.6
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",4.7


### Similarity Computation

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [9]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_list['genre'])

In [10]:
# Compute the cosine similarity
genre_cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

### Recommendation

In [11]:
# Funciton to Recommend movies based on the genre

def recommend_movies_genres(genre_string, genre_cosine_sim=genre_cosine_sim):
    # Convert the comma-separated string of genres to a list
    genres = [genre.strip() for genre in genre_string.split(',')]

    # Get the indices of movies that have all of the selected genres
    genre_indices = movie_list[movie_list['genre'].apply(lambda x: all(genre in x for genre in genres))].index

    # Sort movies based on 'vote_average' in descending order
    sorted_indices = movie_list.loc[genre_indices].sort_values(by='vote_average', ascending=False).index

    # Return the top 5 highest-rated movies that fall into all selected genres
    return movie_list.loc[sorted_indices[:5], ['id', 'title', 'vote_average']]


In [12]:
# Checking the built model
recommend_movies_genres('Crime')

Unnamed: 0,id,title,vote_average
0,278,The Shawshank Redemption,8.7
2,238,The Godfather,8.7
4,240,The Godfather: Part II,8.6
13,497,The Green Mile,8.5
15,155,The Dark Knight,8.5


In [13]:
recommend_movies_genres('Action,Science')

Unnamed: 0,id,title,vote_average
34,283566,Evangelion: 3.0+1.0 Thrice Upon a Time,8.4
41,18491,Neon Genesis Evangelion: The End of Evangelion,8.4
42,1891,The Empire Strikes Back,8.4
54,27205,Inception,8.4
35,324857,Spider-Man: Into the Spider-Verse,8.4


In [14]:
recommend_movies_genres('Comedy,Drama,Romance')

Unnamed: 0,id,title,vote_average
1,19404,Dilwale Dulhania Le Jayenge,8.7
20,13,Forrest Gump,8.5
57,901,City Lights,8.4
85,572154,Rascal Does Not Dream of a Dreaming Girl,8.3
92,522924,The Art of Racing in the Rain,8.3


In [15]:
recommend_movies_genres('Action,TV Movie,Science Fiction,Comedy,Adventure')

Unnamed: 0,id,title,vote_average
9905,438970,Sharknado 5: Global Swarming,4.9
9996,331446,Sharknado 3: Oh Hell No!,4.7


### Save the model

In [16]:
# Save the model
import pickle 

#movie_list
pickle.dump(movie_list, open('genre_movie_list.pkl', 'wb'))

#similarities
pickle.dump(genre_cosine_sim, open('genre_cosine_sim.pkl', 'wb'))
