### Import EDA Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

### Collection of Movie Data

In [2]:
df = pd.DataFrame()

In [3]:
df

In [4]:
import requests
for i in range(1, 401):
    response = requests.get('https://api.themoviedb.org/3/movie/top_rated?api_key=[Enter Your Api Key]&language=en-US&page={}'.format(i))
    movies = pd.DataFrame(response.json()['results'])[['id', 'original_title', 'title', 'overview', 'genre_ids', 'original_language', 'release_date', 'popularity', 'adult', 'video', 'vote_average', 'vote_count']]
    movie = df.append(movies, ignore_index=True)

In [5]:
movie.head()

Unnamed: 0,id,original_title,title,overview,genre_ids,original_language,release_date,popularity,adult,video,vote_average,vote_count
0,43930,"Tomorrow, When the War Began","Tomorrow, When the War Began","Ellie Linton, a teen from an Australian coasta...","[28, 12, 18]",en,2010-08-08,12.537,False,False,6.2,466
1,22327,Youth in Revolt,Youth in Revolt,"As a fan of Albert Camus and Jean-Luc Godard, ...","[35, 10749]",en,2009-09-11,10.775,False,False,6.2,822
2,10739,Anything Else,Anything Else,"Jerry Falk, an aspiring writer in New York, fa...","[18, 35, 10749]",en,2003-08-27,10.145,False,False,6.2,551
3,9457,Deep Rising,Deep Rising,A group of heavily armed hijackers board a lux...,"[12, 28, 27, 878]",en,1998-01-30,22.224,False,False,6.2,609
4,1727,Bird on a Wire,Bird on a Wire,An FBI informant has kept his new identity sec...,"[28, 12, 35, 10749]",en,1990-05-18,12.59,False,False,6.2,828


### Feature Engineering

In [6]:
genre_ids  = [28, 12, 16, 35, 80, 99, 18, 10751, 14, 36, 27, 10402, 9648, 10749, 878, 10770, 53, 10752, 37]
genre_names = [
    "Action",
    "Adventure",
    "Animation",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Family",
    "Fantasy",
    "History",
    "Horror",
    "Music",
    "Mystery",
    "Romance",
    "Science Fiction",
    "TV Movie",
    "Thriller",
    "War",
    "Western"
]

genre_mapping = dict(zip(genre_ids, genre_names))

In [7]:
movie['genre_ids']

0            [28, 12, 18]
1             [35, 10749]
2         [18, 35, 10749]
3       [12, 28, 27, 878]
4     [28, 12, 35, 10749]
5     [28, 12, 53, 10752]
6                    [18]
7                    [28]
8           [878, 53, 27]
9     [10751, 35, 16, 12]
10        [35, 14, 10749]
11       [28, 36, 12, 18]
12           [28, 53, 18]
13            [16, 10751]
14               [35, 80]
15            [35, 10749]
16               [14, 28]
17               [27, 53]
18               [18, 53]
19        [18, 10749, 35]
Name: genre_ids, dtype: object

In [8]:
genre_mapping

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [9]:
def convert_genre_ids_to_names(genre_ids):
    return [genre_mapping[genre_id] for genre_id in genre_ids]

In [10]:
movie['genre_name'] = movie['genre_ids'].apply(convert_genre_ids_to_names)

In [11]:
movie['original_language'].unique()

array(['en', 'fr', 'ko'], dtype=object)

movie = pd.read_csv('movie.csv')
column = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2']
movie = movie.drop(columns=column)

In [12]:
ori_language = ['en', 'hi', 'ja', 'ko', 'it', 'es', 'zh', 'pt', 'ru', 'fr', 'tr', 
                'sv', 'hu', 'ar', 'de', 'cn', 'da', 'pl', 'bn', 'nl', 'fa', 'th',
                'te', 'sr', 'sh', 'et', 'id', 'cs', 'no', 'uk', 'ro', 'gl', 'ga',
                'el', 'fi', 'bs', 'hy', 'xx', 'is', 'ml', 'la', 'tn', 'eu', 'nb',
                'he', 'km']

language = ['English', 'Hindi', 'Japanese', 'Korean', 'Italian', 'Spanish; Castilian', 
            'Chinese', 'Portuguese', 'Russian', 'French', 'Turkish','Swedish',
            'Hungarian','Arabic','German', 'Cn', 'Danish', 'Polish', 'Bengali', 
            'Dutch; Flemish', 'Persian', 'Thai','Telugu','Serbian','Sh', 
            'Estonian', 'Indonesian', 'Czech', 'Norwegian', 'Ukrainian', 'Romanian', 
            'Galician', 'Irish','Greek, Modern(1453-)','Finnish','Bosnian', 
            'Armenian', 'Xhosa','Icelandic', 'Malayalam', 'Latin', 
            'Tswana', 'Basque', 'Nogai','Hebrew','Central Khmer']

language_mapping = dict(zip(ori_language, language))

In [13]:
language_mapping

{'en': 'English',
 'hi': 'Hindi',
 'ja': 'Japanese',
 'ko': 'Korean',
 'it': 'Italian',
 'es': 'Spanish; Castilian',
 'zh': 'Chinese',
 'pt': 'Portuguese',
 'ru': 'Russian',
 'fr': 'French',
 'tr': 'Turkish',
 'sv': 'Swedish',
 'hu': 'Hungarian',
 'ar': 'Arabic',
 'de': 'German',
 'cn': 'Cn',
 'da': 'Danish',
 'pl': 'Polish',
 'bn': 'Bengali',
 'nl': 'Dutch; Flemish',
 'fa': 'Persian',
 'th': 'Thai',
 'te': 'Telugu',
 'sr': 'Serbian',
 'sh': 'Sh',
 'et': 'Estonian',
 'id': 'Indonesian',
 'cs': 'Czech',
 'no': 'Norwegian',
 'uk': 'Ukrainian',
 'ro': 'Romanian',
 'gl': 'Galician',
 'ga': 'Irish',
 'el': 'Greek, Modern(1453-)',
 'fi': 'Finnish',
 'bs': 'Bosnian',
 'hy': 'Armenian',
 'xx': 'Xhosa',
 'is': 'Icelandic',
 'ml': 'Malayalam',
 'la': 'Latin',
 'tn': 'Tswana',
 'eu': 'Basque',
 'nb': 'Nogai',
 'he': 'Hebrew',
 'km': 'Central Khmer'}

In [14]:
movie['language'] = ""

In [15]:
for index, row in movie.iterrows():
    original_language = row["original_language"]
    if original_language in language_mapping:
        movie.at[index, "language"] = language_mapping[original_language]

In [16]:
columns_to_drop = ['original_title', 'genre_ids', 'original_language', 'video']
movie = movie.drop(columns=columns_to_drop, axis=1)

In [17]:
movie.head()

Unnamed: 0,id,title,overview,release_date,popularity,adult,vote_average,vote_count,genre_name,language
0,43930,"Tomorrow, When the War Began","Ellie Linton, a teen from an Australian coasta...",2010-08-08,12.537,False,6.2,466,"[Action, Adventure, Drama]",English
1,22327,Youth in Revolt,"As a fan of Albert Camus and Jean-Luc Godard, ...",2009-09-11,10.775,False,6.2,822,"[Comedy, Romance]",English
2,10739,Anything Else,"Jerry Falk, an aspiring writer in New York, fa...",2003-08-27,10.145,False,6.2,551,"[Drama, Comedy, Romance]",English
3,9457,Deep Rising,A group of heavily armed hijackers board a lux...,1998-01-30,22.224,False,6.2,609,"[Adventure, Action, Horror, Science Fiction]",English
4,1727,Bird on a Wire,An FBI informant has kept his new identity sec...,1990-05-18,12.59,False,6.2,828,"[Action, Adventure, Comedy, Romance]",English


In [18]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            20 non-null     int64  
 1   title         20 non-null     object 
 2   overview      20 non-null     object 
 3   release_date  20 non-null     object 
 4   popularity    20 non-null     float64
 5   adult         20 non-null     bool   
 6   vote_average  20 non-null     float64
 7   vote_count    20 non-null     int64  
 8   genre_name    20 non-null     object 
 9   language      20 non-null     object 
dtypes: bool(1), float64(2), int64(2), object(5)
memory usage: 1.6+ KB


In [19]:
movie.isnull().sum()

id              0
title           0
overview        0
release_date    0
popularity      0
adult           0
vote_average    0
vote_count      0
genre_name      0
language        0
dtype: int64

In [20]:
movie['year'] = pd.to_datetime(movie['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [21]:
movie.head()

Unnamed: 0,id,title,overview,release_date,popularity,adult,vote_average,vote_count,genre_name,language,year
0,43930,"Tomorrow, When the War Began","Ellie Linton, a teen from an Australian coasta...",2010-08-08,12.537,False,6.2,466,"[Action, Adventure, Drama]",English,2010
1,22327,Youth in Revolt,"As a fan of Albert Camus and Jean-Luc Godard, ...",2009-09-11,10.775,False,6.2,822,"[Comedy, Romance]",English,2009
2,10739,Anything Else,"Jerry Falk, an aspiring writer in New York, fa...",2003-08-27,10.145,False,6.2,551,"[Drama, Comedy, Romance]",English,2003
3,9457,Deep Rising,A group of heavily armed hijackers board a lux...,1998-01-30,22.224,False,6.2,609,"[Adventure, Action, Horror, Science Fiction]",English,1998
4,1727,Bird on a Wire,An FBI informant has kept his new identity sec...,1990-05-18,12.59,False,6.2,828,"[Action, Adventure, Comedy, Romance]",English,1990


### Weighted Rating

$$Weighted Rating(WR) = (\frac {v} {v+m}.R) + (\frac {m} {v+m}.C) $$

where,
  - v is the number of votes for the movie.
  - m is the minimum votes required to be listed in the chart.
  - R is the average rating of the movie.
  - C is the mean vote across the whole report.

In [22]:
C = movie['vote_average'].mean()
m = movie['vote_count'].quantile(0.98)

In [23]:
def rating(mv):
    v = mv['vote_count']
    R = mv['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [24]:
movie['weighted_rating'] = movie.apply(rating, axis=1).round(2)

In [25]:
movie = movie.sort_values('weighted_rating', ascending=False)

In [26]:
movie.head()

Unnamed: 0,id,title,overview,release_date,popularity,adult,vote_average,vote_count,genre_name,language,year,weighted_rating
0,43930,"Tomorrow, When the War Began","Ellie Linton, a teen from an Australian coasta...",2010-08-08,12.537,False,6.2,466,"[Action, Adventure, Drama]",English,2010,6.2
1,22327,Youth in Revolt,"As a fan of Albert Camus and Jean-Luc Godard, ...",2009-09-11,10.775,False,6.2,822,"[Comedy, Romance]",English,2009,6.2
18,13200,Red,"Avery, a reclusive older man, has a best frien...",2008-08-08,12.674,False,6.2,230,"[Drama, Thriller]",English,2008,6.2
17,13474,P2,A businesswoman finds herself locked with a un...,2007-11-09,13.752,False,6.2,745,"[Horror, Thriller]",English,2007,6.2
16,452015,Psychokinesis,An ordinary guy suddenly finds he has superpow...,2018-01-31,11.142,False,6.2,264,"[Fantasy, Action]",Korean,2018,6.2


## Build a recommendation model with data

###  Genre recommendation system.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
#from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# Combine genre and title into a single column
movie['combined_gen_tle'] = movie.apply(lambda x: ' '.join(x['genre_name']) + ' ' + x['title'], axis=1)

In [32]:
# Create a TF-IDF vectorizer to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the combined_features column
tfidf_matrix = tfidf_vectorizer.fit_transform(movie['combined_gen_tle'])

In [33]:
# Calculate cosine similarity between movies
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [39]:
movie_title = 'Red'
# Find the index of the user's movie
movie_index = movie[movie['title'] == movie_title].index[0]

In [40]:
# Get the pairwise similarity scores for the user's movie
similarity_scores = list(enumerate(cosine_sim[movie_index]))
# Sort movies by similsarity scores in descending order
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
# Extract movie indices from the sorted list
movie_indices = [i[0] for i in similarity_scores]

In [41]:
# Display top_N recommendations
top_N = 5
top_recommendations = movie['title'].iloc[movie_indices[1:top_N+1]]
print(top_recommendations)

19     St. Elmo's Fire
10           Teen Wolf
1      Youth in Revolt
4       Bird on a Wire
15    Run, Fatboy, Run
Name: title, dtype: object


### Overview recommendation model

In [42]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie['overview'].fillna(""))

In [43]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [44]:
def get_recommendations(title, num_recommendations=10):
    idx = movie.index[movie['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]
    return movie['title'].iloc[movie_indices]

In [46]:
movie_name = "Tomorrow, When the War Began"
recommendations = get_recommendations(movie_name)
print(recommendations)

9            Early Man
19     St. Elmo's Fire
8                  Tau
6             No Limit
7             Vendetta
1      Youth in Revolt
18                 Red
17                  P2
16       Psychokinesis
15    Run, Fatboy, Run
Name: title, dtype: object


# Movie Recommendation System

## Data Collection
- Choosing a movie data API like IMDB, TMDB OR OMDB.
- Retrieve movie data from TMDB by using the API.

In [35]:
import requests

API_KEY = "API KEY"
BASE_URL = "https://api.themoviedb.org/3"

def get_movie_data(movie_name):
    search_url = f"{BASE_URL}/search/movie"
    params = {
        "api_key": API_KEY,
        "query": movie_name
    }
    response = requests.get(search_url, params=params)
    data = response.json()
    return data["results"]

## Data Preprocessing

- Clean and preprocess the retrieved data to extract relevant information.

In [36]:
def preprocess_data(movie_data):
    processed_data = []
    for movie in movie_data:
        processed_data.append({
            "title": movie["title"],
            "overview": movie["overview"],
            "release_date": movie["release_date"],
            "rating": movie["vote_average"]
        })
    return processed_data     

## Recommendation Algorithm

- Implement a recommendation algorithm.
- One simple approach is to recommend movies based on user ratings or movie genres.

In [37]:
def recommend_movies(user_preferences, movie_data, num_recommendations=10):
    recommended_movies = [movie for movie in movie_data if movie["rating"] >= user_preferences["min_rating"]]
    recommended_movies = sorted(recommended_movies, key=lambda x: x["rating"], reverse=True)[:num_recommendations]
    return recommended_movies

## User Interaction

- Interact with the user to gather their preferences and display recommended movies.

In [38]:
def main():
    movie_name = input("Enter a movie name --> ")
    movie_data = get_movie_data(movie_name)
    processed_data = preprocess_data(movie_data)
    
    user_preferences = {
        "min_rating": 5.0
    }
    
    recommended_movies = recommend_movies(user_preferences, processed_data)
    
    print("Recommended Movies:")
    for movie in recommended_movies:
        print(f"Title: {movie['title']}")
        print(f"Overview: {movie['overview']}")
        print(f"Release Date: {movie['release_date']}")
        print(f"Rating: {movie['rating']}")
        print("===")
        
if __name__ == "__main__":
    main()

Enter a movie name --> Harry Potter
Recommended Movies:
Title: Harry Potter and the Deathly Hallows: Part 2
Overview: Harry, Ron and Hermione continue their quest to vanquish the evil Voldemort once and for all. Just as things begin to look hopeless for the young wizards, Harry discovers a trio of magical objects that endow him with powers to rival Voldemort's formidable skills.
Release Date: 2011-07-07
Rating: 8.1
===
Title: Harry Potter and the Prisoner of Azkaban
Overview: Year three at Hogwarts means new fun and challenges as Harry learns the delicate art of approaching a Hippogriff, transforming shape-shifting Boggarts into hilarity and even turning back time. But the term also brings danger: soul-sucking Dementors hover over the school, an ally of the accursed He-Who-Cannot-Be-Named lurks within the castle walls, and fearsome wizard Sirius Black escapes Azkaban. And Harry will confront them all.
Release Date: 2004-05-31
Rating: 8.0
===
Title: Harry Potter and the Philosopher's St