# Libraries 

In [1]:
import pandas as pd
import numpy as np
# `plt` conventionally names the pyplot interface; the top-level `matplotlib`
# package has no plotting functions such as `plot`, so alias pyplot instead.
import matplotlib.pyplot as plt
import seaborn as sns

# EDA

In [2]:
# Load the raw TMDB export; every later frame is derived from this one.
main_df = pd.read_csv(filepath_or_buffer='TMDB_movie_dataset_v11.csv')

In [3]:
# Keep only the rows whose vote_average is not 0.
has_rating = main_df['vote_average'].ne(0)
df = main_df[has_rating]

In [4]:
# Renumber the rows; the previous labels are kept as an 'index' column.
df = df.reset_index()

In [5]:
# (rows, columns) after filtering out zero-rated rows and resetting the index.
df.shape

(348623, 25)

In [6]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,index,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


## Feature Selection

In [7]:
# List every available column before choosing which ones to keep.
df.columns

Index(['index', 'id', 'title', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path',
       'budget', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [8]:
# Columns that are not used as features for the recommender.
unused_columns = [
    'index', 'id', 'vote_count', 'status', 'release_date', 'revenue',
    'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_title',
    'overview', 'poster_path', 'tagline', 'production_companies',
    'production_countries', 'spoken_languages', 'keywords',
]
df = df.drop(columns=unused_columns)

In [9]:
# Preview the frame with only the retained columns.
df.head()

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,genres
0,Inception,8.364,148,False,en,83.952,"Action, Science Fiction, Adventure"
1,Interstellar,8.417,169,False,en,140.241,"Adventure, Drama, Science Fiction"
2,The Dark Knight,8.512,152,False,en,130.643,"Drama, Action, Crime, Thriller"
3,Avatar,7.573,162,False,en,79.932,"Action, Adventure, Fantasy, Science Fiction"
4,The Avengers,7.71,143,False,en,98.082,"Science Fiction, Action, Adventure"


In [10]:
# Keep an untouched copy of the title; 'title' itself is normalised later on.
df = df.assign(org_title=df['title'])

In [11]:
# Count missing values per column.
df.isna().sum()

title                    0
vote_average             0
runtime                  0
adult                    0
original_language        0
popularity               0
genres               58860
org_title                0
dtype: int64

In [12]:
# Missing genres become the explicit label 'unknown'.
df = df.fillna({'genres': 'unknown'})

In [13]:
# Re-check missing values after filling 'genres'.
df.isna().sum()

title                0
vote_average         0
runtime              0
adult                0
original_language    0
popularity           0
genres               0
org_title            0
dtype: int64

In [14]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

39

In [15]:
# Keep the first occurrence of each fully identical row.
df = df[~df.duplicated()]

In [16]:
# Confirm that no fully duplicated rows remain.
df.duplicated().sum()

0

In [17]:
# Independent copy used as the input of the genre encoding below.
dff = df.copy(deep=True)

## MultiLabel Encoder

In [18]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

# Turn each comma-separated genre string into a list of raw labels,
# held in a one-column frame named 'genres'.
genre_l = dff['genres'].str.split(',').to_frame()

In [19]:
# Show the per-movie genre lists.
genre_l

Unnamed: 0,genres
0,"[Action, Science Fiction, Adventure]"
1,"[Adventure, Drama, Science Fiction]"
2,"[Drama, Action, Crime, Thriller]"
3,"[Action, Adventure, Fantasy, Science Fiction]"
4,"[Science Fiction, Action, Adventure]"
...,...
348618,[unknown]
348619,"[Mystery, Romance]"
348620,[unknown]
348621,[unknown]


In [20]:
# Normalise every label: trim, lower-case and remove inner spaces
# (e.g. ' Science Fiction' -> 'sciencefiction').
genre_l['genres'] = genre_l['genres'].map(
    lambda labels: [label.strip().lower().replace(' ', '') for label in labels]
)

In [21]:
# Binarise the genre lists into one 0/1 indicator column per distinct label.
MLB = MultiLabelBinarizer()

genre_encoded = MLB.fit_transform(genre_l['genres'])



In [22]:
# Show the resulting indicator matrix.
genre_encoded

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
# Label the indicator columns with the fitted genre names; reset_index adds
# an 'index' column that is dropped after the concat.
genre_encoded_df = (
    pd.DataFrame(genre_encoded, columns=MLB.classes_)
    .reset_index()
)

In [24]:
# Remove the raw genre text and renumber the rows so they line up
# positionally with the encoded genre frame.
mod_df = dff.drop(columns=['genres']).reset_index()

In [25]:
# Join the two frames side by side and discard the helper 'index' column(s).
df = (
    pd.concat([mod_df, genre_encoded_df], axis='columns')
    .drop(columns='index')
)

In [26]:
# Preview the frame with the genre indicator columns attached.
df.head()

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,org_title,action,adventure,animation,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
0,Inception,8.364,148,False,en,83.952,Inception,1,1,0,...,0,0,0,0,1,0,0,0,0,0
1,Interstellar,8.417,169,False,en,140.241,Interstellar,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,The Dark Knight,8.512,152,False,en,130.643,The Dark Knight,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,Avatar,7.573,162,False,en,79.932,Avatar,1,1,0,...,0,0,0,0,1,0,0,0,0,0
4,The Avengers,7.71,143,False,en,98.082,The Avengers,1,1,0,...,0,0,0,0,1,0,0,0,0,0


## Feature Engineering

In [27]:
# Normalise the lookup columns: trim, lower-case and remove all spaces.
for text_col in ('title', 'original_language'):
    df[text_col] = (
        df[text_col]
        .str.strip()
        .str.lower()
        .str.replace(' ', '', regex=False)
    )

In [28]:
# Preview the normalised 'title' next to the untouched 'org_title'.
df.head()

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,org_title,action,adventure,animation,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
0,inception,8.364,148,False,en,83.952,Inception,1,1,0,...,0,0,0,0,1,0,0,0,0,0
1,interstellar,8.417,169,False,en,140.241,Interstellar,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,thedarkknight,8.512,152,False,en,130.643,The Dark Knight,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,avatar,7.573,162,False,en,79.932,Avatar,1,1,0,...,0,0,0,0,1,0,0,0,0,0
4,theavengers,7.71,143,False,en,98.082,The Avengers,1,1,0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
# Every language outside this short list is collapsed into the bucket 'else'.
major_languages = ['en', 'fr', 'es', 'de', 'ja']
df.loc[~df['original_language'].isin(major_languages), 'original_language'] = 'else'

## One-Hot Encoding 

In [30]:
# Encoder configured to return a dense array (sparse_output=False).
OHE = OneHotEncoder(sparse_output=False)

In [31]:
# Cast the flag to text, then one-hot encode it into named columns.
df = df.astype({'adult': str})
adult_enc = OHE.fit_transform(df[['adult']])
adult_enc_df = pd.DataFrame(
    data=adult_enc,
    columns=OHE.get_feature_names_out(),
)

In [32]:
# Drop the 'adult_True' column from the encoded frame.
adult_enc_df = adult_enc_df.drop(columns='adult_True')

In [33]:
# Refit the same encoder on the bucketed language column.
lang_enc = OHE.fit_transform(df[['original_language']])
lang_enc_df = pd.DataFrame(
    data=lang_enc,
    columns=OHE.get_feature_names_out(),
)

In [34]:
# Remove the raw categorical columns that now have encoded counterparts.
mod_df = df.drop(columns=['adult', 'original_language'])

In [35]:
# Attach the encoded adult and language columns side by side.
df = pd.concat([mod_df, adult_enc_df, lang_enc_df], axis='columns')

In [36]:
# Preview the frame with the one-hot encoded columns attached.
df.head()

Unnamed: 0,title,vote_average,runtime,popularity,org_title,action,adventure,animation,comedy,crime,...,unknown,war,western,adult_False,original_language_de,original_language_else,original_language_en,original_language_es,original_language_fr,original_language_ja
0,inception,8.364,148,83.952,Inception,1,1,0,0,0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,interstellar,8.417,169,140.241,Interstellar,0,1,0,0,0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,thedarkknight,8.512,152,130.643,The Dark Knight,1,0,0,0,1,...,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,avatar,7.573,162,79.932,Avatar,1,1,0,0,0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,theavengers,7.71,143,98.082,The Avengers,1,1,0,0,0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## Normalization 

In [37]:
from sklearn.preprocessing import StandardScaler
# Scaler instance that is fitted on the feature columns below.
SC = StandardScaler()

In [38]:
# Standardise every column except the two title columns, then restore
# the column labels on the scaled array.
feature_frame = df.drop(columns=['title', 'org_title'])
df_norm = SC.fit_transform(feature_frame)
df_norm_df = pd.DataFrame(df_norm, columns=feature_frame.columns)


In [39]:
# Put the title columns back in front of the scaled features.
title_columns = df.loc[:, ['title', 'org_title']]
df = pd.concat([title_columns, df_norm_df], axis='columns')

In [40]:
# Preview the standardised feature frame.
df.head()

Unnamed: 0,title,org_title,vote_average,runtime,popularity,action,adventure,animation,comedy,crime,...,unknown,war,western,adult_False,original_language_de,original_language_else,original_language_en,original_language_es,original_language_fr,original_language_ja
0,inception,Inception,1.159022,1.19251,6.051027,3.467897,4.803832,-0.255002,-0.524992,-0.246693,...,-0.450621,-0.132889,-0.114231,0.2431,-0.214727,-0.589825,0.962491,-0.273032,-0.274811,-0.201785
1,interstellar,Interstellar,1.186048,1.515463,10.23842,-0.288359,4.803832,-0.255002,-0.524992,-0.246693,...,-0.450621,-0.132889,-0.114231,0.2431,-0.214727,-0.589825,0.962491,-0.273032,-0.274811,-0.201785
2,thedarkknight,The Dark Knight,1.234489,1.254025,9.524416,3.467897,-0.208167,-0.255002,-0.524992,4.053617,...,-0.450621,-0.132889,-0.114231,0.2431,-0.214727,-0.589825,0.962491,-0.273032,-0.274811,-0.201785
3,avatar,Avatar,0.755682,1.407812,5.751976,3.467897,4.803832,-0.255002,-0.524992,-0.246693,...,-0.450621,-0.132889,-0.114231,0.2431,-0.214727,-0.589825,0.962491,-0.273032,-0.274811,-0.201785
4,theavengers,The Avengers,0.82554,1.115617,7.102172,3.467897,4.803832,-0.255002,-0.524992,-0.246693,...,-0.450621,-0.132889,-0.114231,0.2431,-0.214727,-0.589825,0.962491,-0.273032,-0.274811,-0.201785


## Handling Duplicates 

In [41]:
# Keep only the first row for each normalised title.
df = df[~df.duplicated(subset=['title'])]

In [42]:
# Index rows by the normalised title; df_fin is the same frame without
# the display title, i.e. only the feature columns.
df = df.set_index('title')
df_fin = df.drop(columns='org_title')

# Cosine Similarity

In [43]:
# Normalise the query the same way the 'title' column was normalised,
# then fetch its feature row as a one-row frame.
movie_name = 'the dark knight'.strip().lower().replace(' ', '')
new_df = df_fin.loc[[movie_name]]


In [44]:
# Convert the one-row frame to a (1, n_features) array.
new_df = new_df.to_numpy().reshape(1, -1)


In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the query row with every other movie's feature row.
df_other = df_fin.loc[df_fin.index != movie_name, :]
df_titles = df.loc[df.index != movie_name, 'org_title']
cosine_sim_matrix = cosine_similarity(new_df, df_other)

# One row (the query), one column per other movie labelled by display title.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim_matrix,
    index=[movie_name],
    columns=df_titles,
)

cosine_sim_df

org_title,Inception,Interstellar,Avatar,The Avengers,Deadpool,Avengers: Infinity War,Fight Club,Guardians of the Galaxy,Pulp Fiction,Forrest Gump,...,"Présidentielle, une épreuve d'artistes",Don Gio,Shimai jiken-bo: Etchi ni marumie,Money. Murder. Zurich.: Borchert and the fatal error,The Fiend with the Electronic Brain,Haminiyah Leretzach,Wakazuma triangle: Gyutto shimeru,For det fælles bedste,El Silbón,Tab
thedarkknight,0.580905,0.667455,0.492582,0.622346,0.618574,0.728363,0.777721,0.333799,0.885865,0.692117,...,-0.118244,-0.112587,-0.120576,0.322637,0.119939,-0.191128,-0.099215,-0.224258,-0.159176,-0.127118


In [46]:
# The 20 highest-scoring titles, best match first.
query_scores = cosine_sim_df.loc[movie_name]
sorted_row = query_scores.sort_values(ascending=False).head(20)

In [47]:
# Title at position 5 (zero-based) of the ranked list.
sorted_row.index[5]

'John Wick: Chapter 3 - Parabellum'

# Deployment 

In [48]:
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
def predict(movie_name,no_movies):
    """Return the titles of the movies most similar to ``movie_name``, one per line.

    The name is normalised (trimmed, lower-cased, spaces removed) and looked up
    in the index of ``df_fin``. Its feature row is compared with every other
    row of ``df_fin`` by cosine similarity, and the ``org_title`` values of the
    highest-scoring rows are returned.

    Args:
        movie_name: Title typed by the user; ``None`` is treated as empty text.
        no_movies: Requested number of recommendations; any value ``int()``
            accepts (the UI passes it as text).

    Returns:
        A string with one title per line (each followed by a newline), best
        match first. Fewer titles are returned when fewer other movies exist.
        A message string is returned instead when the movie is not found or
        when ``no_movies`` is not a positive whole number.
    """
    movie_key = (movie_name or '').strip().lower().replace(' ','')
    if movie_key not in df_fin.index:
        return "Sorry, this movie isn't in our database. \n try another one !"

    # The count arrives as free text, so answer bad input with a message
    # instead of letting int() raise inside the UI callback.
    try:
        wanted = int(no_movies)
    except (TypeError, ValueError):
        wanted = 0
    if wanted < 1:
        return "Please enter a positive whole number of recommendations."

    others = df_fin.index != movie_key
    query_vector = df_fin.loc[[movie_key]].values.reshape(1,-1)
    scores = cosine_similarity(query_vector, df_fin.loc[others,:])[0]
    ranked = pd.Series(scores, index=df.loc[others,'org_title'].to_numpy())
    ranked = ranked.sort_values(ascending=False)

    # Slice the ranked titles rather than indexing by the requested count, so
    # asking for more movies than exist cannot raise IndexError.
    return ''.join(f"{title}\n" for title in ranked.index[:wanted])

# Build the Gradio UI: two text inputs are passed to `predict`, and its
# return value is shown in a single text output.
movie_box = gr.Textbox(label="Movie Name : ")
count_box = gr.Textbox(label='No.of Recommendations: ', value='5')
result_box = gr.Textbox(label="Recommendations : ")

interface = gr.Interface(
    fn=predict,
    inputs=[movie_box, count_box],
    outputs=result_box,
)
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


