## **Content-Based Recommender System for TMDB Movie Dataset**

In [1]:
# Import libraries
import re
import ast
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

### *Preprocessing the Data*

In [2]:
# Load the credits table and the movies table from their CSV files
credits, movies = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ('tmdb_5000_credits.csv', 'tmdb_5000_movies.csv')
)

In [3]:
# Inner-join credits (minus its own 'title' column) onto movies by movie id
df = (
    credits
    .drop(columns=['title'])
    .merge(movies, left_on='movie_id', right_on='id', how='inner')
)

In [4]:
# Dimensions of the merged frame as (rows, columns)
df.shape

(4803, 23)

In [5]:
# Column labels of the merged frame
df.columns

Index(['movie_id', 'cast', 'crew', 'budget', 'genres', 'homepage', 'id',
       'keywords', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'vote_average', 'vote_count'],
      dtype='object')

In [6]:
# Per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              4803 non-null   int64  
 1   cast                  4803 non-null   object 
 2   crew                  4803 non-null   object 
 3   budget                4803 non-null   int64  
 4   genres                4803 non-null   object 
 5   homepage              1712 non-null   object 
 6   id                    4803 non-null   int64  
 7   keywords              4803 non-null   object 
 8   original_language     4803 non-null   object 
 9   original_title        4803 non-null   object 
 10  overview              4800 non-null   object 
 11  popularity            4803 non-null   float64
 12  production_companies  4803 non-null   object 
 13  production_countries  4803 non-null   object 
 14  release_date          4802 non-null   object 
 15  revenue              

In [7]:
# Number of missing values in each column
df.isna().sum()

movie_id                   0
cast                       0
crew                       0
budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [8]:
# Convert string representations of lists in selected columns to actual lists
cols = ['cast', 'genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']


def _extract_names(raw):
    """Return the 'name' values held in one cell of a list column.

    Parameters
    ----------
    raw : str or list
        Either the literal text of a list of dicts (each with a 'name' key),
        or a list that this cell has already converted.

    Returns
    -------
    list
        The 'name' of every dict entry; non-dict entries are passed through.
    """
    # Only parse text: a second run of this cell sees real lists, and
    # ast.literal_eval would raise on them.
    records = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [rec['name'] if isinstance(rec, dict) else rec for rec in records]


for col in cols:
    df[col] = df[col].apply(_extract_names)

In [9]:
# Preview the first rows of the converted list columns
df[cols].head()

Unnamed: 0,cast,genres,keywords,production_companies,production_countries,spoken_languages
0,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]","[English, Español]"
1,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Walt Disney Pictures, Jerry Bruckheimer Films...",[United States of America],[English]
2,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Columbia Pictures, Danjaq, B24]","[United Kingdom, United States of America]","[Français, English, Español, Italiano, Deutsch]"
3,"[Christian Bale, Michael Caine, Gary Oldman, A...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],[English]
4,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[Walt Disney Pictures],[United States of America],[English]


In [10]:
# Add the free-text columns as ONE nested entry of `cols`: the 'mixed' join
# indexes each row with every element of `cols`, and a nested list of names
# selects those three values together as a single group.
# The membership check keeps this cell re-runnable; without it every extra run
# would append the group again and repeat its text in 'mixed'.
if ['title', 'tagline', 'overview'] not in cols:
    cols.append(['title', 'tagline', 'overview'])

In [11]:
# Concatenate values of specified columns into a new column 'mixed'
def _row_text(row, columns):
    """Join the text of the selected columns of one row into a single string.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame.
    columns : list
        Entries are either a column name or a nested list of column names.
        Falsy entries are skipped.

    Returns
    -------
    str
        One space-separated chunk per entry, chunks joined by single spaces.
        Missing values (None / NaN) are left out, so they do not end up in the
        text as the literal word 'nan'.
    """
    chunks = []
    for col in columns:
        if not col:
            continue
        value = row[col]
        # A nested list of names selects a Series; a list-valued cell is a
        # list; anything else is one scalar that must not be iterated
        # character by character.
        if isinstance(value, (list, tuple, pd.Series)):
            items = list(value)
        else:
            items = [value]
        chunks.append(" ".join(
            str(item) for item in items
            # `item != item` is only true for NaN
            if not (item is None or (isinstance(item, float) and item != item))
        ))
    return " ".join(chunks)


df['mixed'] = df.apply(lambda row: _row_text(row, cols), axis=1)

In [12]:
# Lowercase the combined text
df['mixed'] = df['mixed'].str.lower()

In [13]:
# Delete every character that is neither a word character nor whitespace
df['mixed'] = df['mixed'].str.replace(r'[^\w\s]', '', regex=True)

In [14]:
# Show the combined text for the row labelled 0
df['mixed'][0]

'sam worthington zoe saldana sigourney weaver stephen lang michelle rodriguez giovanni ribisi joel david moore cch pounder wes studi laz alonso dileep rao matt gerald sean anthony moran jason whyte scott lawrence kelly kilgour james patrick pitt sean patrick murphy peter dillon kevin dorman kelson henderson david van horn jacob tomuri michael blainrozgay jon curry luke hawker woody schultz peter mensah sonia yee jahnel curfman ilram choi kyla warren lisa roumain debra wilson chris mala taylor kibby jodie landau julie lamm cullen b madden joseph brady madden frankie torres austin wilson sara wilson tamica washingtonmiller lucy briant nathan meister gerry blair matthew chamberlain paul yates wray wilson james gaylyn melvin leno clark iii carvon futrell brandon jelkes micah moch hanniyah muhammad christopher nolen christa oliver april marie thomas bravita a threatt colin bleasdale mike bodnar matt clayton nicole dionne jamie harrison allan henry anthony ingruber ashley jeffery dean knowsl

In [15]:
# Count rows whose combined text repeats that of an earlier row
df['mixed'].duplicated().sum()

0

In [16]:
# Count missing entries in the combined text
df['mixed'].isna().sum()

0

In [17]:
# Plain Python list of the combined text, one entry per row, for the vectorizers
mixed_list = df['mixed'].tolist()

#### *CountVectorizer (convert text to features)*

In [18]:
# Instantiate the CountVectorizer, configured with the built-in English stop-word list
cnt_vec = CountVectorizer(stop_words='english')

#### *Cosine similarity*

In [19]:
#Cosine similarity
def find_similarity(cosine_sim_matrix, index, n, data=None):
    """Return the ``n`` rows most similar to row ``index``.

    Parameters
    ----------
    cosine_sim_matrix : 2-D indexable
        Pairwise similarity scores; ``cosine_sim_matrix[index]`` must yield the
        scores of row ``index`` against every row.
    index : int
        Position of the query row in the matrix.
    n : int
        Maximum number of neighbours to return; values <= 0 give an empty list.
    data : pandas.DataFrame, optional
        Frame whose 'title' column is read positionally for each neighbour.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list of dict
        ``{'value': title, 'score': score rounded to 2 decimals}`` entries,
        highest score first; ties keep their original row order.
    """
    frame = df if data is None else data

    # Exclude the query row by position instead of slicing off the first
    # sorted entry: with a tied top score or an all-zero vector the query is
    # not guaranteed to sort first, and a real neighbour would be dropped.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[index])
        if position != index
    ]

    # Highest score first
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {'value': frame.iloc[position]['title'], 'score': round(score, 2)}
        for position, score in candidates[:max(n, 0)]
    ]

#### *1. Build a Model Using CountVectorizer & Cosine similarity*

In [20]:
#Comparing similarity to get the top matches using count Vec
def get_recommendation_cv(movie_title, df, n):
    """Recommend movies similar to ``movie_title`` using CountVectorizer features.

    The title is matched against ``df['title']`` ignoring case and surrounding
    whitespace; the first matching row is used.

    Uses the notebook-level ``cnt_vec``, ``mixed_list`` and ``find_similarity``;
    ``mixed_list`` is assumed to hold one text per row of ``df``, in row order.

    Parameters
    ----------
    movie_title : str
        Title to look up.
    df : pandas.DataFrame
        Frame with a 'title' column.
    n : int
        Number of recommendations requested.

    Returns
    -------
    list of dict or None
        The result of ``find_similarity`` for the matched row, or ``None``
        (after printing 'Invalid movie title!') when no row matches.
    """
    if not isinstance(movie_title, str):
        print('Invalid movie title!')
        return None

    # str.title() capitalises every word and the letter after an apostrophe
    # ("A Bug'S Life", "... Of The ..."), so valid titles failed to match.
    # Compare case-folded text instead.
    wanted = movie_title.strip().casefold()
    positions = [
        position
        for position, title in enumerate(df['title'])
        if isinstance(title, str) and title.strip().casefold() == wanted
    ]
    if not positions:
        print('Invalid movie title!')
        return None

    # Row position of the first match; it selects a row of the similarity matrix
    index = positions[0]

    #Create vector using Count Vectorizer
    count_vector = cnt_vec.fit_transform(mixed_list)
    sim_matrix = cosine_similarity(count_vector)
    return find_similarity(sim_matrix, index, n)

In [21]:
# Query settings
movie_title = 'Inside Out'   # title to find similar items for
data = df                    # the preprocessed frame
n = 10                       # how many recommendations to list

# Show the recommendations as a table: left-aligned cells, centred headers
(
    pd.DataFrame(get_recommendation_cv(movie_title, data, n=n))
    .rename(columns={'value': 'Movie Title', 'score': 'Score'})
    .style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .format(precision=2)
)

Unnamed: 0,Movie Title,Score
0,Up,0.28
1,"Monsters, Inc.",0.26
2,WALL·E,0.25
3,Toy Story 3,0.25
4,Monsters University,0.22
5,The Secret Life of Pets,0.2
6,Despicable Me 2,0.2
7,Cars 2,0.2
8,Cloudy with a Chance of Meatballs,0.2
9,A Bug's Life,0.19


#### *TF-IDF (convert text to features)*

In [22]:
# Instantiate the TF-IDF vectorizer: word analyzer, English stop-word list, n-gram range 1 to 3
tfidf_vec = TfidfVectorizer(stop_words='english', analyzer='word', ngram_range=(1,3))

#### *2. Build a Model Using TF-IDF & Cosine similarity*

In [23]:
# Comparing similarity to get the top matches using TF-IDF
def get_recommendation_tfidf(movie_title, df, n):
    """Recommend movies similar to ``movie_title`` using TF-IDF features.

    The title is matched against ``df['title']`` ignoring case and surrounding
    whitespace; the first matching row is used.

    Uses the notebook-level ``tfidf_vec``, ``mixed_list`` and
    ``find_similarity``; ``mixed_list`` is assumed to hold one text per row of
    ``df``, in row order.

    Parameters
    ----------
    movie_title : str
        Title to look up.
    df : pandas.DataFrame
        Frame with a 'title' column.
    n : int
        Number of recommendations requested.

    Returns
    -------
    list of dict or None
        The result of ``find_similarity`` for the matched row, or ``None``
        (after printing 'Invalid movie title!') when no row matches.
    """
    if not isinstance(movie_title, str):
        print('Invalid movie title!')
        return None

    # str.title() capitalises every word and the letter after an apostrophe
    # ("A Bug'S Life", "... Of The ..."), so valid titles failed to match.
    # Compare case-folded text instead.
    wanted = movie_title.strip().casefold()
    positions = [
        position
        for position, title in enumerate(df['title'])
        if isinstance(title, str) and title.strip().casefold() == wanted
    ]
    if not positions:
        print('Invalid movie title!')
        return None

    # Row position of the first match; it selects a row of the similarity matrix
    index = positions[0]

    #Create vector using tf-idf
    tfidf_matrix = tfidf_vec.fit_transform(mixed_list)
    sim_matrix = cosine_similarity(tfidf_matrix)
    return find_similarity(sim_matrix, index, n)

In [24]:
# Query settings
movie_title = 'Inside Out'   # title to find similar items for
data = df                    # the preprocessed frame
n = 10                       # how many recommendations to list

# Show the recommendations as a table: left-aligned cells, centred headers
(
    pd.DataFrame(get_recommendation_tfidf(movie_title, data, n=n))
    .rename(columns={'value': 'Movie Title', 'score': 'Score'})
    .style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .format(precision=2)
)

Unnamed: 0,Movie Title,Score
0,Toy Story 3,0.09
1,WALL·E,0.08
2,"Monsters, Inc.",0.08
3,Up,0.06
4,Monsters University,0.05
5,Despicable Me 2,0.05
6,The Incredibles,0.05
7,Cloudy with a Chance of Meatballs,0.04
8,Treasure Planet,0.04
9,A Bug's Life,0.04
