# Content Based Recommendation System

# Q1 Read the Dataset `movies_metadata.csv`

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
movies_ratings_df = pd.read_csv("movies_metadata-1.csv",encoding = "ISO-8859-1")

In [4]:
movies_ratings_df.shape

(45466, 24)

In [5]:
movies_ratings_df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [6]:
movies_ratings_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0



# Q2 Create a new column with name 'description' combining 'overview' and 'tagline' columns in the given dataset

In [7]:
movies_ratings_df['description'] = movies_ratings_df['overview'].astype(str) + movies_ratings_df['title']

# Q3  Lets drop the null values in `description` column

In [8]:
movies_ratings_df.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
description                  6
dtype: int64

In [9]:
movies_ratings_df.dropna(subset=['description'], inplace = True)

In [10]:
movies_ratings_df.shape

(45460, 25)

# Q4 Keep the first occurance and drop duplicates of each title in column 'title'

In [11]:
movies_ratings_df.drop_duplicates(subset=['title'], inplace = True)

In [12]:
movies_ratings_df.shape

(42277, 25)

In [13]:
movies_ratings_df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
description               object
dtype: object

# Q5   As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [14]:
movies_ratings_df.reset_index(drop=True, inplace=True)

In [15]:
movies_ratings_df.shape

(42277, 25)

# Q6    Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

Hint:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english', min_df = 0)
tf_idf_vect.fit(movies_ratings_df['description'])
title_matrix = tf_idf_vect.transform(movies_ratings_df['description'])

In [17]:
features = tf_idf_vect.get_feature_names()
features

['00',
 '00 blood',
 '00 blood sweat',
 '00 body',
 '00 body vanish',
 '00 editor',
 '00 editor jabez',
 '00 foot',
 '00 foot tall',
 '00 furnish',
 '00 furnish ascent',
 '00 middle',
 '00 middle dance',
 '00 pm',
 '00 pm 12',
 '00 pm clock',
 '00 rescue',
 '00 rescue dr',
 '00 schneider',
 '00 schneider asked',
 '00 schneider jagd',
 '00 wakening',
 '00 wakening trailblazer',
 '000',
 '000 00',
 '000 00 editor',
 '000 000',
 '000 000 bloody',
 '000 000 brazilian',
 '000 000 cash',
 '000 000 citizens',
 '000 000 dollars',
 '000 000 fmk',
 '000 000 instead',
 '000 000 instructions',
 '000 000 kroner',
 '000 000 pounds',
 '000 000 rebuild',
 '000 000 stashed',
 '000 000 stolen',
 '000 000 tournament',
 '000 000 worth',
 '000 000 years',
 '000 002',
 '000 24',
 '000 24 hours',
 '000 30',
 '000 30 000',
 '000 300',
 '000 300 spartans',
 '000 50',
 '000 50 000',
 '000 accept',
 '000 accept denard',
 '000 additional',
 '000 additional tons',
 '000 aliens',
 '000 aliens newcomers',
 '000 amer

# Q7  Create cosine similarity matrix

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim_titles = cosine_similarity(title_matrix)

# Q8  Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

Hint:

titles = df['title'] <br>
indices = pd.Series(df.index, index=df['title']) <br>

def recommend(title): <br>
    idx = indices[title] <br>
    sim_scores = list(enumerate(cosine_similarities[idx])) <br>
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) <br>
    sim_scores = sim_scores[1:31] <br>
    movie_indices = [i[0] for i in sim_scores] <br>
    return titles.iloc[movie_indices] <br>

In [19]:
titles = movies_ratings_df['title'] 
indices = pd.Series(movies_ratings_df.index, index=movies_ratings_df['title']) 

In [20]:
def recommend(title): 
    idx = indices[title] 
    sim_scores = list(enumerate(cosine_sim_titles[idx])) 
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 
    sim_scores = sim_scores[1:11] 
    movie_indices = [i[0] for i in sim_scores] 
    return titles.iloc[movie_indices] 

# Q9 Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [21]:
recommend('The Dark Knight Rises')

12075                                      The Dark Knight
150                                         Batman Forever
585                                                 Batman
18902              Batman: The Dark Knight Returns, Part 1
8993                    Batman Beyond: Return of the Joker
20194    Batman Unmasked: The Psychology of the Dark Kn...
12425                                Batman: Gotham Knight
19308              Batman: The Dark Knight Returns, Part 2
1325                                        Batman Returns
14924                           Batman: Under the Red Hood
Name: title, dtype: object

In [22]:
recommend('The Godfather')

1176               The Godfather: Part II
40982    The Godfather Trilogy: 1972-1990
29997                    Honor Thy Father
11608          The Cave of the Yellow Dog
17533                     The Outside Man
1908              The Godfather: Part III
35502            A Mother Should Be Loved
21969                          Blood Ties
28369                             Maqbool
29                         Shanghai Triad
Name: title, dtype: object

In [23]:
recommend('Transformers')

13342    Transformers: Revenge of the Fallen
41715          Transformers: The Last Knight
3839             The Transformers: The Movie
16584         Transformers: Dark of the Moon
22407        Transformers: Age of Extinction
35560                             Son of Sam
23625                            Amira & Sam
4825                                I Am Sam
22823                                 Career
22486                    Bullet for a Badman
Name: title, dtype: object

# Popularity Based Recommendation System

### About Dataset

Anonymous Ratings on jokes.

1. Ratings are real values ranging from -10.00 to +10.00 (the value "99" corresponds to "null" = "not rated").

2. One row per user

3. The first column gives the number of jokes rated by that user. The next 100 columns give the ratings for jokes 01 - 100.

# Q10 Read the dataset(jokes.csv)

Take care about the header in read_csv() as there are no column names given in the dataset. 

In [24]:
jokes_df = pd.read_csv("jokes-1.csv",encoding = "ISO-8859-1")

In [25]:
jokes_df.head()

Unnamed: 0,NumJokes,Joke1,Joke2,Joke3,Joke4,Joke5,Joke6,Joke7,Joke8,Joke9,...,Joke91,Joke92,Joke93,Joke94,Joke95,Joke96,Joke97,Joke98,Joke99,Joke100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [26]:
jokes_df.shape

(24983, 101)

# Q11 Consider `ratings` named dataframe with only first 200 rows and all columns from 1(first column is 0) of dataset

In [27]:
ratings_df = jokes_df.iloc[0:200,1:]

In [28]:
ratings_df.shape

(200, 100)

In [29]:
ratings_df.head()

Unnamed: 0,Joke1,Joke2,Joke3,Joke4,Joke5,Joke6,Joke7,Joke8,Joke9,Joke10,...,Joke91,Joke92,Joke93,Joke94,Joke95,Joke96,Joke97,Joke98,Joke99,Joke100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


# Q12 Change the column indices from 0 to 99

In [30]:
ratings_df.columns = np.arange(len(ratings_df.columns))

In [31]:
ratings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


# Q13 In the dataset, the null ratings are given as 99.00, so replace all 99.00s with 0
Hint: You can use `ratings.replace(<the given value>, <new value you wanted to change with>)`

In [32]:
ratings_df.replace(99.00,0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.50,-9.85,4.17,-8.98,-4.76,...,2.82,0.00,0.00,0.00,0.00,0.00,-5.63,0.00,0.00,0.00
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,0.00,0.00,0.00,0.00,9.03,9.27,9.03,9.27,0.00,0.00,...,0.00,0.00,0.00,9.08,0.00,0.00,0.00,0.00,0.00,0.00
3,0.00,8.35,0.00,0.00,1.80,8.16,-2.82,6.21,0.00,1.84,...,0.00,0.00,0.00,0.53,0.00,0.00,0.00,0.00,0.00,0.00
4,8.50,4.61,-4.17,-5.39,1.36,1.60,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.80,1.60
5,-6.17,-3.54,0.44,-8.50,-7.09,-4.32,-8.69,-0.87,-6.65,-1.80,...,-3.54,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45
6,0.00,0.00,0.00,0.00,8.59,-9.85,7.72,8.79,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,2.33,0.00,0.00,0.00,0.00
7,6.84,3.16,9.17,-6.21,-8.16,-1.70,9.27,1.41,-5.19,-4.42,...,7.23,-1.12,-0.10,-5.68,-3.16,-3.35,2.14,-0.05,1.31,0.00
8,-3.79,-3.54,-9.42,-6.89,-8.74,-0.29,-5.29,-8.93,-7.86,-1.60,...,4.37,-0.29,4.17,-0.29,-0.29,-0.29,-0.29,-0.29,-3.40,-4.95
9,3.01,5.15,5.15,3.01,6.41,5.15,8.93,2.52,3.01,8.16,...,0.00,4.47,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


# Q14 Normalize the ratings using StandardScaler and save them in `ratings_diff` variable

In [33]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [34]:
ratings_diff =  sc.fit(ratings_df)

### Popularity based recommendation system

# Q15  Find the mean for each column  in `ratings_diff` i.e, for each joke
Consider all the mean ratings and find the jokes with highest mean value and display the top 10 joke IDs.

In [35]:
data = sc.mean_

In [36]:
mean_df = pd.DataFrame(data)

In [37]:
mean_df.head()

Unnamed: 0,0
0,29.73505
1,18.9089
2,34.88765
3,45.334
4,-0.0379


In [38]:
mean_df.sort_values(by=[0], ascending = False).head(10)

Unnamed: 0,0
88,68.26785
74,68.16485
76,68.03305
75,67.99045
71,67.946
79,67.89735
72,67.681
78,67.09945
77,67.05365
73,66.91115


In [39]:
# Above are the Top 10 Joke IDs basis the average of the ratings for the jokes. 
# Index Ids are the joke IDs