# Content-Based Filtering

### Load Data

In [24]:
import pandas as pd

In [25]:
movies = pd.read_csv("movies.csv", sep=",")

In [26]:
movies[:1]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


### Create TfidfVectorizer Object

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")

In [28]:
movies["overview"] = movies["overview"].fillna("")

In [29]:
movies["overview"]

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4798    El Mariachi just wants to play his guitar and ...
4799    A newlywed couple's honeymoon is upended by th...
4800    "Signed, Sealed, Delivered" introduces a dedic...
4801    When ambitious New York attorney Sam is sent t...
4802    Ever since the second grade when he first saw ...
Name: overview, Length: 4803, dtype: object

### Create tfidf matrix of overviews

In [30]:
overview_tfidf = tfidf.fit_transform(movies["overview"])

In [31]:
pd.DataFrame(overview_tfidf.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,zuckerberg,zula,zuzu,zyklon,æon,éloigne,émigré,été,única,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
overview_tfidf.shape

(4803, 20978)

### Similarity Matrix

In [33]:
from sklearn.metrics.pairwise import linear_kernel

In [34]:
similarity_matrix = linear_kernel(overview_tfidf, overview_tfidf)

In [35]:
similarity_matrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

In [36]:
similarity_matrix.shape

(4803, 4803)

In [37]:
similarity_matrix[1]

array([0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
       0.        ])

### Find the most similar movies to a specific movie

In [38]:
movie_title = "John Carter"

In [39]:
movie_index = movies.loc[movies["title"] == movie_title].index[0]
movie_index

np.int64(4)

In [40]:
scores = list(enumerate(similarity_matrix[movie_index]))
scores[:10]

[(0, np.float64(0.0)),
 (1, np.float64(0.033368675996424305)),
 (2, np.float64(0.0)),
 (3, np.float64(0.010433403719159351)),
 (4, np.float64(0.9999999999999998)),
 (5, np.float64(0.0)),
 (6, np.float64(0.009339192776152496)),
 (7, np.float64(0.037407042075763064)),
 (8, np.float64(0.0)),
 (9, np.float64(0.01714819056424443))]

In [41]:
sort_similarity = sorted(scores, reverse=True, key=lambda x: x[1])
sort_similarity[:10]

[(4, np.float64(0.9999999999999998)),
 (1254, np.float64(0.20497253140891997)),
 (4161, np.float64(0.16370347641323713)),
 (2932, np.float64(0.12239400129620456)),
 (3349, np.float64(0.11887151040572358)),
 (1307, np.float64(0.11468689553107403)),
 (3068, np.float64(0.11237609168095009)),
 (345, np.float64(0.09079218287485014)),
 (581, np.float64(0.0900594125581888)),
 (2998, np.float64(0.08877535757173034))]

In [42]:
similar_movies_indexes = [tpls[0] for tpls in sort_similarity[1:4]]
similar_movies_indexes

[1254, 4161, 2932]

In [43]:
similar_movies = list(movies["title"].iloc[similar_movies_indexes])
similar_movies

['Get Carter', 'The Marine 4: Moving Target', 'Raising Cain']

In [44]:
def get_similar_movies(movie_title, nr_movies):
    movie_index = movies.loc[movies["title"] == movie_title].index[0]
    scores = list(enumerate(similarity_matrix[movie_index]))
    sort_similarity = sorted(scores, reverse=True, key=lambda x: x[1])
    similar_movies_indexes = [tpls[0] for tpls in sort_similarity[1:nr_movies+1]]
    similar_movies = list(movies["title"].iloc[similar_movies_indexes])
    return similar_movies

In [45]:
get_similar_movies("Kung Fu Panda 3", 3)

['Kung Fu Panda 2',
 'My Big Fat Greek Wedding 2',
 'Once Upon a Time in the West']