In [85]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/million-song-data-set-subset/10000.txt
/kaggle/input/million-song-data-set-subset/song_data.csv
/kaggle/input/movie-lens-small-latest-dataset/movies.csv
/kaggle/input/movie-lens-small-latest-dataset/ratings.csv
/kaggle/input/movie-lens-small-latest-dataset/README.txt
/kaggle/input/movie-lens-small-latest-dataset/tags.csv
/kaggle/input/movie-lens-small-latest-dataset/links.csv


# Step 1 : Import required files

In [86]:
# import file
movie = pd.read_csv("/kaggle/input/movie-lens-small-latest-dataset/movies.csv")
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# Step 2 : Data Preprocesing and cleaning

In [87]:
# Check unique movie title
movie["title"].unique()

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Flint (2017)', 'Bungo Stray Dogs: Dead Apple (2018)',
       'Andrew Dice Clay: Dice Rules (1991)'], dtype=object)

In [88]:
# create ne coloumn name as year from title
movie["Year"] = movie.title.str.extract("(\(\d\d\d\d\))" ,expand = True)
movie["Year"]

0       (1995)
1       (1995)
2       (1995)
3       (1995)
4       (1995)
         ...  
9737    (2017)
9738    (2017)
9739    (2017)
9740    (2018)
9741    (1991)
Name: Year, Length: 9742, dtype: object

In [89]:
# Removing extra brakets 
movie["Year"] = movie.title.str.extract("(\d\d\d\d)" ,expand = True)
movie["Year"]

0       1995
1       1995
2       1995
3       1995
4       1995
        ... 
9737    2017
9738    2017
9739    2017
9740    2018
9741    1991
Name: Year, Length: 9742, dtype: object

In [90]:
# replace year and add whitespaces
movie["title"]= movie.title.str.replace("(\(\d\d\d\d\))","")
movie

  


Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9739,193585,Flint,Drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


In [91]:
# Removing extra spaces from begining and ending
movie["title"]= movie["title"].apply(lambda x: x.strip())
movie.head(3)

Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [92]:
# apply lower to the genres
movie["genres"] = movie["genres"].apply(lambda x:x.lower())
movie

Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story,adventure|animation|children|comedy|fantasy,1995
1,2,Jumanji,adventure|children|fantasy,1995
2,3,Grumpier Old Men,comedy|romance,1995
3,4,Waiting to Exhale,comedy|drama|romance,1995
4,5,Father of the Bride Part II,comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,action|animation|comedy|fantasy,2017
9738,193583,No Game No Life: Zero,animation|comedy|fantasy,2017
9739,193585,Flint,drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,action|animation,2018


In [93]:
# import tags file
tag = pd.read_csv("/kaggle/input/movie-lens-small-latest-dataset/tags.csv")
tag.head(3)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [94]:
# merge  movie and tag file 
df = pd.merge(movie,tag,on="movieId",how="left")
df.head(3)

Unnamed: 0,movieId,title,genres,Year,userId,tag,timestamp
0,1,Toy Story,adventure|animation|children|comedy|fantasy,1995,336.0,pixar,1139046000.0
1,1,Toy Story,adventure|animation|children|comedy|fantasy,1995,474.0,pixar,1137207000.0
2,1,Toy Story,adventure|animation|children|comedy|fantasy,1995,567.0,fun,1525286000.0


In [95]:
# create metadata by adding genres and tag
df.fillna("", inplace =True)
df = pd.DataFrame(df.groupby("movieId")["tag"].apply(lambda x: "%s" % " ".join(x)))
df


Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar pixar fun
2,fantasy magic board game Robin Williams game
3,moldy old
4,
5,pregnancy remake
...,...
193581,
193583,
193585,
193587,


In [96]:
#merge movie and df dataset
new_df = pd.merge(movie,df ,on = "movieId" ,how="left")
new_df["metadata"] = new_df[["tag","genres"]].apply(lambda x: " ".join(x),axis=1)
new_df["metadata"]

0       pixar pixar fun adventure|animation|children|c...
1       fantasy magic board game Robin Williams game a...
2                                moldy old comedy|romance
3                                    comedy|drama|romance
4                                 pregnancy remake comedy
                              ...                        
9737                      action|animation|comedy|fantasy
9738                             animation|comedy|fantasy
9739                                                drama
9740                                     action|animation
9741                                               comedy
Name: metadata, Length: 9742, dtype: object

In [97]:
# store movieId title metadata year in new_df
new_df = new_df[["movieId","title","metadata","Year"]]
new_df

Unnamed: 0,movieId,title,metadata,Year
0,1,Toy Story,pixar pixar fun adventure|animation|children|c...,1995
1,2,Jumanji,fantasy magic board game Robin Williams game a...,1995
2,3,Grumpier Old Men,moldy old comedy|romance,1995
3,4,Waiting to Exhale,comedy|drama|romance,1995
4,5,Father of the Bride Part II,pregnancy remake comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,action|animation|comedy|fantasy,2017
9738,193583,No Game No Life: Zero,animation|comedy|fantasy,2017
9739,193585,Flint,drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,action|animation,2018


In [98]:
# split |  from metadata 
new_df["metadata"]=new_df.metadata.str.split("|")
new_df

Unnamed: 0,movieId,title,metadata,Year
0,1,Toy Story,"[pixar pixar fun adventure, animation, childre...",1995
1,2,Jumanji,[fantasy magic board game Robin Williams game ...,1995
2,3,Grumpier Old Men,"[moldy old comedy, romance]",1995
3,4,Waiting to Exhale,"[ comedy, drama, romance]",1995
4,5,Father of the Bride Part II,[pregnancy remake comedy],1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[ action, animation, comedy, fantasy]",2017
9738,193583,No Game No Life: Zero,"[ animation, comedy, fantasy]",2017
9739,193585,Flint,[ drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[ action, animation]",2018


In [99]:
# find shape of final dataset
new_df.shape

(9742, 4)

In [100]:
# Remove all extra spaces from metadata column
new_df['metadata']=new_df['metadata'].apply(lambda x:[i.replace(" ","") for i in x])
new_df

Unnamed: 0,movieId,title,metadata,Year
0,1,Toy Story,"[pixarpixarfunadventure, animation, children, ...",1995
1,2,Jumanji,[fantasymagicboardgameRobinWilliamsgameadventu...,1995
2,3,Grumpier Old Men,"[moldyoldcomedy, romance]",1995
3,4,Waiting to Exhale,"[comedy, drama, romance]",1995
4,5,Father of the Bride Part II,[pregnancyremakecomedy],1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[action, animation, comedy, fantasy]",2017
9738,193583,No Game No Life: Zero,"[animation, comedy, fantasy]",2017
9739,193585,Flint,[drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[action, animation]",2018


In [101]:
# join metadata spaces
new_df['metadata']=new_df['metadata'].apply(lambda x:" ".join(x))
new_df

Unnamed: 0,movieId,title,metadata,Year
0,1,Toy Story,pixarpixarfunadventure animation children come...,1995
1,2,Jumanji,fantasymagicboardgameRobinWilliamsgameadventur...,1995
2,3,Grumpier Old Men,moldyoldcomedy romance,1995
3,4,Waiting to Exhale,comedy drama romance,1995
4,5,Father of the Bride Part II,pregnancyremakecomedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,action animation comedy fantasy,2017
9738,193583,No Game No Life: Zero,animation comedy fantasy,2017
9739,193585,Flint,drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,action animation,2018


In [102]:
# check new_df 
new_df

Unnamed: 0,movieId,title,metadata,Year
0,1,Toy Story,pixarpixarfunadventure animation children come...,1995
1,2,Jumanji,fantasymagicboardgameRobinWilliamsgameadventur...,1995
2,3,Grumpier Old Men,moldyoldcomedy romance,1995
3,4,Waiting to Exhale,comedy drama romance,1995
4,5,Father of the Bride Part II,pregnancyremakecomedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,action animation comedy fantasy,2017
9738,193583,No Game No Life: Zero,animation comedy fantasy,2017
9739,193585,Flint,drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,action animation,2018


In [103]:
# Find indexing position
x = new_df[new_df["title"]=="Jumanji"].index
x

Int64Index([1], dtype='int64')

In [104]:
# check 0th location of new_df
new_df['metadata'][0]

'pixarpixarfunadventure animation children comedy fantasy'

In [105]:
# check values of new_df
new_df.values

array([[1, 'Toy Story',
        'pixarpixarfunadventure animation children comedy fantasy',
        '1995'],
       [2, 'Jumanji',
        'fantasymagicboardgameRobinWilliamsgameadventure children fantasy',
        '1995'],
       [3, 'Grumpier Old Men', 'moldyoldcomedy romance', '1995'],
       ...,
       [193585, 'Flint', 'drama', '2017'],
       [193587, 'Bungo Stray Dogs: Dead Apple', 'action animation',
        '2018'],
       [193609, 'Andrew Dice Clay: Dice Rules', 'comedy', '1991']],
      dtype=object)

In [106]:
# check type of new_df
type(new_df)

pandas.core.frame.DataFrame

In [107]:
# check shape of dataframe
new_df.shape

(9742, 4)

# Step 3 : Convert Dataframe into sparse matrix

In [108]:
# for create vector using TfidfVectorizer library
from sklearn.feature_extraction.text import TfidfVectorizer 
tfid = TfidfVectorizer(stop_words='english')
new_df['metadata'] = new_df['metadata'].fillna('')

In [109]:
tfv_matrix = tfid.fit_transform(new_df['metadata'])
tfv_matrix

<9742x1360 sparse matrix of type '<class 'numpy.float64'>'
	with 23318 stored elements in Compressed Sparse Row format>

In [110]:
# check shape of matrix
tfv_matrix.shape

(9742, 1360)

In [111]:
# import linear_kernel from sklearn
from sklearn.metrics.pairwise import linear_kernel
### compute the sigmoid kernel
sig = linear_kernel(tfv_matrix, tfv_matrix)

In [112]:
# store index of title 
indices = pd.Series(new_df.index, index=new_df["title"])

In [113]:
indices

title
Toy Story                                0
Jumanji                                  1
Grumpier Old Men                         2
Waiting to Exhale                        3
Father of the Bride Part II              4
                                      ... 
Black Butler: Book of the Atlantic    9737
No Game No Life: Zero                 9738
Flint                                 9739
Bungo Stray Dogs: Dead Apple          9740
Andrew Dice Clay: Dice Rules          9741
Length: 9742, dtype: int64

In [114]:
indices["Toy Story"]

0

# Step 4 : Create a function for movie recomendation

In [115]:
def recomend_movie(title,cosine_sin=sig):
    idx = indices[title]
    sim_scores = enumerate(cosine_sin[idx])
    sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
    # how many movies will recomend is set here
    sim_scores = sim_scores[1:10]            
    movies_indices = [i[0] for i in sim_scores]
    movie_rec = new_df['title'].iloc[movies_indices].reset_index
    print(movie_rec)

#  Step 5 : Function Call

In [116]:
recomend_movie("Toy Story")

<bound method Series.reset_index of 7184                          Partly Cloudy
7917                                 Presto
8273    Cloudy with a Chance of Meatballs 2
8674      Stuart Little 3: Call of the Wild
9536           Last Year's Snow Was Falling
9560                   Wow! A Talking Fish!
1584                  All Dogs Go to Heaven
2160                             Thumbelina
3937                  Care Bears Movie, The
Name: title, dtype: object>


In [117]:
recomend_movie("Grumpier Old Men")

<bound method Series.reset_index of 864                         Hustler White
1128           Kama Sutra: A Tale of Love
1130                           Love Jones
1140          Love and Other Catastrophes
1151            Temptress Moon (Feng Yue)
1182                                 Fall
1745                       Meet Joe Black
1879                  Message in a Bottle
2033    Autumn Tale, An (Conte d'automne)
Name: title, dtype: object>


# End of the Project using content base filtering