In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# data importing

In [2]:
credits=pd.read_csv(r"C:\HBO-MAX\credits.csv\credits.csv")
titles=pd.read_csv(r"C:\HBO-MAX\titles.csv\titles.csv")

In [3]:
credits.head()

Unnamed: 0,person_id,id,name,character,role
0,60017,tm155702,Judy Garland,Dorothy Gale,ACTOR
1,53496,tm155702,Ray Bolger,Hunk / Scarecrow,ACTOR
2,79549,tm155702,Jack Haley,Hickory / Tin Man,ACTOR
3,79548,tm155702,Bert Lahr,Zeke / Cowardly Lion,ACTOR
4,60995,tm155702,Margaret Hamilton,Elmira Gulch / Wicked Witch of the West,ACTOR


In [4]:
credits.shape

(66393, 5)

In [5]:
credits.drop_duplicates(inplace=True)
credits.shape

(66393, 5)

In [6]:
credits.isnull().sum()

person_id       0
id              0
name            0
character    4505
role            0
dtype: int64

In [7]:
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,tm155702,The Wizard of Oz,MOVIE,Young Dorothy finds herself in a magical world...,1939,G,102,"['fantasy', 'family']",['US'],,tt0032138,8.1,389774.0,41.442,7.6
1,tm83648,Citizen Kane,MOVIE,"Newspaper magnate, Charles Foster Kane is take...",1941,PG,119,['drama'],['US'],,tt0033467,8.3,433804.0,14.383,8.0
2,tm77588,Casablanca,MOVIE,"In Casablanca, Morocco in December 1941, a cyn...",1942,PG,102,"['drama', 'romance', 'war']",['US'],,tt0034583,8.5,558849.0,20.087,8.2
3,tm82363,The Big Sleep,MOVIE,Private Investigator Philip Marlowe is hired b...,1946,,116,"['thriller', 'crime']",['US'],,tt0038355,7.9,84494.0,12.911,7.7
4,tm84701,The Maltese Falcon,MOVIE,A private detective takes on a case that invol...,1941,,100,"['thriller', 'romance', 'crime']",['US'],,tt0033870,8.0,156603.0,12.788,7.8


In [8]:
titles.isnull().sum()

id                         0
title                      0
type                       0
description               12
release_year               0
age_certification       1208
runtime                    0
genres                     0
production_countries       0
seasons                 2538
imdb_id                  326
imdb_score               372
imdb_votes               383
tmdb_popularity           33
tmdb_score               268
dtype: int64

In [9]:
titles.shape

(3294, 15)

In [10]:
titles.drop_duplicates(inplace=True)
titles.shape

(3294, 15)

In [11]:
titles['description'].fillna(' ',inplace=True)
titles.isnull().sum()

id                         0
title                      0
type                       0
description                0
release_year               0
age_certification       1208
runtime                    0
genres                     0
production_countries       0
seasons                 2538
imdb_id                  326
imdb_score               372
imdb_votes               383
tmdb_popularity           33
tmdb_score               268
dtype: int64

In [12]:
type(titles['genres'][0])

str

In [13]:
titles['description'][3200]

'In the late ‘90s, “Sex and the City” took television by storm with its honest and hilarious perspective on love, relationships… and sex, earning legions of devoted fans. Over 20 years later, this exclusive and immersive documentary offers a unique behind-the-scenes look at the filming of the new chapter, “And Just Like That…”.'

# data cleaning

In [14]:
import re

In [15]:
def cleaning(text):
    text=re.sub('[^a-zA-Z0-9]',' ',text) #remove other than aplhabet,num
    text=re.sub('\s+',' ',text) #removing extra whitespaces
    text=text.lower()
    return text.strip()

In [16]:
cleaning(titles['description'][3200])

'in the late 90s sex and the city took television by storm with its honest and hilarious perspective on love relationships and sex earning legions of devoted fans over 20 years later this exclusive and immersive documentary offers a unique behind the scenes look at the filming of the new chapter and just like that'

In [17]:
titles['description']=titles['description'].apply(lambda x: cleaning(x))
titles['description'].head()

0    young dorothy finds herself in a magical world...
1    newspaper magnate charles foster kane is taken...
2    in casablanca morocco in december 1941 a cynic...
3    private investigator philip marlowe is hired b...
4    a private detective takes on a case that invol...
Name: description, dtype: object

In [18]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [19]:
def lemmatization(text):
    doc=nlp(text)
    y=' '
    for token in doc:
        x=token.lemma_
        y=y+' '+x
    return y.strip()

In [20]:
lemmatization(titles['description'][3200])

'in the late 90 sex and the city take television by storm with its honest and hilarious perspective on love relationship and sex earn legion of devoted fan over 20 year later this exclusive and immersive documentary offer a unique behind the scene look at the filming of the new chapter and just like that'

In [21]:
titles['description'][3200]

'in the late 90s sex and the city took television by storm with its honest and hilarious perspective on love relationships and sex earning legions of devoted fans over 20 years later this exclusive and immersive documentary offers a unique behind the scenes look at the filming of the new chapter and just like that'

In [22]:
from spacy.lang.en.stop_words import STOP_WORDS
len(STOP_WORDS)

326

In [23]:
def stop_words(text):
    doc=nlp(text)
    y=' '
    for token in doc:
        if not token.is_stop:
            x=token.text
            y=y+' '+x
    return y.strip()

In [24]:
stop_words(titles['description'][3200])

'late 90s sex city took television storm honest hilarious perspective love relationships sex earning legions devoted fans 20 years later exclusive immersive documentary offers unique scenes look filming new chapter like'

In [25]:
titles['description']=titles['description'].apply(lambda x: lemmatization(x))
titles['description']=titles['description'].apply(lambda x: stop_words(x))

In [26]:
titles['description'][3200]

'late 90 sex city television storm honest hilarious perspective love relationship sex earn legion devoted fan 20 year later exclusive immersive documentary offer unique scene look filming new chapter like'

In [27]:
titles['genres']=titles['genres'].apply(lambda x: cleaning(x))

In [28]:
titles['genres'].head()

0            fantasy family
1                     drama
2         drama romance war
3            thriller crime
4    thriller romance crime
Name: genres, dtype: object

In [29]:
titles['tags']=titles['description']+' '+titles['genres']

In [30]:
titles['genres'][3200]

'documentation'

In [31]:
len(titles['tags'][3200].split())

31

In [32]:
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags
0,tm155702,The Wizard of Oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,102,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...
1,tm83648,Citizen Kane,MOVIE,newspaper magnate charles foster kane mother b...,1941,PG,119,drama,['US'],,tt0033467,8.3,433804.0,14.383,8.0,newspaper magnate charles foster kane mother b...
2,tm77588,Casablanca,MOVIE,casablanca morocco december 1941 cynical ameri...,1942,PG,102,drama romance war,['US'],,tt0034583,8.5,558849.0,20.087,8.2,casablanca morocco december 1941 cynical ameri...
3,tm82363,The Big Sleep,MOVIE,private investigator philip marlowe hire wealt...,1946,,116,thriller crime,['US'],,tt0038355,7.9,84494.0,12.911,7.7,private investigator philip marlowe hire wealt...
4,tm84701,The Maltese Falcon,MOVIE,private detective case involve eccentric crimi...,1941,,100,thriller romance crime,['US'],,tt0033870,8.0,156603.0,12.788,7.8,private detective case involve eccentric crimi...


# vectorization

# bow

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=500)

In [34]:
requried_text=titles['tags']

In [35]:
x_bow=cv.fit_transform(requried_text).toarray()

In [210]:
x_bow[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# tf-idf

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf=TfidfVectorizer(
    sublinear_tf=True,
    max_features=500
)

In [39]:
x_tf=tfidf.fit_transform(requried_text).toarray()

In [40]:
x_tf.shape

(3294, 500)

# content filtering

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
similarity1=cosine_similarity(x_bow)

In [43]:
similarity1.shape

(3294, 3294)

In [44]:
sorted(list(enumerate(similarity1[5])),reverse=True,key=lambda x:x[1])[1:11]

[(336, 0.5636018619766344),
 (1543, 0.5504818825631803),
 (41, 0.547722557505166),
 (233, 0.5333333333333332),
 (1435, 0.5333333333333332),
 (2, 0.5270462766947299),
 (86, 0.5163977794943222),
 (60, 0.4898979485566356),
 (75, 0.48304589153964794),
 (1932, 0.48304589153964794)]

In [45]:
similarity2=cosine_similarity(x_tf)

In [46]:
similarity2.shape

(3294, 3294)

In [47]:
sorted(list(enumerate(similarity2[5])),reverse=True,key=lambda x:x[1])[1:10]

[(86, 0.4247720599814495),
 (233, 0.4074707213239203),
 (1757, 0.4000313351600295),
 (41, 0.38866659518248725),
 (1435, 0.3763487768967593),
 (336, 0.3752457793837427),
 (2, 0.35123094321039583),
 (1543, 0.3501650152923922),
 (158, 0.33668450884151335)]

In [48]:
movie_index=titles[titles['title']=='The Wizard of Oz'].index[0]
movie_index

0

In [49]:
(titles.iloc[0]).title

'The Wizard of Oz'

In [50]:
titles['title']=titles['title'].apply(lambda x: cleaning(x))

In [51]:
(titles.iloc[0]).title

'the wizard of oz'

In [52]:
def recommend(movie):
    movie=cleaning(movie)
    movie_index=titles[titles['title']==movie].index[0]
    distances=similarity1[movie_index]
    movie_list=sorted(enumerate(distances),reverse=True,key=lambda x:x[1])[1:11]
    
    for i,j in movie_list:
        if(j>0.30):
            print(titles.iloc[i].title,'->',j) 

In [53]:
recommend('The big Sleep')

the maltese falcon -> 0.5
shaft -> 0.4564354645876385
murder by numbers -> 0.44721359549995804
the man who knew too much -> 0.4216370213557839
vengeance killer coworkers -> 0.4
sharp objects -> 0.38138503569823695
nancy drew -> 0.37210420376762543
double jeopardy -> 0.3651483716701108
the conjuring the devil made me do it -> 0.3651483716701108
seance on a wet afternoon -> 0.3627381250550058


# collebrative filtering

In [99]:
credits

Unnamed: 0,person_id,id,name,character,role
0,60017,tm155702,Judy Garland,Dorothy Gale,ACTOR
1,53496,tm155702,Ray Bolger,Hunk / Scarecrow,ACTOR
2,79549,tm155702,Jack Haley,Hickory / Tin Man,ACTOR
3,79548,tm155702,Bert Lahr,Zeke / Cowardly Lion,ACTOR
4,60995,tm155702,Margaret Hamilton,Elmira Gulch / Wicked Witch of the West,ACTOR
...,...,...,...,...,...
66388,1224011,tm1067128,Stella Lauri,Marta,ACTOR
66389,2305342,tm1067128,Fabiola Sánchez,Claudia,ACTOR
66390,1245864,tm1067128,Ash Olivera,Nora,ACTOR
66391,2305203,tm1067128,Mariel Garcia Spooner,,DIRECTOR


In [100]:
titles.isnull().sum()

id                         0
title                      0
type                       0
description                0
release_year               0
age_certification       1208
runtime                    0
genres                     0
production_countries       0
seasons                 2538
imdb_id                  326
imdb_score                 0
imdb_votes               383
tmdb_popularity           33
tmdb_score                 0
tags                       0
rating                     0
dtype: int64

In [101]:
titles['imdb_score'].fillna(0,inplace=True)
titles['tmdb_score'].fillna(0,inplace=True)

In [102]:
titles['rating']=(titles['imdb_score']+titles['tmdb_score'])/2
titles['rating'].head()

0    7.85
1    8.15
2    8.35
3    7.80
4    7.90
Name: rating, dtype: float64

In [103]:
titles.isnull().sum()

id                         0
title                      0
type                       0
description                0
release_year               0
age_certification       1208
runtime                    0
genres                     0
production_countries       0
seasons                 2538
imdb_id                  326
imdb_score                 0
imdb_votes               383
tmdb_popularity           33
tmdb_score                 0
tags                       0
rating                     0
dtype: int64

In [105]:
titles['rating'].describe()

count    3294.000000
mean        6.199651
std         1.866789
min         0.000000
25%         5.650000
50%         6.700000
75%         7.450000
max         9.500000
Name: rating, dtype: float64

In [106]:
print(titles['id'].shape)
print(credits['id'].unique().shape)

(3294,)
(3120,)


In [107]:
final_data=pd.merge(credits,titles,on='id',how='inner')
final_data

Unnamed: 0,person_id,id,name,character,role,title,type,description,release_year,age_certification,...,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags,rating
0,60017,tm155702,Judy Garland,Dorothy Gale,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
1,53496,tm155702,Ray Bolger,Hunk / Scarecrow,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
2,79549,tm155702,Jack Haley,Hickory / Tin Man,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
3,79548,tm155702,Bert Lahr,Zeke / Cowardly Lion,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
4,60995,tm155702,Margaret Hamilton,Elmira Gulch / Wicked Witch of the West,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66388,1224011,tm1067128,Stella Lauri,Marta,ACTOR,algo azul,MOVIE,romantic comedy come soon,2021,,...,comedy,['PA'],,tt9257620,5.9,50.0,1.400,2.0,romantic comedy come soon comedy,3.95
66389,2305342,tm1067128,Fabiola Sánchez,Claudia,ACTOR,algo azul,MOVIE,romantic comedy come soon,2021,,...,comedy,['PA'],,tt9257620,5.9,50.0,1.400,2.0,romantic comedy come soon comedy,3.95
66390,1245864,tm1067128,Ash Olivera,Nora,ACTOR,algo azul,MOVIE,romantic comedy come soon,2021,,...,comedy,['PA'],,tt9257620,5.9,50.0,1.400,2.0,romantic comedy come soon comedy,3.95
66391,2305203,tm1067128,Mariel Garcia Spooner,,DIRECTOR,algo azul,MOVIE,romantic comedy come soon,2021,,...,comedy,['PA'],,tt9257620,5.9,50.0,1.400,2.0,romantic comedy come soon comedy,3.95


In [108]:
final_data[final_data['id']=='tm155702']

Unnamed: 0,person_id,id,name,character,role,title,type,description,release_year,age_certification,...,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags,rating
0,60017,tm155702,Judy Garland,Dorothy Gale,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
1,53496,tm155702,Ray Bolger,Hunk / Scarecrow,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
2,79549,tm155702,Jack Haley,Hickory / Tin Man,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
3,79548,tm155702,Bert Lahr,Zeke / Cowardly Lion,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
4,60995,tm155702,Margaret Hamilton,Elmira Gulch / Wicked Witch of the West,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
5,53493,tm155702,Frank Morgan,Professor Marvel / Wizard of Oz,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
6,79551,tm155702,Clara Blandick,Auntie Em,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
7,41661,tm155702,Charley Grapewin,Uncle Henry,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
8,72816,tm155702,Billie Burke,Glinda,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
9,79553,tm155702,Pat Walshe,Nikko,ACTOR,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,...,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85


In [109]:
print(final_data['person_id'].unique().shape)
print(credits['person_id'].unique().shape)
print(final_data['id'].unique().shape)
print(credits['id'].unique().shape)

(44872,)
(44872,)
(3120,)
(3120,)


In [110]:
rating=pd.concat([final_data['person_id'],final_data['id'],final_data['rating']],axis=1)
rating

Unnamed: 0,person_id,id,rating
0,60017,tm155702,7.85
1,53496,tm155702,7.85
2,79549,tm155702,7.85
3,79548,tm155702,7.85
4,60995,tm155702,7.85
...,...,...,...
66388,1224011,tm1067128,3.95
66389,2305342,tm1067128,3.95
66390,1245864,tm1067128,3.95
66391,2305203,tm1067128,3.95


In [181]:
x=rating['person_id'].value_counts()
x[x>=10]

14142    60
529      50
21759    44
7997     44
20372    43
         ..
2860     10
4444     10
60743    10
71801    10
4441     10
Name: person_id, Length: 230, dtype: int64

In [182]:
y=x[x>=10].index
y

Int64Index([14142,   529, 21759,  7997, 20372,  1950, 18723,  6821, 12050,
             6050,
            ...
             3776,  8987, 36538, 25159,  8641,  2860,  4444, 60743, 71801,
             4441],
           dtype='int64', length=230)

In [183]:
rating=rating[rating['person_id'].isin(y)]
rating.head()

Unnamed: 0,person_id,id,rating
23,15246,tm83648,8.15
157,22474,tm83648,8.15
175,15246,tm83648,8.15
306,14166,tm82363,7.8
351,17900,tm84701,7.9


In [184]:
rating.shape

(3137, 3)

In [185]:
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags,rating
0,tm155702,the wizard of oz,MOVIE,young dorothy find magical world friend lion s...,1939,G,102,fantasy family,['US'],,tt0032138,8.1,389774.0,41.442,7.6,young dorothy find magical world friend lion s...,7.85
1,tm83648,citizen kane,MOVIE,newspaper magnate charles foster kane mother b...,1941,PG,119,drama,['US'],,tt0033467,8.3,433804.0,14.383,8.0,newspaper magnate charles foster kane mother b...,8.15
2,tm77588,casablanca,MOVIE,casablanca morocco december 1941 cynical ameri...,1942,PG,102,drama romance war,['US'],,tt0034583,8.5,558849.0,20.087,8.2,casablanca morocco december 1941 cynical ameri...,8.35
3,tm82363,the big sleep,MOVIE,private investigator philip marlowe hire wealt...,1946,,116,thriller crime,['US'],,tt0038355,7.9,84494.0,12.911,7.7,private investigator philip marlowe hire wealt...,7.8
4,tm84701,the maltese falcon,MOVIE,private detective case involve eccentric crimi...,1941,,100,thriller romance crime,['US'],,tt0033870,8.0,156603.0,12.788,7.8,private detective case involve eccentric crimi...,7.9


In [186]:
rating_titles=rating.merge(titles,on=['id','rating'])

In [187]:
num_rating=rating_titles.groupby('title')['rating'].count().reset_index()

In [188]:
num_rating.rename(columns={'rating':'num_of_rating'},inplace=True)

In [192]:
num_rating=num_rating[num_rating['num_of_rating']>=10]

In [193]:
num_rating

Unnamed: 0,title,num_of_rating
120,batman the dark knight returns part 1,10
137,ben 10,15
414,harry potter 20th anniversary return to hogwarts,17
415,harry potter and the chamber of secrets,11
416,harry potter and the deathly hallows part 1,15
417,harry potter and the deathly hallows part 2,18
418,harry potter and the goblet of fire,12
419,harry potter and the half blood prince,13
420,harry potter and the order of the phoenix,17
421,harry potter and the philosopher s stone,10


In [194]:
final_rating=num_rating.merge(rating_titles,on='title')

In [195]:
final_rating.shape

(325, 19)

In [196]:
final_rating.head()

Unnamed: 0,title,num_of_rating,person_id,id,rating,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags
0,batman the dark knight returns part 1,10,14964,tm62126,3.9,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
1,batman the dark knight returns part 1,10,6653,tm62126,3.9,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
2,batman the dark knight returns part 1,10,18723,tm62126,3.9,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
3,batman the dark knight returns part 1,10,14142,tm62126,3.9,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
4,batman the dark knight returns part 1,10,22043,tm62126,3.9,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...


In [197]:
final_rating.drop_duplicates(['person_id','id'],inplace=True)

In [198]:
final_rating.shape

(320, 19)

In [199]:
final_rating

Unnamed: 0,title,num_of_rating,person_id,id,rating,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,tags
0,batman the dark knight returns part 1,10,14964,tm62126,3.90,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
1,batman the dark knight returns part 1,10,6653,tm62126,3.90,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
2,batman the dark knight returns part 1,10,18723,tm62126,3.90,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
3,batman the dark knight returns part 1,10,14142,tm62126,3.90,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
4,batman the dark knight returns part 1,10,22043,tm62126,3.90,MOVIE,batman year new breed criminal ravage gotham c...,2012,PG-13,76,thriller scifi action animation,['US'],,,0.0,,28.785,7.8,batman year new breed criminal ravage gotham c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,tmnt,12,524,tm82442,6.15,MOVIE,defeat old arch nemesis shredder turtle grow a...,2007,PG,87,animation comedy family fantasy scifi action,"['US', 'HK']",,tt0453556,6.2,64062.0,22.181,6.1,defeat old arch nemesis shredder turtle grow a...
321,tmnt,12,14142,tm82442,6.15,MOVIE,defeat old arch nemesis shredder turtle grow a...,2007,PG,87,animation comedy family fantasy scifi action,"['US', 'HK']",,tt0453556,6.2,64062.0,22.181,6.1,defeat old arch nemesis shredder turtle grow a...
322,tmnt,12,19828,tm82442,6.15,MOVIE,defeat old arch nemesis shredder turtle grow a...,2007,PG,87,animation comedy family fantasy scifi action,"['US', 'HK']",,tt0453556,6.2,64062.0,22.181,6.1,defeat old arch nemesis shredder turtle grow a...
323,tmnt,12,53,tm82442,6.15,MOVIE,defeat old arch nemesis shredder turtle grow a...,2007,PG,87,animation comedy family fantasy scifi action,"['US', 'HK']",,tt0453556,6.2,64062.0,22.181,6.1,defeat old arch nemesis shredder turtle grow a...


In [200]:
rating_pivot=final_rating.pivot_table(columns='person_id',index='title',values='rating')

In [201]:
rating_pivot

person_id,51,53,524,529,1110,1283,1396,1537,1858,1950,...,19828,20372,22037,22043,25157,25282,27098,37977,80830,424477
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
batman the dark knight returns part 1,,,,3.9,,,,,,,...,,3.9,,3.9,,,,,,
ben 10,,,7.75,,,,,,,6.2,...,7.75,6.2,,,,,,,,
harry potter 20th anniversary return to hogwarts,,,,,7.9,,,7.9,,,...,,,,,,,,,,
harry potter and the chamber of secrets,,,,,7.55,,,,,,...,,,,,,,,,,
harry potter and the deathly hallows part 1,,,,,7.75,,,,,,...,,,,,,,,,,
harry potter and the deathly hallows part 2,,,,,8.1,,,8.1,,,...,,,,,,,,,,
harry potter and the goblet of fire,,,,,7.75,,,,,,...,,,,,,,,,,
harry potter and the half blood prince,,,,,7.65,,,7.65,,,...,,,,,,,,,,
harry potter and the order of the phoenix,,,,,7.6,,,,,,...,,,,,,,,,,
harry potter and the philosopher s stone,,,,,7.75,,,,,,...,,,,,,,,,,


In [202]:
rating_pivot.fillna(0,inplace=True)

In [203]:
rating_pivot

person_id,51,53,524,529,1110,1283,1396,1537,1858,1950,...,19828,20372,22037,22043,25157,25282,27098,37977,80830,424477
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
batman the dark knight returns part 1,0.0,0.0,0.0,3.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.9,0.0,3.9,0.0,0.0,0.0,0.0,0.0,0.0
ben 10,0.0,0.0,7.75,0.0,0.0,0.0,0.0,0.0,0.0,6.2,...,7.75,6.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter 20th anniversary return to hogwarts,0.0,0.0,0.0,0.0,7.9,0.0,0.0,7.9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the chamber of secrets,0.0,0.0,0.0,0.0,7.55,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the deathly hallows part 1,0.0,0.0,0.0,0.0,7.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the deathly hallows part 2,0.0,0.0,0.0,0.0,8.1,0.0,0.0,8.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the goblet of fire,0.0,0.0,0.0,0.0,7.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the half blood prince,0.0,0.0,0.0,0.0,7.65,0.0,0.0,7.65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the order of the phoenix,0.0,0.0,0.0,0.0,7.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harry potter and the philosopher s stone,0.0,0.0,0.0,0.0,7.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
from scipy.sparse import csr_matrix

In [205]:
movie_sparse=csr_matrix(rating_pivot)

In [206]:
movie_sparse

<27x73 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [207]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=15, n_jobs=-1)
knn.fit(movie_sparse)

In [208]:
def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 5
    movie_list = final_rating[final_rating['title'].str.contains(movie_name)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['id']
        movie_idx = final_rating[final_rating['id'] == movie_idx].index[0]
        distances , indices = knn.kneighbors(movie_sparse[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = final_rating.iloc[val[0]]['id']
            idx = final_rating[final_rating['id'] == movie_idx].index
            recommend_frame.append({'Title':final_rating.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    else:
        return "No movies found. Please check your input its all lower case :)"

In [209]:
get_movie_recommendation('12 years a slave')

'No movies found. Please check your input its all lower case :)'

# weighted average

In [230]:
weight_data=titles.drop(columns=['description','release_year','age_certification','runtime','genres','production_countries','seasons','tags'],axis=1)

In [231]:
weight_data.head()

Unnamed: 0,id,title,type,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,rating
0,tm155702,the wizard of oz,MOVIE,tt0032138,8.1,389774.0,41.442,7.6,7.85
1,tm83648,citizen kane,MOVIE,tt0033467,8.3,433804.0,14.383,8.0,8.15
2,tm77588,casablanca,MOVIE,tt0034583,8.5,558849.0,20.087,8.2,8.35
3,tm82363,the big sleep,MOVIE,tt0038355,7.9,84494.0,12.911,7.7,7.8
4,tm84701,the maltese falcon,MOVIE,tt0033870,8.0,156603.0,12.788,7.8,7.9


In [232]:
from sklearn.preprocessing import MinMaxScaler

In [233]:
scaler=MinMaxScaler()
movie_scaled=scaler.fit_transform(weight_data[['rating','tmdb_popularity']])
movie_normalized=pd.DataFrame(movie_scaled,columns=['norm_rating','norm_popularity'])
movie_normalized.head()

Unnamed: 0,norm_rating,norm_popularity
0,0.826316,0.009306
1,0.857895,0.003222
2,0.878947,0.004505
3,0.821053,0.002891
4,0.831579,0.002864


In [235]:
weight_data=pd.concat([weight_data,movie_normalized],axis=1)

In [236]:
weight_data.head()

Unnamed: 0,id,title,type,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,rating,norm_rating,norm_popularity
0,tm155702,the wizard of oz,MOVIE,tt0032138,8.1,389774.0,41.442,7.6,7.85,0.826316,0.009306
1,tm83648,citizen kane,MOVIE,tt0033467,8.3,433804.0,14.383,8.0,8.15,0.857895,0.003222
2,tm77588,casablanca,MOVIE,tt0034583,8.5,558849.0,20.087,8.2,8.35,0.878947,0.004505
3,tm82363,the big sleep,MOVIE,tt0038355,7.9,84494.0,12.911,7.7,7.8,0.821053,0.002891
4,tm84701,the maltese falcon,MOVIE,tt0033870,8.0,156603.0,12.788,7.8,7.9,0.831579,0.002864


In [237]:
weight_data['score']=weight_data['norm_rating']*0.5+weight_data['norm_popularity']*0.5

In [238]:
weight_data.head()

Unnamed: 0,id,title,type,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,rating,norm_rating,norm_popularity,score
0,tm155702,the wizard of oz,MOVIE,tt0032138,8.1,389774.0,41.442,7.6,7.85,0.826316,0.009306,0.417811
1,tm83648,citizen kane,MOVIE,tt0033467,8.3,433804.0,14.383,8.0,8.15,0.857895,0.003222,0.430559
2,tm77588,casablanca,MOVIE,tt0034583,8.5,558849.0,20.087,8.2,8.35,0.878947,0.004505,0.441726
3,tm82363,the big sleep,MOVIE,tt0038355,7.9,84494.0,12.911,7.7,7.8,0.821053,0.002891,0.411972
4,tm84701,the maltese falcon,MOVIE,tt0033870,8.0,156603.0,12.788,7.8,7.9,0.831579,0.002864,0.417221


In [242]:
weight_data=weight_data.sort_values(['score'],ascending=False)
weight_data.head(20)

Unnamed: 0,id,title,type,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,rating,norm_rating,norm_popularity,score
2974,tm244142,the batman,MOVIE,tt1877830,8.0,456054.0,4447.894,7.8,7.9,0.831579,1.0,0.915789
1706,ts2,game of thrones,SHOW,tt0944947,9.2,1983794.0,492.101,8.4,8.8,0.926316,0.110627,0.518471
2662,tm1082305,euphoria trouble don t last always,MOVIE,tt10636622,9.0,,31.564,10.0,9.5,1.0,0.007085,0.503543
1878,ts20233,rick and morty,SHOW,tt2861424,9.2,473951.0,238.434,8.8,9.0,0.947368,0.053595,0.500482
3139,tm1082045,euphoria f ck anyone who s not a sea blob,MOVIE,tt13608984,8.8,,1.4,10.0,9.4,0.989474,0.000303,0.494889
2529,ts87074,euphoria,SHOW,tt8772296,8.4,156483.0,445.399,8.4,8.4,0.884211,0.100127,0.492169
984,ts15128,crashbox,SHOW,tt0321777,8.5,341.0,6.175,10.0,9.25,0.973684,0.001377,0.487531
1133,ts22225,regular show,SHOW,tt1710308,8.5,40562.0,282.607,8.8,8.65,0.910526,0.063527,0.487026
2740,ts89481,the world between us,SHOW,tt10073114,9.0,2333.0,4.244,9.4,9.2,0.968421,0.000943,0.484682
2534,ts222852,jujutsu kaisen,SHOW,tt12343534,8.6,48949.0,303.199,8.5,8.55,0.9,0.068156,0.484078


In [243]:
weight_data[['title','score']].head(20)

Unnamed: 0,title,score
2974,the batman,0.915789
1706,game of thrones,0.518471
2662,euphoria trouble don t last always,0.503543
1878,rick and morty,0.500482
3139,euphoria f ck anyone who s not a sea blob,0.494889
2529,euphoria,0.492169
984,crashbox,0.487531
1133,regular show,0.487026
2740,the world between us,0.484682
2534,jujutsu kaisen,0.484078


In [244]:
#have to do with correlation