In [159]:
import numpy as np
import pandas as pd
import re
import random
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from scipy.spatial.distance import correlation
from scipy.spatial.distance import hamming
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the raw IMDb export (path is relative to the notebook's working directory).
df_imdb = pd.read_csv('data/imdb_new.csv')
# Unused alternative: rename the leftover 'Unnamed: 0' column to movie_id.
# The notebook instead drops that column and derives movie_id from the index in later cells.
#df_imdb = df_imdb.rename(columns={"Unnamed: 0":"movie_id"})
df_imdb.head()

Unnamed: 0.1,Unnamed: 0,title,year,genre,rating,director,actors,votes,url
0,0,The Dark Knight,2008,"Action, Crime, Drama",9.0,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",2222222.0,https://www.imdb.com/title/tt0468569/?ref_=adv...
1,1,The Mountain II,2016,"Action, Drama, War",8.9,Alper Caglar,"Caglar Ertugrul, Ufuk Bayraktar, Ahu T?rkpen?e...",103623.0,https://www.imdb.com/title/tt5813916/?ref_=adv...
2,2,Inception,2010,"Action, Adventure, Sci-Fi",8.8,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...",1978620.0,https://www.imdb.com/title/tt1375666/?ref_=adv...
3,3,The Lord of the Rings: The Fellowship of the Ring,2001,"Action, Adventure, Drama",8.8,Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom, Sean...",1606922.0,https://www.imdb.com/title/tt0120737/?ref_=adv...
4,4,The Matrix,1999,"Action, Sci-Fi",8.7,"Lana Wachowski, Lilly Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",1617768.0,https://www.imdb.com/title/tt0133093/?ref_=adv...


In [3]:
# Count missing vs. present genre values; the genre split further down calls str methods on every row.
df_imdb['genre'].isna().value_counts()

False    5208
Name: genre, dtype: int64

In [4]:
# Count repeated titles (True = the title already appeared in an earlier row).
df_imdb['title'].duplicated().value_counts()

False    2688
True     2520
Name: title, dtype: int64

In [5]:
# Keep one row per title (drop_duplicates keeps the first occurrence by default).
df_imdb = df_imdb.drop_duplicates('title')

In [6]:
# Row/column count after de-duplication.
df_imdb.shape

(2688, 9)

In [7]:
# Renumber rows 0..n-1 after the de-duplication left gaps in the index.
# The old index is moved into an 'index' column, which the next cell drops.
df_imdb = df_imdb.reset_index(0)

In [8]:
# Remove the two leftover index columns: the old index and the CSV's unnamed column.
leftover_index_columns = ['index', 'Unnamed: 0']
df_imdb = df_imdb.drop(columns=leftover_index_columns)

In [9]:
# Use the fresh 0..n-1 index as the movie identifier.
df_imdb['movie_id'] = df_imdb.index

In [10]:
# Preview the cleaned frame, now carrying the movie_id column.
df_imdb.head()

Unnamed: 0,title,year,genre,rating,director,actors,votes,url,movie_id
0,The Dark Knight,2008,"Action, Crime, Drama",9.0,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",2222222.0,https://www.imdb.com/title/tt0468569/?ref_=adv...,0
1,The Mountain II,2016,"Action, Drama, War",8.9,Alper Caglar,"Caglar Ertugrul, Ufuk Bayraktar, Ahu T?rkpen?e...",103623.0,https://www.imdb.com/title/tt5813916/?ref_=adv...,1
2,Inception,2010,"Action, Adventure, Sci-Fi",8.8,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...",1978620.0,https://www.imdb.com/title/tt1375666/?ref_=adv...,2
3,The Lord of the Rings: The Fellowship of the Ring,2001,"Action, Adventure, Drama",8.8,Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom, Sean...",1606922.0,https://www.imdb.com/title/tt0120737/?ref_=adv...,3
4,The Matrix,1999,"Action, Sci-Fi",8.7,"Lana Wachowski, Lilly Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",1617768.0,https://www.imdb.com/title/tt0133093/?ref_=adv...,4


In [11]:
def _to_label_list(text, old, new):
    """Replace ``old`` with ``new`` in ``text``, then split the result on commas."""
    return text.replace(old, new).split(sep=',')


# Turn the comma-separated strings into Python lists.
# Genres lose every space; people's names only lose the space that follows a comma.
df_imdb['genre'] = df_imdb['genre'].apply(_to_label_list, args=(" ", ""))
df_imdb['director'] = df_imdb['director'].apply(_to_label_list, args=(", ", ","))
df_imdb['actors'] = df_imdb['actors'].apply(_to_label_list, args=(", ", ","))

In [12]:
mlb = MultiLabelBinarizer()


def _multi_hot(column_name):
    """Multi-hot encode one list-valued column of df_imdb, with one output column per label."""
    encoded = mlb.fit_transform(df_imdb[column_name])
    return pd.DataFrame(encoded, columns=mlb.classes_)


# The binarizer is re-fitted for each column in this order, so it ends up fitted on 'actors'.
df_imdb_genres, df_imdb_director, df_imdb_actors = (
    _multi_hot(column_name) for column_name in ('genre', 'director', 'actors')
)

In [13]:
# Drop every person column whose name contains "?" (a garbled character in the source data).
garbled_directors = [name for name in df_imdb_director.columns if "?" in name]
garbled_actors = [name for name in df_imdb_actors.columns if "?" in name]
df_imdb_director = df_imdb_director.drop(columns=garbled_directors)
df_imdb_actors = df_imdb_actors.drop(columns=garbled_actors)

In [14]:
# Size of the director and actor encodings once the garbled names are removed.
print(df_imdb_director.shape, df_imdb_actors.shape, sep="\n")

(2688, 1380)
(2688, 5091)


In [15]:
# Put the numeric columns and the three multi-hot encodings side by side, aligned on the row index.
feature_blocks = [
    df_imdb[['rating', 'votes']],
    df_imdb_genres,
    df_imdb_director,
    df_imdb_actors,
]
df_imdb_for_normalize = pd.concat(feature_blocks, axis="columns")

In [16]:
# Show the assembled (very wide) feature matrix; pandas truncates the display.
df_imdb_for_normalize

Unnamed: 0,rating,votes,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Zero Mostel,Ziyi Zhang,Zlatko Buric,Zoe Kazan,Zoe Margaret Colletti,Zoe Saldana,Zooey Deschanel,Zorion Eguileor,Zulay Henao,Zuleikha Robinson
0,9.0,2222222.0,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,8.9,103623.0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.8,1978620.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8.8,1606922.0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,8.7,1617768.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,5.7,33496.0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2684,5.6,122011.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2685,4.9,150209.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2686,4.8,42316.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Fill the NaN values in rating and votes with the column mean.
for numeric_column in ('rating', 'votes'):
    column_mean = df_imdb_for_normalize[numeric_column].mean()
    df_imdb_for_normalize[numeric_column] = df_imdb_for_normalize[numeric_column].fillna(column_mean)

In [18]:
# Normalize each column (axis=0) and keep the original labels.
# The index of df_imdb__normalized is therefore the same as movie_id.
df_imdb__normalized = pd.DataFrame(
    normalize(df_imdb_for_normalize, axis=0),
    index=df_imdb_for_normalize.index,
    columns=df_imdb_for_normalize.columns,
)
df_imdb__normalized.head()

Unnamed: 0,rating,votes,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Zero Mostel,Ziyi Zhang,Zlatko Buric,Zoe Kazan,Zoe Margaret Colletti,Zoe Saldana,Zooey Deschanel,Zorion Eguileor,Zulay Henao,Zuleikha Robinson
0,0.023688,0.151814,0.040859,0.0,0.0,0.0,0.0,0.045222,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.023424,0.007079,0.040859,0.0,0.0,0.0,0.0,0.0,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.023161,0.135172,0.040859,0.039841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.023161,0.109779,0.040859,0.039841,0.0,0.0,0.0,0.0,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.022898,0.11052,0.040859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
def _title_matches(titles, movie_name):
    """Return a boolean Series marking the titles that contain ``movie_name`` (case-insensitive)."""
    query = "{}".format(movie_name)
    try:
        pattern = re.compile(query, flags=re.IGNORECASE)
    except re.error:
        # Free text such as "Alien (" is not a valid regex: match it literally instead.
        pattern = re.compile(re.escape(query), flags=re.IGNORECASE)
    # Non-string titles (e.g. NaN) can never match.
    return titles.apply(
        lambda title: isinstance(title, str) and pattern.search(title) is not None
    ).astype(bool)


def _suggest_popular(movies, pool_size=20):
    """Pick a random title/url among the ``pool_size`` best-rated movies."""
    top_rated = movies.sort_values("rating", ascending=False).head(pool_size)
    if top_rated.empty:
        raise ValueError("no movies available to suggest")
    # Draw from the real pool size so a table with fewer than pool_size rows cannot overflow.
    pick = top_rated.iloc[random.randrange(len(top_rated))]
    return {"title": pick["title"], "url": pick["url"]}


def find_similar_movie(distance_method, movie_name, N, movies=None, features=None):
    """Recommend one movie that is close to ``movie_name`` in feature space.

    The movie is looked up by a case-insensitive search of ``movie_name`` in
    the titles. ``movie_name`` is treated as a regular expression when it is a
    valid one, and as literal text otherwise. When several titles match, the
    first match in table order is used. The ``N`` nearest other movies are
    then found with ``distance_method`` and one of them is returned at random.

    Parameters
    ----------
    distance_method : callable
        ``distance_method(u, v) -> number``; called with two feature rows
        (pandas Series), the reference movie first.
    movie_name : str
        Title, or part of a title, typed by the user.
    N : int
        Size of the nearest-neighbour pool to draw from (at least 1).
    movies : pandas.DataFrame, optional
        Table with ``title``, ``rating``, ``url`` and ``movie_id`` columns.
        Defaults to the notebook-level ``df_imdb``.
    features : pandas.DataFrame, optional
        Feature matrix whose index holds the ``movie_id`` values.
        Defaults to the notebook-level ``df_imdb__normalized``.

    Returns
    -------
    dict
        ``{"title": ..., "url": ...}`` for the recommended movie. When no
        title matches (or the name is empty), a random pick among the 20
        best-rated movies is returned instead.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1, or there is nothing to suggest or compare.
    """
    if movies is None:
        movies = df_imdb
    if features is None:
        features = df_imdb__normalized

    query = "" if movie_name is None else "{}".format(movie_name)
    if not query.strip():
        # An empty name matches every title and so identifies nothing.
        return _suggest_popular(movies)

    matched = movies[_title_matches(movies["title"], query)]
    if matched.empty:
        # Suggest a random popular movie if the name cannot be identified.
        return _suggest_popular(movies)

    # Positional access: the first match, whatever its row label is.
    movie_id = matched["movie_id"].iloc[0]

    if N < 1:
        raise ValueError("N must be at least 1, got {!r}".format(N))

    target = features.loc[movie_id]  # looked up once, not once per candidate
    candidate_ids = [other_id for other_id in features.index if other_id != movie_id]
    if not candidate_ids:
        raise ValueError("no other movies to compare against")

    distances = pd.Series(
        [distance_method(target, features.loc[other_id]) for other_id in candidate_ids],
        index=candidate_ids,
    )
    # A stable sort keeps ties in table order, so the pool is deterministic.
    nearest_ids = distances.sort_values(kind="mergesort").head(N).index

    recommend_movie_id = nearest_ids[random.randrange(len(nearest_ids))]
    recommended = movies[movies["movie_id"] == recommend_movie_id]

    return {"title": recommended["title"].item(), "url": recommended["url"].item()}

In [166]:
# Example: recommend one of the 5 nearest movies (Euclidean distance) to a known title.
find_similar_movie(euclidean,"The Dark Knight", 5)

{'title': 'ARIF V 216',
 'url': 'https://www.imdb.com/title/tt6697582/?ref_=adv_li_i'}

In [172]:
# Example: a title that matches nothing exercises the random popular-movie fallback.
find_similar_movie(euclidean,"The secx sdhadoa", 5)

{'title': 'The Shawshank Redemption',
 'url': 'https://www.imdb.com/title/tt0111161/?ref_=adv_li_i'}

In [178]:
# Export the feature matrix without its index column.
# NOTE(review): the index doubles as movie_id; a reload only recovers it because it is 0..n-1. Confirm consumers rely on that.
df_imdb__normalized.to_csv("df_imdb__normalized.csv",index=False)

In [179]:
# Export the movie table.
# The list-valued columns (genre, director, actors) are written as their string form,
# so they read back as text such as "['Action', 'Crime', 'Drama']" (see the reload cell below).
df_imdb.to_csv("df_imdb.csv",index=False)

In [180]:
# Sanity check: read the exported movie table back and display it.
df = pd.read_csv('df_imdb.csv')
df

Unnamed: 0,title,year,genre,rating,director,actors,votes,url,movie_id
0,The Dark Knight,2008,"['Action', 'Crime', 'Drama']",9.0,['Christopher Nolan'],"['Christian Bale', 'Heath Ledger', 'Aaron Eckh...",2222222.0,https://www.imdb.com/title/tt0468569/?ref_=adv...,0
1,The Mountain II,2016,"['Action', 'Drama', 'War']",8.9,['Alper Caglar'],"['Caglar Ertugrul', 'Ufuk Bayraktar', 'Ahu T?r...",103623.0,https://www.imdb.com/title/tt5813916/?ref_=adv...,1
2,Inception,2010,"['Action', 'Adventure', 'Sci-Fi']",8.8,['Christopher Nolan'],"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",1978620.0,https://www.imdb.com/title/tt1375666/?ref_=adv...,2
3,The Lord of the Rings: The Fellowship of the Ring,2001,"['Action', 'Adventure', 'Drama']",8.8,['Peter Jackson'],"['Elijah Wood', 'Ian McKellen', 'Orlando Bloom...",1606922.0,https://www.imdb.com/title/tt0120737/?ref_=adv...,3
4,The Matrix,1999,"['Action', 'Sci-Fi']",8.7,"['Lana Wachowski', 'Lilly Wachowski']","['Keanu Reeves', 'Laurence Fishburne', 'Carrie...",1617768.0,https://www.imdb.com/title/tt0133093/?ref_=adv...,4
...,...,...,...,...,...,...,...,...,...
2683,Bandidas,2006,"['Action', 'Comedy', 'Crime']",5.7,"['Joachim R?nning', 'Espen Sandberg']","['Pen?lope Cruz', 'Salma Hayek', 'Steve Zahn',...",33496.0,https://www.imdb.com/title/tt0416496/?ref_=adv...,2683
2684,The Dark Tower,2017,"['Action', 'Adventure', 'Fantasy']",5.6,['Nikolaj Arcel'],"['Idris Elba', 'Matthew McConaughey', 'Tom Tay...",122011.0,https://www.imdb.com/title/tt1648190/?ref_=adv...,2684
2685,Wild Wild West,1999,"['Action', 'Comedy', 'Sci-Fi']",4.9,['Barry Sonnenfeld'],"['Will Smith', 'Kevin Kline', 'Kenneth Branagh...",150209.0,https://www.imdb.com/title/tt0120891/?ref_=adv...,2685
2686,The Ridiculous 6,2015,"['Comedy', 'Western']",4.8,['Frank Coraci'],"['Adam Sandler', 'Terry Crews', 'Jorge Garcia'...",42316.0,https://www.imdb.com/title/tt2479478/?ref_=adv...,2686


In [181]:
# Sanity check: read the exported feature matrix back and display it.
df2 = pd.read_csv('df_imdb__normalized.csv')
df2

Unnamed: 0,rating,votes,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Zero Mostel,Ziyi Zhang,Zlatko Buric,Zoe Kazan,Zoe Margaret Colletti,Zoe Saldana,Zooey Deschanel,Zorion Eguileor,Zulay Henao,Zuleikha Robinson
0,0.023688,0.151814,0.040859,0.000000,0.0,0.0,0.000000,0.045222,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.023424,0.007079,0.040859,0.000000,0.0,0.0,0.000000,0.000000,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.023161,0.135172,0.040859,0.039841,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.023161,0.109779,0.040859,0.039841,0.0,0.0,0.000000,0.000000,0.024868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.022898,0.110520,0.040859,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,0.015002,0.002288,0.040859,0.000000,0.0,0.0,0.035601,0.045222,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2684,0.014739,0.008335,0.040859,0.039841,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2685,0.012897,0.010262,0.040859,0.000000,0.0,0.0,0.035601,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2686,0.012633,0.002891,0.000000,0.000000,0.0,0.0,0.035601,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
