# Import 

In [41]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset
import os
import warnings
import spacy
import pickle

In [8]:
# import warnings
warnings.filterwarnings('ignore')

In [3]:
os.chdir("E:/Internship/")

# Recommendation 

In [42]:
ratings_cleaned = pd.read_csv('./data/ratings_cleaned.csv')
df_exp = pd.read_csv('./data/genre_utility_matrix_with_movies.csv')
movies = pd.read_csv('./data/movies_cleaned_v3.csv')
indices = movies[['id', 'new title']]
indices_overview = pd.Series(movies.index, index= movies['new title'])
nlp = spacy.load("en_core_web_lg")
with open('./model/svd.pkl','rb') as f:
    svd = pickle.load(f)

In [61]:
def recommend_through_genre(uid: int, sample_size: int = 1000, n: int = 10,seed= None, scaler: 'str|None' = 'standard'):
    user_ratings = ratings_cleaned[ratings_cleaned['userId'] == uid]
    dx =  df_exp.loc[df_exp['id'].isin(user_ratings['movieId']),df_exp.columns != 'id'].T
    dx = pd.DataFrame( dx * user_ratings['rating'].values ).T
    transformed_dx = pd.melt(dx, var_name='genre').groupby('genre')['value'].sum()
    if scaler == 'normal':
        scaled = (transformed_dx)/ transformed_dx.sum()
    elif scaler =='standard':
        scaled = (transformed_dx - transformed_dx.mean())/ transformed_dx.std()
    elif scaler =='minmax':
        scaled = (transformed_dx - transformed_dx.min())/ (transformed_dx.max() - transformed_dx.min())
    elif scaler =='None':
        scaled = transformed_dx
    elif scaler == 'sigmoid':
        scaled = 1/(1 + np.exp(-transformed_dx))
    else:
        raise AttributeError("The scaler you mentioned doesnt exist,\
        please enter one of the following: 'standard', 'normal', 'None', 'minmax', sigmoid")
    # print(scaled)
    norm_add_data = pd.DataFrame(scaled)
    other_data = df_exp[~df_exp.isin(user_ratings['movieId'])]
    other_data = other_data.sample(sample_size, random_state=seed)
    other_data.loc[:, other_data.columns != 'id'] = pd.DataFrame(other_data.loc[:, other_data.columns != 'id'] * norm_add_data.values.T)
    # print(other_data)
    other_data['scores'] = other_data.drop('id',axis=1).sum(axis=1)
    prediction = other_data[['id','scores']].reset_index(drop=True)
    titles = indices[indices['id'].isin(prediction['id'])]
    output = pd.merge(titles, prediction, on='id').sort_values('scores',ascending=False).head(n)
    return output
    

In [67]:
recommend_through_genre(uid= 465, sample_size=45000,seed=123)

Unnamed: 0,id,new title,scores
38049,241004,Tintin and the Lake of Sharks (1972),5.339378
24797,136459,Victor and the Secret of Crocodile Mansion (2012),5.335721
5141,110989,Spooky House (2004),5.335721
41959,389272,Lost & Found (2016),5.335721
40478,24886,Minor Details (2009),5.335721
29019,158852,Tomorrowland (2015),5.083381
43140,142802,Гонгофер (1992),4.889554
31912,42515,The Man Called Flintstone (1966),4.794469
23548,185460,Willow Creek (2013),4.776184
4665,31703,Death Ship (1980),4.776184


In [46]:
def similarity(string: str, sample_size = 100, n = 10, seed=None):
    overview1 = movies.iloc[indices_overview.loc[string]]['overview_cleaned']
    doc1 = nlp(overview1)
    # w1 = set(ss for word in overview1 for ss in wordnet.synsets(word))
    samples = indices_overview.sample(sample_size, random_state=seed)
    listin = []
    for i in samples.index:
        overview12 = movies.iloc[indices_overview.loc[i]]['overview_cleaned']
        doc2 = nlp(overview12)
        # w2 = set(ss for word in overview12 for ss in wordnet.synsets(word))
        sim = doc1.similarity(doc2)
        # sim = (wordnet.wup_similarity(s1,s2) for s1,s2, in product(w1,w2))
        listin.append(sim)

    series = pd.Series(listin, index= samples.index, name="similarity")
    series = series.reset_index().sort_values("similarity",ascending=False).head(n)
    return series

In [47]:
similarity("Interstellar (2014)")

Unnamed: 0,new title,similarity
19,Gunsmith Cats: Bulletproof! (1995),0.793768
14,Fail-Safe (1964),0.784266
2,Free Fall (2013),0.763519
86,The Twelve Trees of Christmas (2013),0.75443
11,Mirrored Mind (2006),0.75247
57,Sommersby (1993),0.746525
46,North Shore (1987),0.743443
4,Comet in Moominland (1992),0.743162
73,Torremolinos 73 (2003),0.738289
17,This Earth Is Mine (1959),0.733709


In [48]:
def collaborative(uid,sampling_num=1000, seed=None, n=10):
    sample_data = movies[['id','new title']].sample(n=sampling_num, random_state=seed)
    listin = []
    for id in sample_data['id']:
        sii = svd.predict(uid=uid,iid=id).est
        listin.append(sii)

    sek = pd.Series(listin, index= sample_data['new title'],name='predicted_rating')
    sek = sek.reset_index().sort_values('predicted_rating',ascending=False).head(n)
    return sek

In [58]:
collaborative(6)

Unnamed: 0,new title,predicted_rating
78,Wonder Bar (1934),4.141394
483,Prick Up Your Ears (1987),4.071391
631,The 3 Rs (2011),4.016282
199,Ghost (2012),3.932747
735,The Third Reich: The Rise & Fall (2010),3.919533
317,She's Out of My League (2010),3.901839
37,Farsan (2010),3.8964
191,The Magic Box (1951),3.872782
133,Rain (2001),3.865321
663,Saving Grace (2000),3.839032


In [52]:
def top_n_popular(n:int = 10):
    return movies[['id','new title','popularity']].sort_values('popularity', ascending=False).head(n)

In [59]:
top_n_popular(14)

Unnamed: 0,id,new title,popularity
30671,211672,Minions (2015),547.488298
33326,297762,Wonder Woman (2017),294.337037
42176,321612,Beauty and the Beast (2017),287.253654
43596,339403,Baby Driver (2017),228.032744
24434,177572,Big Hero 6 (2014),213.849907
26541,293660,Deadpool (2016),187.860492
26543,283995,Guardians of the Galaxy Vol. 2 (2017),185.330992
14547,19995,Avatar (2009),185.070892
24330,245891,John Wick (2014),183.870374
23656,210577,Gone Girl (2014),154.801009


In [54]:
def most_critically_acclaimed_movies(n: int = 10):
    return movies[['id','new title','critically_acclaimed']].sort_values('critically_acclaimed', ascending=False).head(n)

In [55]:
most_critically_acclaimed_movies()

Unnamed: 0,id,new title,critically_acclaimed
15476,27205,Inception (2010),7403.997896
12478,155,The Dark Knight (2008),7212.815991
22861,157336,Interstellar (2014),5884.797475
2842,550,Fight Club (1999),5689.594356
314,278,The Shawshank Redemption (1994),5568.287808
292,680,Pulp Fiction (1994),5097.001764
23734,118340,Guardians of the Galaxy (2014),4766.301761
20041,68718,Django Unchained (2012),4678.328033
17811,24428,The Avengers (2012),4613.61015
351,13,Forrest Gump (1994),4523.598001


In [267]:
movies_with_poster = pd.read_csv('./data/movies_cleaned_v5.csv')

In [268]:
movies_with_poster['poster_link']

0        https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1        https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2        https://m.media-amazon.com/images/M/MV5BMDkwYT...
3        https://m.media-amazon.com/images/M/MV5BYzcyMD...
4        https://m.media-amazon.com/images/M/MV5BOTEyNz...
                               ...                        
45407                                                  NaN
45408                                                  NaN
45409                                                  NaN
45410                                                  NaN
45411                                                  NaN
Name: poster_link, Length: 45412, dtype: object

In [269]:
movies_with_poster['age_rating'].value_counts()

age_rating
Not Rated    26131
R            10000
PG-13         3787
PG            3606
G             1320
TV-14          300
TV-PG          187
NC-17           79
TV-13            2
Name: count, dtype: int64

In [270]:
pd.options.display.max_columns = None

In [271]:
# movies_with_poster = pd.read_csv('E:/Internship/data/movies_cleaned_v4.csv')

In [272]:

movies_with_poster['age_rating'].value_counts()

age_rating
Not Rated    26131
R            10000
PG-13         3787
PG            3606
G             1320
TV-14          300
TV-PG          187
NC-17           79
TV-13            2
Name: count, dtype: int64

In [299]:
final_temp = movies_with_poster.drop(
    ['adult','genres','homepage','video',
     'belongs_to_collection','production_companies','poster_path',
     'production_countries','status'], axis = 1)

In [287]:
len(final_temp.columns)


22


NEEDED_COLUMNS = (budget, id, imdb_id, original_language, original_title,
       overview, popularity,poster_link, release_date, revenue, runtime,spoken_languages,tagline, title,
       vote_average, vote_count, year, new_title,
       overview_cleaned, critical_score, age_rating)

In [301]:
final_temp = final_temp.head(1000)
final_temp.rename({'new title':'new_title',
                   'poster_link':'poster_link',
                   'critically_acclaimed':'critical_score',
                   'poster_done':'poster_available','poster_done':'poster_available',
                  'release_year':'year'},axis=1,inplace=True)
final_temp = final_temp[['id','original_title','title','new_title',
                         'imdb_id','overview','overview_cleaned',
                         'budget','revenue','tagline','spoken_languages',
                        'original_language','year','release_date','age_rating',
                        'vote_average','vote_count','critical_score','poster_available','poster_link','popularity','runtime']]
final_temp['runtime'].fillna(0,inplace=True)
final_temp.to_csv('./data/test_movies.csv', index=False, sep='\t',encoding=None)

In [293]:
final_temp.columns

Index(['id', 'original_title', 'title', 'new_title', 'imdb_id', 'overview',
       'overview_cleaned', 'budget', 'revenue', 'tagline', 'spoken_languages',
       'original_language', 'year', 'release_date', 'age_rating',
       'vote_average', 'vote_count', 'critical_score', 'poster_available',
       'poster_link', 'popularity', 'runtime'],
      dtype='object')

In [294]:
final_temp.iloc[635]

IndexError: single positional indexer is out-of-bounds

In [282]:
test = pd.read_csv('./data/test_movies.csv',sep='\t')

In [283]:
test

Unnamed: 0,budget,id,imdb_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,year,new_title,overview_cleaned,critical_score,age_rating,poster_link
0,30000000,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",N/R,Toy Story,7.7,5415.0,1995,Toy Story (1995),"led woody, andi toy live happili room andi bir...",2353.324641,G,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,65000000,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,Jumanji (1995),sibl judi peter discov enchant board game open...,778.136085,PG,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,0,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995,Grumpier Old Men (1995),famili wed reignit ancient feud next-door neig...,26.278206,PG-13,https://m.media-amazon.com/images/M/MV5BMDkwYT...
3,16000000,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995,Waiting to Exhale (1995),"cheat on, mistreat step on, women hold breath,...",8.715714,R,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,0,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995,Father of the Bride Part II (1995),"georg bank recov daughter wedding, receiv news...",40.223204,PG,https://m.media-amazon.com/images/M/MV5BOTEyNz...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,15947,tt0038166,en,The Three Caballeros,For Donald's birthday he receives a box with t...,8.702472,1944-07-21,0.0,71.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",N/R,The Three Caballeros,6.2,108.0,1944,The Three Caballeros (1944),donald birthday receiv box three gift inside. ...,28.413575,G,https://m.media-amazon.com/images/M/MV5BNTYzMm...
996,3000000,9078,tt0057546,en,The Sword in the Stone,Wart is a young boy who aspires to be a knight...,10.636458,1963-12-25,22182353.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Tired of living in a Medieval mess...Merlin us...,The Sword in the Stone,6.9,935.0,1963,The Sword in the Stone (1963),wart young boy aspir knight squire. hunt trip ...,301.515640,G,https://m.media-amazon.com/images/M/MV5BODc2Mj...
997,0,29682,tt0041890,en,So Dear to My Heart,The tale of Jeremiah Kincaid and his quest to ...,0.901443,1948-11-29,0.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",N/R,So Dear to My Heart,6.3,5.0,1948,So Dear to My Heart (1948),tale jeremiah kincaid quest rais champion lamb...,1.350986,G,https://m.media-amazon.com/images/M/MV5BMWFmMD...
998,48000000,8367,tt0102798,en,Robin Hood: Prince of Thieves,When the dastardly Sheriff of Nottingham murde...,10.639244,1991-06-14,390493908.0,143.0,"[{'iso_639_1': 'en', 'name': 'English'}]","For the good of all men, and the love of one w...",Robin Hood: Prince of Thieves,6.6,937.0,1991,Robin Hood: Prince of Thieves (1991),dastard sheriff nottingham murder robin father...,275.507204,PG-13,https://m.media-amazon.com/images/M/MV5BNjUxMz...


In [280]:
test

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,release_year,new title,overview_cleaned,critically_acclaimed,age_rating,poster_done,poster_link
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,N/R,Toy Story,False,7.7,5415.0,1995,Toy Story (1995),"led woody, andi toy live happili room andi bir...",2353.324641,G,True,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,False,N/R,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",N/R,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Jumanji (1995),sibl judi peter discov enchant board game open...,778.136085,PG,True,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",N/R,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,Grumpier Old Men (1995),famili wed reignit ancient feud next-door neig...,26.278206,PG-13,True,https://m.media-amazon.com/images/M/MV5BMDkwYT...
3,False,N/R,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",N/R,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,Waiting to Exhale (1995),"cheat on, mistreat step on, women hold breath,...",8.715714,R,True,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",N/R,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,Father of the Bride Part II (1995),"georg bank recov daughter wedding, receiv news...",40.223204,PG,True,https://m.media-amazon.com/images/M/MV5BOTEyNz...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45407,False,N/R,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,[],"[{'iso_3166_1': 'IR', 'name': 'Iran'}]",2017-06-14,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,2017,Subdue (2017),rise fall man woman.,0.166639,Unrated,False,
45408,False,N/R,0,"[{'id': 18, 'name': 'Drama'}]",N/R,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,N/R,Century of Birthing,False,9.0,3.0,2011,Century of Birthing (2011),artist struggl finish work storylin cult play ...,2.997003,Unrated,False,
45409,False,N/R,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",N/R,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Betrayal (2003),"one hit goe wrong, profession assassin end sui...",0.967586,R,False,
45410,False,N/R,0,[],N/R,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,0.0,87.0,[],Released,N/R,Satan Triumphant,False,0.0,0.0,1917,Satan Triumphant (1917),"small town live two brothers, one minist one h...",0.000000,Unrated,False,


In [278]:
final_temp.shape

(1000, 21)