Učitavanje potrebnih paketa

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings; warnings.simplefilter('ignore')

Učitavamo skupove podataka koje ćemo koristiti

In [2]:
# Uuzeli smo manji skup, ali ako nije dovoljno, mozemo uzeti i veliki skup; sa velikim neće da radi, preveliki je 
ratings = pd.read_csv('input/ratings_small.csv')
# Učitvamo i podatke o filmovima
movies = pd.read_csv('input/movies_metadata.csv')

In [3]:
# Ispisujemo samo da bismo videli šta sve imamo od podataka
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


Sada sređujemo učitane skupove. Izbacujemo one one koji nam ne odgovaraju i kastujemo u odgovarajuće formate

In [5]:
# Ovde vidimo da imamo 3 filma kojima je ID u obliku datuma, pa ćemo ih izbaciti, zbog lakšeg rukovanja
for i in movies.id:
    if "-" in i:
        print(i)

1997-08-20
2012-09-29
2014-01-01


In [6]:
# Izbacujemo ih
movies = movies.drop(movies[movies.id.str.contains("-")].index)

In [7]:
# Kastujemo podatke iz kolone ID u skupu podataka movies u long; isto i za movieId u skupu podataka ratings
movies['id'] = movies['id'].astype('long')
ratings['movieId'] = ratings['movieId'].astype('long')

In [8]:
# Za kolonu genre: želimo da pišu samo imena žanrova
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# zelimo samo godinu, ne ceo datum
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

Pravimo matricu od DataFrame-a ratings

In [9]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Normalizacija podataka ?????

In [10]:
#  Pronaci neki drugi nacin ili se snaci sa StandardScaler
# Za Milosa: Mozda bismo mogli da pitamo Andjelku sta sa ovim????
R = Ratings.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

SVD

In [11]:
# Primenjujemo SVD na normalizovanu matricu Ratings; naci cemo 50 sopstvenih vrednosti i odgovarajucih vektora
U, sigma, Vt = svds(Ratings_demeaned, k = 50)

a

In [12]:
# Pravimo dijagonalnu matricu od sigma vrednosti
sigma = np.diag(sigma)

In [13]:
# Sta smo ovde uradili?
#ovde dobijemo matricu na osnovu koje dovijemo preporuke, tj. neka slicnost izmedju korisnika
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [14]:
all_user_predicted_ratings

array([[-5.42390333e-02,  4.51304289e-02, -4.83461757e-03, ...,
        -6.36490827e-03, -6.09765683e-03, -4.81885918e-03],
       [ 4.19835043e-01,  1.40644018e+00, -1.88807492e-01, ...,
         1.04679741e-03, -1.46825089e-03, -6.57708637e-03],
       [ 1.34561891e+00,  2.66504861e-01, -1.19621144e-02, ...,
         1.06114277e-02,  6.79214329e-03, -6.35703450e-03],
       ...,
       [ 7.28688090e-01, -1.35383733e-01,  1.89809902e-01, ...,
         1.57367435e-03, -3.40631718e-04, -7.00072377e-03],
       [ 1.58186999e+00,  8.45780940e-02, -4.63631293e-02, ...,
         7.31008459e-03,  6.69828780e-03,  4.58389107e-03],
       [ 3.50790455e+00,  3.28823135e-01, -6.74216131e-02, ...,
         1.27775344e-02,  1.10018934e-02, -1.68375103e-02]])

In [15]:
# Pravimo DataFrame od dobijene matrice, to su nam ustv predictions
predictions = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)

Preporucujemo filmove sa najvecim predvidjenim vrednostima koje korisnik nije jos pogledao i to tako sto:
 - prvo iskljucujemo sve filmove koje je korisnik pogledao
 - zatim skup podataka fulmova koje imamo spajamo sa dobijenim sortiranim predvidjanjima prema id-u filma
 - onda preimenujemo dodatu kolonu i sortiramo opadajuce, kako bi filmovi sa najvecim rejtingom bili na vrhu
 - na samom kraju izdvojimo onoliko filmova koliko zelimo, pri cemu hocemo da se ispisu samo odredjene kolone
 
Funkcija vraca podatke o filmovima koje je korisnik vec ocenio i preporuke

In [16]:
def recommend_movies(predictions, userID, movies, ratings, num_recommendations):
    
    user_row_number = userID - 1 # jer krece od 1, a treba nam od 0
    
    # Sortiramo vrednosti datog reda u matrici predictions
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False) 
    
    # Uzimamo podatke o korisniku i spajamo sa info o filmovima
    user_data = ratings[ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'inner', left_on = 'movieId', right_on = 'id').
                     sort_values(['rating'], ascending=False))
    
    # Preporucujemo filmove sa najvecim predvidjenim vrednostima koje korisnik nije jos pogledao
    recommendations = (movies[~movies['id'].isin(user_full['movieId'])].
                       merge(pd.DataFrame(sorted_user_predictions).reset_index(), 
                             how = 'left', left_on = 'id', right_on = 'movieId').
                       rename(columns = {user_row_number: 'predictions'}).
                       sort_values('predictions', ascending = False).
                       iloc[:num_recommendations])
    
    return user_full, recommendations[['title', 'genres', 'vote_average', 'vote_count', 'predictions']]

Testiramo uradjeno: Trazimo 20 preporuka za korisnika 99 prema navedenim skupovima podataka 

In [17]:
already_rated, recommendations = recommend_movies(predictions, 99, movies, ratings, 20)

In [18]:
recommendations

Unnamed: 0,title,genres,vote_average,vote_count,predictions
2080,Young and Innocent,"[Drama, Crime]",6.8,42.0,2.518254
9217,Blood: The Last Vampire,"[Fantasy, Animation, Horror, Comedy, Thriller,...",6.6,76.0,2.428378
10901,Cars,"[Animation, Adventure, Comedy, Family]",6.6,3991.0,2.21114
1313,Batman Returns,"[Action, Fantasy]",6.6,1706.0,2.187512
6814,Battle Royale,"[Drama, Thriller, Adventure]",7.3,992.0,2.103466
5806,The Hours,[Drama],7.0,461.0,1.999539
1226,The Big Sleep,"[Crime, Drama, Mystery, Thriller]",7.6,244.0,1.884463
11486,Lonely Hearts,"[Drama, Thriller, Crime, Romance]",6.0,88.0,1.774396
3217,The Searchers,[Western],7.7,332.0,1.738041
22249,Miffo,"[Comedy, Drama]",4.0,5.0,1.640496


In [19]:
already_rated

Unnamed: 0,userId,movieId,rating,timestamp,adult,belongs_to_collection,budget,genres,homepage,id,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
105,99,5991,5.0,1044754204,False,,0,[Drama],,5991,...,0.0,90.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,,The Last Laugh,False,7.7,63.0,1924
75,99,2067,5.0,938624807,False,,90000000,[Science Fiction],,2067,...,60874615.0,114.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Let There Be Life.,Mission to Mars,False,5.7,374.0,2000
35,99,912,5.0,938587795,False,,4300000,"[Romance, Crime, Thriller, Drama]",,912,...,0.0,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Thomas Crown Affair,False,6.9,95.0,1968
43,99,953,5.0,938587876,False,"{'id': 14740, 'name': 'Madagascar Collection',...",75000000,"[Family, Animation]",,953,...,532680671.0,86.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Someone's got a zoo loose.,Madagascar,False,6.6,3322.0,2005
33,99,903,5.0,938588142,False,,3000000,"[Crime, Drama]",,903,...,16217773.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What we've got here is failure to communicate.,Cool Hand Luke,False,7.7,390.0,1967
90,99,2692,5.0,938549686,False,,0,[Documentary],,2692,...,0.0,90.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,,The Red Elvis,False,8.0,1.0,2007
98,99,4034,5.0,982280553,False,,0,"[Action, Adventure, Comedy]",,4034,...,0.0,110.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,Look What They Call a Quiet Week in Rio!,That Man from Rio,False,7.1,32.0,1964
56,99,1619,5.0,938585094,False,,8500000,"[Action, Crime, Drama, Thriller]",,1619,...,19125401.0,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,The Way of the Gun,False,6.4,106.0,2000
96,99,3160,5.0,947519369,False,,0,"[Horror, Science Fiction]",,3160,...,0.0,93.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Frankenstein Conquers the World,False,5.8,9.0,1965
58,99,1721,5.0,938586999,False,,0,"[Adventure, Action, Comedy]",,1721,...,0.0,106.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,All the Way Boys,False,6.5,71.0,1972


####  Funkcija za Top n filmova :D
- Izvuci cemo info o prosecnoj oceni nekog filma iz skupa podataka movies, kao i o broju ocena
- Uzimamo u obzir samo filmove koji imaju vise glasova od 80% filmova 
- Izracunavamo rejting prema formuli ??? 
- Na kraju sortiramo prema rejtingu i uzimamo prvih n
- Ukoliko se navede poseban zanr, uzima prvih n iz tog zanra

In [20]:

#racunamo rating prema ovoj formuli
def weighted_rating(x, C, m):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [21]:
def top_n_movies(n=250, genre='', percentile=0.8):
    
    # Ukoliko je dat odredjen zanr, uzimamo u obzir samo te filmove
    if genre != '':
        s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
        s.name = 'genre'
        gen_movies = movies.drop('genres', axis=1).join(s)
        top_movies = gen_movies[gen_movies['genre'] == genre]
    else:
        top_movies = movies
    
    #racunamo prosecnu ocenu koju su filmovi dobijali
    vote_counts = top_movies[top_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = top_movies[top_movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    # uzimamo samo one filmove koji imaju vise glasova percentile% filmova u listi
    m = vote_counts.quantile(percentile)
    
    #vidimo samo koji filmovi nas zanimaju(imaju vise od 50 glasova) i uzimamo kolone koje nam trebaju
    qualified = top_movies[(top_movies['vote_count'] >= m) & (top_movies['vote_count'].notnull()) & (top_movies['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    # Racunamo rejting
    qualified['wr'] = weighted_rating(qualified, C, m)
    
    # Sortiramo vrednosti i uzimamo prvih n
    qualified = qualified.sort_values('wr', ascending=False).head(n)
    
    return qualified

Testiramo uradjeno prvo za sve zanrove, a zatim za odredjeni zanr

In [22]:
top_n_movies()

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.735928
15480,Inception,2010,14075,8,29.1081,7.990247
12481,The Dark Knight,2008,12269,8,123.167,7.988818
22879,Interstellar,2014,11187,8,32.2135,7.987741
2843,Fight Club,1999,9678,8,63.8696,7.985839
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.984595
292,Pulp Fiction,1994,8670,8,140.95,7.984202
314,The Shawshank Redemption,1994,8358,8,51.6454,7.983616
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.983355
351,Forrest Gump,1994,8147,8,48.3072,7.983194


Top 250 prema zanru

In [23]:
top_n_movies(20, 'Thriller')

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.973053
12481,The Dark Knight,2008,12269,8,123.167,7.969131
292,Pulp Fiction,1994,8670,8,140.95,7.956516
46,Se7en,1995,5915,8,18.4574,7.936721
24860,The Imitation Game,2014,5895,8,31.5959,7.936511
586,The Silence of the Lambs,1991,4549,8,4.30722,7.918275
11354,The Prestige,2006,4510,8,16.9456,7.917589
289,Leon: The Professional,1994,4293,8,20.4773,7.913552
4099,Memento,2000,4168,8,15.4508,7.911042
1213,The Shining,1980,3890,8,19.6116,7.904901


Metadata Based Recommender

In [24]:
credits = pd.read_csv('input/credits.csv')
keywords = pd.read_csv('input/keywords.csv')

In [25]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [26]:
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [27]:
links_small = pd.read_csv('input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small.shape

(9112,)

In [28]:
smovies = movies[movies['id'].isin(links_small)]
smovies.shape

(9219, 28)

In [29]:
smovies['cast'] = smovies['cast'].apply(literal_eval)
smovies['crew'] = smovies['crew'].apply(literal_eval)
smovies['keywords'] = smovies['keywords'].apply(literal_eval)
smovies['cast_size'] = smovies['cast'].apply(lambda x: len(x))
smovies['crew_size'] = smovies['crew'].apply(lambda x: len(x))

In [30]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [31]:
smovies['director'] = smovies['crew'].apply(get_director)

In [32]:
#uzimamo samo prva tri glumca
smovies['cast'] = smovies['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smovies['cast'] = smovies['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [33]:
smovies['keywords'] = smovies['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [34]:
smovies['cast'] = smovies['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [35]:
smovies['director'] = smovies['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smovies['director'] = smovies['director'].apply(lambda x: [x,x, x])

In [36]:
s = smovies.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [37]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [38]:
s = s[s > 1]

In [39]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [40]:
smovies['keywords'] = smovies['keywords'].apply(filter_keywords)
smovies['keywords'] = smovies['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [41]:
smovies['soup'] = smovies['keywords'] + smovies['cast'] + smovies['director'] + smovies['genres']
smovies['soup'] = smovies['soup'].apply(lambda x: ' '.join(x))

In [42]:

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smovies['soup'])

In [43]:

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [44]:
smovies = smovies.reset_index()
titles = smovies['title']
indices = pd.Series(smovies.index, index=smovies['title'])

In [45]:


def improved_recommendations(title):
    #uzmemo indeks za taj title koji treba
    idx = indices[title]
    #onda pretvorimo to u listu i stavimo u ovaj sim_scores, sortiramo i uzmemo prvih sledecih 26-25 vrednosti
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    #ovo i[0] je valjda id tih filmova sto nam trebaju, ali nisam siguran
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smovies.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id', 'genres']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.50)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = weighted_rating(qualified, C, m)
    qualified = qualified.sort_values('wr', ascending=False)
    return qualified



In [46]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,id,genres,wr
7648,Inception,14075,8,2010,27205,"[Action, Thriller, Science Fiction, Mystery, A...",7.951997
8613,Interstellar,11187,8,2014,157336,"[Adventure, Drama, Science Fiction]",7.940093
6623,The Prestige,4510,8,2006,1124,"[Drama, Mystery, Thriller]",7.859593
3381,Memento,4168,8,2000,77,"[Mystery, Thriller]",7.849215
8031,The Dark Knight Rises,9263,7,2012,49026,"[Action, Crime, Drama, Thriller]",6.975449
6218,Batman Begins,7511,7,2005,272,"[Action, Crime, Drama]",6.970053
7659,Batman: Under the Red Hood,459,7,2010,40662,"[Action, Animation]",6.74
2131,Superman,1042,6,1978,1924,"[Action, Adventure, Fantasy, Science Fiction]",6.146782
4145,Insomnia,1181,6,2002,320,"[Crime, Mystery, Thriller]",6.134341
1134,Batman Returns,1706,6,1992,364,"[Action, Fantasy]",6.101764


In [47]:
improved_recommendations('Mean Girls')

Unnamed: 0,title,vote_count,vote_average,year,id,genres,wr
1547,The Breakfast Club,2189,7,1985,2108,"[Comedy, Drama]",6.808159
390,Dazed and Confused,588,7,1993,9571,"[Comedy, Drama]",6.472123
8883,The DUFF,1372,6,2015,272693,"[Romance, Comedy]",5.907845
3712,The Princess Diaries,1063,6,2001,9880,"[Comedy, Family, Romance]",5.887343
4763,Freaky Friday,919,6,2003,10330,[Comedy],5.874313
6277,Just Like Heaven,595,6,2005,9007,"[Comedy, Fantasy, Romance]",5.830098
6959,The Spiderwick Chronicles,593,6,2008,8204,"[Adventure, Family, Fantasy]",5.829728
6449,Aquamarine,372,5,2006,14191,"[Fantasy, Romance, Family, Comedy]",5.242865
2005,She's All That,425,5,1999,10314,"[Comedy, Romance]",5.225726
7494,American Pie Presents: The Book of Love,454,5,2009,26123,[Comedy],5.217333


In [51]:
def hybrid(userId, title):
    
    #uzmemo indeks za taj title koji treba
    idx = indices[title]
    #onda pretvorimo to u listu i stavimo u ovaj sim_scores, sortiramo i uzmemo prvih sledecih 26-25 vrednosti
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    #ovo i[0] je valjda id tih filmova sto nam trebaju, ali nisam siguran
    movie_indices = [i[0] for i in sim_scores]
    
    recomended_movies = smovies.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id', 'genres']]
    user_full, recomendations = recommend_movies(predictions, userId, recomended_movies, ratings,20)
    return recomendations

In [52]:
hybrid(500, 'Avatar')

Unnamed: 0,title,genres,vote_average,vote_count,predictions
10,The Time Machine,"[Thriller, Adventure, Fantasy, Science Fiction...",7.5,217.0,0.539241
0,Aliens,"[Horror, Action, Thriller, Science Fiction]",7.7,3282.0,-0.021333
13,Hercules in New York,"[Action, Adventure, Comedy, Fantasy, Science F...",3.7,63.0,-0.022014
20,Teenage Mutant Ninja Turtles III,"[Action, Adventure, Comedy, Family, Fantasy, S...",5.1,189.0,-0.092514
1,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",7.7,4274.0,-0.100456
2,The Terminator,"[Action, Thriller, Science Fiction]",7.4,4208.0,-0.105171
3,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",7.1,822.0,
4,Piranha Part Two: The Spawning,[Horror],3.9,41.0,
5,True Lies,"[Action, Thriller]",6.8,1138.0,
6,Star Trek Into Darkness,"[Action, Adventure, Science Fiction]",7.4,4479.0,


In [50]:
hybrid(1, 'Avatar')

Unnamed: 0,title,genres,vote_average,vote_count,predictions
11,The Time Machine,"[Thriller, Adventure, Fantasy, Science Fiction...",7.5,217.0,0.085253
21,Teenage Mutant Ninja Turtles III,"[Action, Adventure, Comedy, Family, Fantasy, S...",5.1,189.0,0.031115
1,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",7.7,4274.0,0.00142
2,The Terminator,"[Action, Thriller, Science Fiction]",7.4,4208.0,0.000731
14,Hercules in New York,"[Action, Adventure, Comedy, Fantasy, Science F...",3.7,63.0,-0.001029
0,Aliens,"[Horror, Action, Thriller, Science Fiction]",7.7,3282.0,-0.005803
6,Titanic,"[Drama, Romance, Thriller]",7.5,7770.0,-0.08264
3,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",7.1,822.0,
4,Piranha Part Two: The Spawning,[Horror],3.9,41.0,
5,True Lies,"[Action, Thriller]",6.8,1138.0,
