In [121]:
import pandas as pd
import numpy as np

In [122]:
# Build the dataset with everything we need. The small ratings file is used
# here; if it turns out not to be enough, the large one can be used instead.
ratings_csv_path = 'input/ratings_small.csv'
ratings = pd.read_csv(ratings_csv_path)

In [123]:
# Load the movie metadata, keeping only the id, title and genres columns.
movie_columns = ['id', 'title', 'genres']
movies = pd.read_csv('input/movies_metadata.csv', usecols=movie_columns)
movies.head()

Unnamed: 0,genres,id,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II


In [124]:
# Print every id that contains a dash: those values are dates, not numeric ids.
dashed_ids = [movie_id for movie_id in movies['id'] if "-" in movie_id]
for movie_id in dashed_ids:
    print(movie_id)

1997-08-20
2012-09-29
2014-01-01


In [125]:
# Drop the rows (not columns) whose id contains a dash, i.e. is in date form.
has_dash = movies['id'].str.contains("-")
rows_to_drop = movies.loc[has_dash].index
movies = movies.drop(rows_to_drop)

In [126]:
# Sanity check: after the drop, no id should contain a dash (prints nothing).
remaining_dashed_ids = [movie_id for movie_id in movies['id'] if "-" in movie_id]
for movie_id in remaining_dashed_ids:
    print(movie_id)

In [127]:
# Cast the ids to integers.
# 'int64' is used instead of 'long': 'long' maps to the C long type, whose
# width depends on the platform (32-bit on Windows), whereas 'int64' is
# explicit and identical everywhere.
movies['id'] = movies['id'].astype('int64')

In [128]:
# Do the same kind of clean-up for genres: only the genre names matter here.
from ast import literal_eval


def _genre_names(parsed):
    """Return the 'name' of each genre dict in `parsed`, or [] if it is not a list."""
    if not isinstance(parsed, list):
        return []
    return [genre['name'] for genre in parsed]


# Missing values become the literal '[]' so that literal_eval yields an empty list.
parsed_genres = movies['genres'].fillna('[]').apply(literal_eval)
movies['genres'] = parsed_genres.apply(_genre_names)
movies.head()

Unnamed: 0,genres,id,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji
2,"[Romance, Comedy]",15602,Grumpier Old Men
3,"[Comedy, Drama, Romance]",31357,Waiting to Exhale
4,[Comedy],11862,Father of the Bride Part II


In [129]:
# Split the data set by genre: one row per (movie, genre) pair.
# It is not yet known whether collaborative filtering will need this.
genre_columns = movies.apply(lambda row: pd.Series(row['genres']), axis=1)
# Stack the per-movie genre columns into one long Series keyed by the movie's
# row label, discarding the positional genre level.
s = genre_columns.stack().reset_index(level=1, drop=True).rename('genre')
movies_without_genres = movies.drop('genres', axis=1)
gen_md = movies_without_genres.join(s)


In [130]:
# Convert movieId to an integer dtype, in case some values were read as objects.
# 'int64' is used instead of 'long': 'long' maps to the C long type, whose
# width depends on the platform (32-bit on Windows), whereas 'int64' is
# explicit and identical everywhere.
ratings['movieId'] = ratings['movieId'].astype('int64')

In [131]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [132]:
# Build the user-by-movie matrix: userId becomes the row index and every
# movieId gets its own column; movies a user has not rated are filled with 0.
rating_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
Ratings = rating_matrix.fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
# Normalise the data by subtracting each user's mean rating.
# TODO: find a better approach, or make do with StandardScaler.
# NOTE: the mean is taken over every column of the matrix, so the entries that
# were filled with 0 for unrated movies are included in it.

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array and is available in every pandas version.
R = Ratings.values
user_ratings_mean = np.mean(R, axis=1)
# reshape(-1, 1) turns the per-user means into a column so they broadcast row-wise.
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [134]:
# Now the SVD: factorise the de-meaned matrix, keeping 50 singular values.
from scipy.sparse.linalg import svds
n_latent_factors = 50
U, sigma, Vt = svds(Ratings_demeaned, k=n_latent_factors)


In [135]:
# Turn the 1-D vector of singular values into a diagonal matrix so it can be
# used in the matrix products below.
# The guard makes the cell safe to re-run: np.diag applied to a 2-D array
# extracts its diagonal, which would turn sigma back into a vector.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [136]:
# Rebuild the rating matrix from the SVD factors, then add each user's mean
# back (as a column vector so it broadcasts across that user's row).
low_rank_ratings = U.dot(sigma).dot(Vt)
all_user_predicted_ratings = low_rank_ratings + user_ratings_mean.reshape(-1, 1)

In [137]:
# Display the reconstructed user-by-movie prediction matrix.
all_user_predicted_ratings

array([[-5.42390333e-02,  4.51304289e-02, -4.83461757e-03, ...,
        -6.36490827e-03, -6.09765683e-03, -4.81885918e-03],
       [ 4.19835043e-01,  1.40644018e+00, -1.88807492e-01, ...,
         1.04679741e-03, -1.46825089e-03, -6.57708637e-03],
       [ 1.34561891e+00,  2.66504861e-01, -1.19621144e-02, ...,
         1.06114277e-02,  6.79214329e-03, -6.35703450e-03],
       ...,
       [ 7.28688090e-01, -1.35383733e-01,  1.89809902e-01, ...,
         1.57367435e-03, -3.40631718e-04, -7.00072377e-03],
       [ 1.58186999e+00,  8.45780940e-02, -4.63631293e-02, ...,
         7.31008459e-03,  6.69828780e-03,  4.58389107e-03],
       [ 3.50790455e+00,  3.28823135e-01, -6.74216131e-02, ...,
         1.27775344e-02,  1.10018934e-02, -1.68375103e-02]])

In [138]:
# Wrap the prediction matrix in a DataFrame; these are effectively our predictions.
# Columns carry the movie ids of the rating matrix; the rows keep the default
# 0-based positional index.
movie_id_columns = Ratings.columns
preds = pd.DataFrame(all_user_predicted_ratings, columns=movie_id_columns)


In [145]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return a user's rated movies and the best-predicted movies they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user and one column per movie id.
        Rows are addressed by position: row ``userID - 1`` belongs to ``userID``.
    userID : int
        1-based user id.
    movies : pandas.DataFrame
        Movie metadata with an ``id`` column.
    original_ratings : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full``: the user's ratings inner-joined with ``movies``
        (``movieId == id``), sorted by rating in descending order.
        ``recommendations``: the first and third columns of ``movies``
        (``genres`` and ``title`` for the frame built in this notebook) for the
        movies absent from ``user_full``, ordered by predicted rating, highest
        first; movies without a prediction sort last.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position of ``predictions``.

    Notes
    -----
    NOTE(review): both joins assume ``original_ratings.movieId`` and
    ``movies.id`` use the same id space -- confirm; if the two files come from
    different catalogues a link table is needed first.
    """
    user_row_number = userID - 1  # user ids start at 1, row positions at 0
    if not 0 <= user_row_number < len(predictions):
        # Without this check a userID <= 0 becomes a negative position and
        # silently selects another user's row.
        raise IndexError('userID %r has no row in predictions (%d rows)'
                         % (userID, len(predictions)))

    # Read the row from the `predictions` argument rather than from a notebook
    # global, and name the columns explicitly so the result does not depend on
    # the row label or on the name of the columns index.
    user_row = predictions.iloc[user_row_number]
    user_predictions = pd.DataFrame(
        {'movieId': user_row.index, 'Predictions': user_row.values},
        columns=['movieId', 'Predictions'],
    )

    # Take the user's ratings and attach the movie information to them.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data
                 .merge(movies, how='inner', left_on='movieId', right_on='id')
                 .sort_values(['rating'], ascending=False))

    # Recommend the movies with the highest predicted rating that the user has
    # not rated yet. The final iloc keeps the first and third columns only.
    unseen_movies = movies[~movies['id'].isin(user_full['movieId'])]
    recommendations = (unseen_movies
                       .merge(user_predictions, how='left',
                              left_on='id', right_on='movieId')
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations, 0:3:2])

    return user_full, recommendations

In [146]:
# Ask for the top 20 recommendations for user 152.
target_user_id = 152
top_n = 20
already_rated, predictions = recommend_movies(preds, target_user_id, movies, ratings, top_n)

In [147]:
# Display the recommendations returned for the user.
predictions

Unnamed: 0,genres,title
6102,"[Crime, Drama, Thriller]",The Good Thief
532,"[Comedy, Drama, Romance]",Sleepless in Seattle
6870,"[Drama, Fantasy, Romance]",Beauty and the Beast
1320,"[Action, Fantasy]",Batman Returns
23960,[Documentary],The Red Elvis
3038,"[Thriller, Crime, Drama]",The Talented Mr. Ripley
1392,"[Drama, Comedy, Romance]",Fools Rush In
43277,[Foreign],Boat
1490,"[Comedy, Romance]",My Best Friend's Wedding
1889,"[Adventure, Comedy, Family, Science Fiction]",Back to the Future Part II


In [149]:
# Display the user's existing ratings joined with the movie metadata.
already_rated

Unnamed: 0,userId,movieId,rating,timestamp,genres,id,title
44,152,1653,5.0,1335947506,[Drama],1653,The Motorcycle Diaries
71,152,31410,5.0,1335900450,[Comedy],31410,My Sweet Little Village
51,152,2028,5.0,1335948073,"[Comedy, Drama, Romance]",2028,Say Anything...
15,152,296,4.5,1335947920,"[Action, Thriller, Science Fiction]",296,Terminator 3: Rise of the Machines
86,152,68954,4.5,1335900872,"[TV Movie, Drama, History]",68954,Longitude
84,152,58559,4.5,1335900808,[Drama],58559,Confession of a Child of the Century
82,152,49530,4.5,1335900652,"[Action, Thriller, Science Fiction]",49530,In Time
79,152,44555,4.5,1335900817,"[Comedy, Drama, Thriller]",44555,"A Woman, a Gun and a Noodle Shop"
65,152,6934,4.5,1335949597,[Drama],6934,Yesterday
60,152,4973,4.5,1335901843,"[Drama, Mystery]",4973,Under the Sand
