In [54]:
import pandas as pd

# Više izlaznih linija
from IPython.core.interactiveshell import InteractiveShell
from typing import List
InteractiveShell.ast_node_interactivity = "all"

movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head()

ratings = pd.read_csv('ml-latest-small/ratings.csv',usecols=['userId','movieId','rating'])
ratings.head()

tags = pd.read_csv('ml-latest-small/tags.csv',usecols=['movieId','tag'])
tags.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA


Računanje srednje ocjene i broja ocjena za svaki film i njihov upis u dataset.

In [55]:
movie_rating_temp_df = pd.merge(ratings,movies,how='inner',on='movieId')[['movieId','rating']]

avg_rating_df = movie_rating_temp_df.groupby('movieId', as_index=False).mean().rename(columns={'rating': 'avgRating'})

movie_rating_count = movie_rating_temp_df.groupby('movieId', as_index=False).count().rename(columns={'rating': 'ratingCount'})

movies = pd.merge(pd.merge(movies,avg_rating_df,how='inner',on='movieId'),movie_rating_count,how='inner',on='movieId')

movies.head()

Unnamed: 0,movieId,title,genres,avgRating,ratingCount
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49


In [56]:
mean_rating = ratings['rating'].mean()
print(mean_rating)

low_vote_number = movies['ratingCount'].quantile(0.90)
print(low_vote_number)

3.501556983616962
27.0


Računanje filmova koji su su u top10% po broju ocjena.

In [57]:
q_movies = movies.copy().loc[movies['ratingCount']>=low_vote_number]
q_movies.shape

movies.shape

(976, 5)

(9724, 5)

In [58]:
def weighed_rating(x, m=low_vote_number, C=mean_rating):
    v = x['ratingCount']
    R = x['avgRating']

    return (v/(v+m) * R) + (m/(m+v) * C)

q_movies['score'] = q_movies.apply(weighed_rating,axis=1)

In [59]:
q_movies = q_movies.sort_values(by='score',ascending=False)

q_movies[['title', 'avgRating', 'ratingCount', 'score']].head(20)

Unnamed: 0,title,avgRating,ratingCount,score
277,"Shawshank Redemption, The (1994)",4.429022,317,4.356227
659,"Godfather, The (1972)",4.289062,192,4.191973
2224,Fight Club (1999),4.272936,218,4.187927
224,Star Wars: Episode IV - A New Hope (1977),4.231076,251,4.160223
46,"Usual Suspects, The (1995)",4.237745,204,4.151697
461,Schindler's List (1993),4.225,220,4.145919
257,Pulp Fiction (1994),4.197068,307,4.140844
897,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211,4.13463
1938,"Matrix, The (1999)",4.192446,278,4.131285
921,"Godfather: Part II, The (1974)",4.25969,129,4.128475


In [60]:
tags['tag'] = tags['tag'].apply(lambda x: str(x).replace(" ","").lower())

movies = pd.merge(tags.groupby(['movieId'], as_index=False)['tag'].apply(lambda x: " ".join(x)), movies,how='inner',on='movieId')

In [61]:
def createSoup(x):
    infoSet = set(x['tag'].split(' '))

    data = ' '.join(infoSet)

    for genre in x['genres'].split('|'):
        data+=(' ' + genre.lower())
    return data

movies['soup'] = movies.apply(createSoup,axis=1)

movies.head()

Unnamed: 0,movieId,tag,title,genres,avgRating,ratingCount,soup
0,1,pixar pixar fun,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215,fun pixar adventure animation children comedy ...
1,2,fantasy magicboardgame robinwilliams game,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110,game fantasy magicboardgame robinwilliams adve...
2,3,moldy old,Grumpier Old Men (1995),Comedy|Romance,3.259615,52,old moldy comedy romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy,3.071429,49,pregnancy remake comedy
4,7,remake,Sabrina (1995),Comedy|Romance,3.185185,54,remake comedy romance


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

movies['soup'] = movies['soup'].fillna('')

tfidf_matrix = tfidf.fit_transform(movies['soup'])

tfidf_matrix.shape

tfidf.get_feature_names()[1000:1020]

(1554, 1498)

['pageant',
 'painter',
 'palahnuik',
 'palmed',
 'paranoia',
 'paranoid',
 'parenthood',
 'paris',
 'parody',
 'parrots',
 'passion',
 'paulgiamatti',
 'paulrudd',
 'peacecorp',
 'pearlsbuck',
 'peeweeherman',
 'personalsads',
 'petawilson',
 'peterpan',
 'philipk']

In [63]:
from sklearn.metrics.pairwise import linear_kernel

cosin_sin = linear_kernel(tfidf_matrix,tfidf_matrix)

cosin_sin.shape

cosin_sin[1]

(1554, 1554)

array([0.26144231, 1.        , 0.        , ..., 0.        , 0.11686964,
       0.        ])

In [64]:
indices = pd.Series(movies.index, index = movies['title']).drop_duplicates()

indices[:10]

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Father of the Bride Part II (1995)    3
Sabrina (1995)                        4
American President, The (1995)        5
Nixon (1995)                          6
Casino (1995)                         7
Sense and Sensibility (1995)          8
Get Shorty (1995)                     9
dtype: int64

In [65]:
def get_recommendations(title, cosine_sim=cosin_sin):

    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

get_recommendations('Toy Story (1995)')

543                                  Bug's Life, A (1998)
1268           Cat Returns, The (Neko no ongaeshi) (2002)
664                                    Toy Story 2 (1999)
1427                                        Sintel (2010)
1506                     Guardians of the Galaxy 2 (2017)
1261    Kiki's Delivery Service (Majo no takkyûbin) (1...
1263    Porco Rosso (Crimson Pig) (Kurenai no buta) (1...
248                            Alice in Wonderland (1951)
1027              Sinbad: Legend of the Seven Seas (2003)
641                       Who Framed Roger Rabbit? (1988)
Name: title, dtype: object