In [1]:
import pandas as pd
import numpy as np
import chardet
import math
import random
import datetime
import sys
from functools import partial
import time

# Raise NumPy's summarisation threshold to sys.maxsize so that printed arrays
# are shown in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)

# Read the tab-separated movie file into `df`.
df = pd.read_csv('Podatki/movies.dat', sep='\t', encoding='ISO-8859-1')

In [2]:
class UserItemData:
    """User/movie ratings loaded from a tab-separated file and filtered on load.

    Parameters
    ----------
    path : str
        File to read. It must contain the columns ``userID``, ``movieID``,
        ``rating``, ``date_day``, ``date_month`` and ``date_year``.
    from_date, to_date : str
        Day-first date strings. A rating is kept when
        ``from_date <= date < to_date``.
    min_ratings : int
        When greater than 0, movies with fewer than ``min_ratings`` ratings
        (counted after the date filter) are dropped.

    Attributes
    ----------
    data : pandas.DataFrame
        The filtered ratings with columns ``userID``, ``movieID``, ``rating``,
        ``day``, ``month``, ``year`` and the assembled ``date``.
    """

    def __init__(self, path, from_date='1-1-1900', to_date='1-1-2030', min_ratings=0):
        self.path = path
        self.from_date = from_date
        self.to_date = to_date
        self.min_ratings = min_ratings

        ratings = pd.read_csv(
            path,
            sep='\t',
            encoding='ISO-8859-1',
            usecols=["userID", "movieID", "rating", "date_day", "date_month", "date_year"],
        )
        # pd.to_datetime assembles a date from columns named day/month/year.
        ratings = ratings.rename(
            columns={'date_day': 'day', 'date_month': 'month', 'date_year': 'year'}
        )
        ratings['date'] = pd.to_datetime(ratings[['day', 'month', 'year']], dayfirst=True)

        # Half-open window: the lower bound is inclusive, the upper exclusive.
        lower = pd.to_datetime(from_date, dayfirst=True)
        upper = pd.to_datetime(to_date, dayfirst=True)
        ratings = ratings.loc[(ratings['date'] >= lower) & (ratings['date'] < upper)]

        if min_ratings > 0:
            per_movie = ratings['movieID'].value_counts()
            too_rare = per_movie[per_movie < min_ratings].index
            ratings = ratings[~ratings['movieID'].isin(too_rare)]

        self.data = ratings

    def nratings(self):
        """Return the number of ratings that survived the filters."""
        # The previous implementation called ``self.data.index()``; an Index
        # object is not callable, so the method always raised TypeError.
        return len(self.data)

In [3]:
# All ratings: default date window, no minimum number of ratings per movie.
uim = UserItemData('Podatki/user_ratedmovies.dat', min_ratings=0)
print(uim.data.shape[0])

# Ratings dated from 12 Jan 2007 up to (but excluding) 16 Feb 2008, restricted
# to movies that have at least 100 ratings inside that window.
uim = UserItemData(
    'Podatki/user_ratedmovies.dat',
    from_date='12.1.2007',
    to_date='16.2.2008',
    min_ratings=100,
)
print(uim.data.shape[0])


855598
73584


In [4]:
class MovieData:
    """Movie table read from a tab-separated file.

    ``path`` is remembered on the instance and the parsed table is exposed as
    ``data``. ``get_title`` relies on the columns ``id`` and ``title``.
    """

    def __init__(self, path):
        self.path = path
        self.data = pd.read_csv(path, sep='\t', encoding='ISO-8859-1')

    def get_title(self, mid):
        """Return the title of the first row whose ``id`` equals ``mid``.

        Raises ``IndexError`` (from ``.iloc[0]``) when no row matches.
        """
        matching_rows = self.data[self.data['id'] == mid]
        return matching_rows['title'].iloc[0]

In [5]:
# Load the movie table and print the title stored for movie id 1.
md = MovieData('Podatki/movies.dat')
print(md.get_title(1))

Toy story


In [6]:
class RandomPredictor:
    """Baseline predictor that assigns every movie a random integer score.

    Parameters
    ----------
    mino, maxo : int
        Lower and upper bound of the generated scores; both ends are
        inclusive because the scores come from ``random.randint``.
    """

    def __init__(self, mino, maxo):
        self.mino = mino
        self.maxo = maxo
        # Filled in by fit(); None until then.
        self.user_data = None

    def fit(self, x):
        """Remember the ratings object ``x`` on the instance.

        The previous version bound ``x`` to a local variable only, so calling
        ``fit`` had no effect at all.
        """
        self.user_data = x

    def predict(self, user_id):
        """Return ``{movie_id: random score}`` for every id in the movie table.

        ``user_id`` is accepted for interface compatibility but not used.
        The movie ids are read from the notebook-global ``md``.
        """
        return {movie_id: random.randint(self.mino, self.maxo) for movie_id in md.data['id']}
    

In [7]:
# Reload the movie table and the ratings (default date window, min_ratings=0).
md = MovieData('Podatki/movies.dat')
uim = UserItemData('Podatki/user_ratedmovies.dat')
# Random scores between 1 and 5.
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
# Show the random score drawn for a handful of movie ids.
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 5
Film: Grumpy Old Men, ocena: 4
Film: Money Train, ocena: 5
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 4


In [8]:
class Recommender:
    """Turn a predictor's scores into a top-n list of recommendations.

    Parameters
    ----------
    predictor : object
        Anything exposing ``predict(user_id)`` that returns a mapping
        ``{movie_id: score}``.
    """

    def __init__(self, predictor):
        self.predictor = predictor
        # Ratings object handed to fit(); None until fit() is called.
        self.user_data = None

    def fit(self, x):
        """Remember the ratings object ``x`` (it must expose a ``data`` frame).

        Only the recommender stores ``x``; the wrapped predictor is not
        fitted here. The previous version discarded ``x`` entirely.
        """
        self.user_data = x

    def recommend(self, user_id, n=10, rec_seen=True):
        """Return the ``n`` best ``(movie_id, score)`` pairs for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            User to recommend for.
        n : int
            Maximum number of pairs returned.
        rec_seen : bool
            When False, movies the user has already rated are left out.

        Returns
        -------
        list of (movie_id, score)
            Sorted by descending score; ties keep the predictor's order.
        """
        # Work on a copy so the predictor's own dict is never mutated.
        scores = dict(self.predictor.predict(user_id))

        if not rec_seen:
            # Prefer the fitted data; fall back to the notebook-global `uim`
            # so instances used without fit() behave as they did before.
            source = self.user_data if self.user_data is not None else uim
            ratings = source.data
            seen_ids = ratings.loc[ratings['userID'] == user_id, 'movieID']
            for movie_id in seen_ids:
                scores.pop(movie_id, None)

        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]


In [9]:
# Reload the movie table and the ratings (default date window, min_ratings=0).
md = MovieData('Podatki/movies.dat')
uim = UserItemData('Podatki/user_ratedmovies.dat')
# Recommender driven by random scores between 1 and 5.
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
# Five best-scored movies for user 78, leaving out movies the user already rated.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Tom and Huck, ocena: 5
Film: GoldenEye, ocena: 5
Film: Balto, ocena: 5
Film: Casino, ocena: 5
Film: Wings of Courage, ocena: 5


In [10]:
class AveragePredictor:
    """Score every movie by its average rating, shrunk towards the global mean.

    For a movie with ``n`` ratings summing to ``vs`` the score is
    ``(vs + b * g_avg) / (n + b)``, where ``g_avg`` is the mean of all ratings
    given to movies in the movie table.

    Parameters
    ----------
    b : number
        Shrinkage strength; must be >= 0. Larger values pull movies with few
        ratings closer to the global mean.

    Raises
    ------
    ValueError
        If ``b`` is negative.
    """

    def __init__(self, b):
        if b < 0:
            raise ValueError("Parameter b mora biti večji ali enak 0.")
        self.b = b
        self.movies = {}

    def fit(self, x=None, mdata=None):
        """Compute the score of every movie and return ``{movie_id: score}``.

        Parameters
        ----------
        x : object with a ``data`` frame (``movieID``, ``rating``), optional
            Ratings to learn from. Defaults to the notebook-global ``uim``.
        mdata : object with a ``data`` frame (``id``), optional
            Movie table listing the ids to score. Defaults to the
            notebook-global ``md``.

        Returns
        -------
        dict
            The same mapping that is stored in ``self.movies``.

        Notes
        -----
        * Each distinct id of the movie table is scored once.
        * With ``b == 0`` an unrated movie would be 0/0; it receives the
          global average, which is the limit of the formula as ``b`` -> 0.
        * When there are no ratings at all the global average is taken as 0.0.
        """
        ratings = (uim if x is None else x).data
        movie_ids = (md if mdata is None else mdata).data['id'].unique()

        # One pass over the ratings instead of two full-table scans per movie.
        per_movie = (
            ratings.groupby('movieID')['rating']
            .agg(['count', 'sum'])
            .reindex(movie_ids, fill_value=0)
        )
        counts = per_movie['count']
        sums = per_movie['sum']

        total_count = counts.sum()
        global_avg = sums.sum() / total_count if total_count else 0.0

        denominator = counts + self.b
        scores = (sums + self.b * global_avg) / denominator
        # pandas yields NaN for 0/0; replace it by the defined limit value.
        scores = scores.where(denominator != 0, global_avg)

        self.movies = scores.to_dict()
        return self.movies

    def predict(self, user_id):
        """Return a copy of the fitted ``{movie_id: score}`` mapping.

        The scores do not depend on ``user_id``; the parameter exists so the
        predictor can be passed to ``Recommender``.
        """
        return dict(self.movies)

In [11]:
md = MovieData('Podatki/movies.dat')
uim = UserItemData('Podatki/user_ratedmovies.dat')
ap = AveragePredictor(100)

# Movies ordered from the highest to the lowest score.
ddd = dict(sorted(ap.fit().items(), key=lambda item: item[1], reverse=True))

# Print the first ten entries; `i` counts the lines printed so far.
i = 0
for i, x in enumerate(ddd, start=1):
    print("Film: {}, ocena: {}".format(md.get_title(x), ddd[x]))
    if i == 10:
        break

Film: The Shawshank Redemption, ocena: 4.305187875177615
Film: The Godfather, ocena: 4.262394259034606
Film: The Usual Suspects, ocena: 4.225944245560473
Film: Fight Club, ocena: 4.199670479562388
Film: Pulp Fiction, ocena: 4.189550712063961
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb, ocena: 4.1415016089689
Film: Casablanca, ocena: 4.13585807902886
Film: The Matrix, ocena: 4.1323697828523756
Film: Memento, ocena: 4.131545433555425


In [12]:
class ViewPredictor:
    """Score every movie by the number of ratings it received.

    Parameters
    ----------
    mdata : object with a ``data`` frame containing an ``id`` column
        Movie table listing the ids to score; kept as ``self.movies``.
    """

    def __init__(self, mdata):
        self.movies = mdata
        # {movie_id: number of ratings}; empty until fit() is called.
        self.views = {}

    def fit(self, x=None):
        """Count the ratings of every movie and return ``{movie_id: count}``.

        Parameters
        ----------
        x : object with a ``data`` frame containing ``movieID``, optional
            Ratings to count. Defaults to the notebook-global ``uim``.

        Returns
        -------
        dict
            One entry per distinct id of the movie table; movies without any
            rating get 0. The mapping is also stored in ``self.views``.
        """
        ratings = (uim if x is None else x).data
        movie_ids = self.movies.data['id'].unique()

        # A single value_counts pass replaces one full-table scan per movie.
        counts = ratings['movieID'].value_counts().reindex(movie_ids, fill_value=0)

        self.views = counts.to_dict()
        return self.views

    def predict(self, user_id):
        """Return a copy of the fitted counts.

        The counts do not depend on ``user_id``; the parameter exists so the
        predictor can be passed to ``Recommender``.
        """
        return dict(self.views)

In [13]:
vp = ViewPredictor(md)

# Movies ordered from the most to the least rated.
ddd = dict(sorted(vp.fit().items(), key=lambda item: item[1], reverse=True))

# Print the first ten entries; `i` counts the lines printed so far.
i = 0
for i, x in enumerate(ddd, start=1):
    print("Film: {}, ocena: {}".format(md.get_title(x), ddd[x]))
    if i == 10:
        break

# Rebind the globals: from here on `uim` only holds movies that have at least
# 1000 ratings.
md = MovieData('Podatki/movies.dat')
uim = UserItemData('Podatki/user_ratedmovies.dat', min_ratings=1000)


Film: The Matrix, ocena: 1670
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: Forrest Gump, ocena: 1568
Film: Pulp Fiction, ocena: 1537
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: American Beauty, ocena: 1472
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: Jurassic Park, ocena: 1448
Film: The Shawshank Redemption, ocena: 1441
Film: Fight Club, ocena: 1434


In [14]:
# Mean rating given by each user, as a frame with columns userID / avgRating.
user_averages = (
    uim.data.groupby('userID')['rating']
    .mean()
    .rename('avgRating')
    .reset_index()
)

In [15]:
from numpy import dot
from numpy.linalg import norm

# Movie x user rating matrix: one row per movieID, one column per userID,
# with 0 standing for "this user did not rate this movie".
r_table = uim.data.pivot(index='movieID', columns='userID', values='rating').fillna(0)

# Distinct user ids present in the ratings and distinct ids of the movie table.
simA = uim.data['userID'].unique()
simB = md.data['id'].unique()

class ItemBasedPredictor:
    """Item-based collaborative filtering with adjusted-cosine similarity.

    Two movies are compared over the users who rated both of them, after each
    user's mean rating has been subtracted from their ratings.

    Parameters
    ----------
    min_values : int
        Minimum number of users who rated both movies; pairs with fewer
        common raters get similarity 0.
        NOTE(review): this parameter was stored but never read before; the
        meaning implemented here is inferred from its name - confirm.
    threshold : number
        Similarities below this value (and all negative ones) are reported
        as 0.

    Notes
    -----
    After ``fit(x)`` the instance works purely on ``x``. An instance that was
    never fitted falls back to the notebook globals ``r_table``,
    ``user_averages`` and ``uim``, as the class did before ``fit`` existed.
    A stored rating of 0 is treated as "not rated".
    """

    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
        # Populated by fit(); None means "use the notebook globals".
        self._ratings = None
        self._table = None
        self._user_means = None

    def fit(self, x):
        """Build the movie x user table and the per-user means from ``x.data``.

        ``x.data`` must have the columns ``userID``, ``movieID`` and
        ``rating``; ``DataFrame.pivot`` raises ``ValueError`` if a
        (userID, movieID) pair occurs more than once.
        """
        ratings = x.data
        table = ratings.pivot(index='movieID', columns='userID', values='rating').fillna(0)
        means = ratings.groupby('userID')['rating'].mean()

        self._ratings = ratings
        self._table = table
        # Align the means with the table's columns so that one boolean mask
        # selects both the ratings and the matching means.
        self._user_means = means.reindex(table.columns).to_numpy(dtype=float)

    def _tables(self):
        """Return ``(movie x user table, per-user means aligned to its columns)``."""
        if self._table is not None:
            return self._table, self._user_means
        means = user_averages.set_index('userID')['avgRating']
        return r_table, means.reindex(r_table.columns).to_numpy(dtype=float)

    def _pair_similarity(self, table, user_means, p1, p2):
        """Similarity of movies ``p1`` and ``p2`` on already prepared tables."""
        first = table.loc[p1].to_numpy(dtype=float)
        second = table.loc[p2].to_numpy(dtype=float)

        # Users who rated both movies.
        both = (first != 0) & (second != 0)
        if np.count_nonzero(both) < self.min_values:
            return 0

        dev1 = first[both] - user_means[both]
        dev2 = second[both] - user_means[both]

        numerator = np.round(np.dot(dev1, dev2), 5)
        denominator = np.sqrt(np.sum(dev2 ** 2)) * np.sqrt(np.sum(dev1 ** 2))

        # No common raters, or no spread around the user means: the ratio is
        # undefined. It used to come out as NaN and turned every prediction
        # that touched it into NaN.
        if not np.isfinite(denominator) or denominator == 0:
            return 0

        sim = numerator / denominator
        if sim < 0 or sim < self.threshold:
            return 0
        return sim

    def predict(self, user_id):
        """Return ``{movie_id: predicted rating}`` for ``user_id``.

        A prediction is made for every movie that at least one other user
        rated. It is the similarity-weighted average of the ratings given by
        ``user_id``, or 0 when all similarities are 0.
        """
        ratings = self._ratings if self._ratings is not None else uim.data
        table, user_means = self._tables()

        is_user = ratings['userID'] == user_id
        candidates = ratings.loc[~is_user, 'movieID'].unique()
        rated = ratings.loc[is_user, ['movieID', 'rating']]
        rated_pairs = list(zip(rated['movieID'], rated['rating']))

        predicted_ratings = {}
        for candidate in candidates:
            simsum = 0
            calsum = 0
            for rated_id, rating in rated_pairs:
                sim = self._pair_similarity(table, user_means, candidate, rated_id)
                simsum += sim
                calsum += sim * rating
            predicted_ratings[candidate] = calsum / simsum if simsum != 0 else 0
        return predicted_ratings

    def similarity(self, p1, p2):
        """Return the similarity of movies ``p1`` and ``p2``.

        The result is 0 when the movies have fewer than ``min_values`` common
        raters, when the ratio is undefined, when it is negative, or when it
        is below ``threshold``. Raises ``KeyError`` for a movie id that is not
        in the rating table.
        """
        table, user_means = self._tables()
        return self._pair_similarity(table, user_means, p1, p2)

    def similarItems(self, item, n):
        """Return up to ``n`` ``(movie_id, similarity)`` pairs most similar to ``item``.

        Only movies with a similarity greater than 0 are listed, sorted by
        descending similarity.
        """
        table, user_means = self._tables()

        similarities = {}
        for other_item in table.index:
            if other_item == item:
                continue
            sim = self._pair_similarity(table, user_means, item, other_item)
            if sim > 0:
                similarities[other_item] = sim

        ranked = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]
        
        

In [16]:
# Item-based predictor with its default min_values / threshold, wrapped in a
# recommender.
rp = ItemBasedPredictor()
rec = Recommender(rp)

# fit() is called on the recommender only; `rp` itself is not fitted here.
rec.fit(uim)
# Timestamp taken before the similarity computations; it is not read again
# within this cell.
start_time = time.time()
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

print("Predictions for 78: ")
# Fifteen best predictions for user 78, leaving out movies the user already rated.
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))



Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395524060890824
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.42466125761831625
Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.355734786086912
Film: The Usual Suspects, ocena: 4.354681725758536
Film: The Silence of the Lambs, ocena: 4.33530530623781
Film: Sin City, ocena: 4.2786871755488916
Film: Monsters, Inc., ocena: 4.217581212814615
Film: The Incredibles, ocena: 4.207098578274632
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792103339418
Film: Batman Begins, ocena: 4.146413774029916
Film: Die Hard, ocena: 4.125915607794364
Film: Rain Man, ocena: 4.07153524325333
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237450065553
Film: A Beautiful Mind, ocena: 4.015142491419475
Film: Good Will Hunting, ocena: 4.009280798479942
Film: The Lord of the Rings: The Two Towe

In [17]:
sim_table = {}
mmmid = uim.data['movieID'].unique()

# Similarity of every ordered pair of distinct movies. The measure is
# symmetric (a dot product divided by the product of two norms over the common
# raters), so when (y, x) is already in the table its value is reused instead
# of being recomputed. This halves the number of expensive similarity calls
# while keeping the keys and their insertion order unchanged.
for x in mmmid:
    for y in mmmid:
        if x == y:
            continue
        mirrored = sim_table.get((y, x))
        sim_table[x, y] = rp.similarity(x, y) if mirrored is None else mirrored

# Most similar pairs first; the sort is stable, so ties keep insertion order.
sim_table = {k: v for k, v in sorted(sim_table.items(), key=lambda item: item[1], reverse=True)}

# Print the leading entries (indices 0..20 of the sorted table).
i = 0
for x in sim_table:
    if i > 20:
        break
    print('Film1: {0}, Film2: {1}, with distance of {2}:'.format(md.get_title(x[0]), md.get_title(x[1]), sim_table[x]))
    i += 1

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, with distance of 0.8439842185399856:
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Two Towers, with distance of 0.8439842185399856:
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, with distance of 0.823188537760604:
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Fellowship of the Ring, with distance of 0.823188537760604:
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, with distance of 0.8079374901422762:
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Fellowship of the Ring, with distance of 0.8079374901422762:
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, with distance of 0.7372340163907803:
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, with di

In [18]:
# Up to ten movies with the highest positive similarity to movie id 4993.
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.823188537760604
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374901422762
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943573520353
Film: Star Wars, ocena: 0.21965587034346667
Film: The Matrix, ocena: 0.21515552600232538
Film: Raiders of the Lost Ark, ocena: 0.19944277300895646
Film: The Usual Suspects, ocena: 0.18321189048778952
Film: Blade Runner, ocena: 0.16399680985513243
Film: Schindler's List, ocena: 0.16105905741680648
Film: Monty Python and the Holy Grail, ocena: 0.15780453242922818
