In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import math as m

In [None]:
class UserItemData:
    def __init__(self, path, start_date=None, end_date=None, min_ratings=0):
        self.path = path
        self.start_date = start_date
        self.end_date = end_date
        self.min_ratings = min_ratings
        self.data = pd.read_csv(path, encoding="windows-1250", on_bad_lines='skip', sep='\t', skiprows = 0, parse_dates= {"date" : ["date_year","date_month","date_day"]},
                keep_date_col=True)
        
        if self.start_date is not None and self.end_date is not None:
            self.start_date = datetime.strptime(self.start_date, '%d.%m.%Y')
            self.end_date = datetime.strptime(self.end_date, '%d.%m.%Y')

            self.data = self.data[(self.data['date'] >= self.start_date) & (self.data['date'] < self.end_date)]

        self.data['Allratings'] = self.data.groupby('movieID')['movieID'].transform('size')
        if self.min_ratings > 0:
            self.data = self.data.loc[self.data['Allratings'] >= self.min_ratings]


    def nratings(self):
        return len(self.data)

uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

In [None]:
def uredi_ratings(self):
        dates = []
        start_d = ""
        end_d = ""
        if self.start_date is not None and self.end_date is not None:
            start_d = datetime.strptime(self.start_date, '%d.%m.%Y')
            end_d = datetime.strptime(self.end_date, '%d.%m.%Y')

        for ind in self.data.index:
            day = str(int(self.data['date_day'][ind]))
            month = str(int(self.data['date_month'][ind]))
            year = str(int(self.data['date_year'][ind]))

            d = day + "." + month + "." + year
            date = datetime.strptime(d, '%d.%m.%Y') 
            dates.append(date)

        self.data['date'] = dates

        self.data = self.data[(self.data['date'] >= start_d) & (self.data['date'] < end_d)]
        self.data['Allratings'] = self.data.groupby('movieID')['movieID'].transform('size')
        
        if self.min_ratings > 0:
            self.data = self.data.loc[self.data['Allratings'] >= self.min_ratings]
        
        return self.data

In [None]:
class MovieData:
    def __init__(self, path):
        self.path = path
        self.movies = pd.read_csv(path, encoding="windows-1250", on_bad_lines='skip', sep='\t', skiprows = 0)

    def get_title(self, movieID):
        return self.movies.loc[self.movies["id"] == movieID]["title"].values[0]

md = MovieData('data/movies.dat')
print(md.get_title(1))

In [None]:
class RandomPredictor:
    def __init__(self, min, max):
        self.min = min
        self.max = max
        self.uim = None
    
    def fit(self, X):
        self.uim = X

    def predict(self, userID, rec_seen=True):
        if self.uim is not None:
            if not rec_seen:
                df = self.uim.data.loc[self.uim.data["userID"] == userID]['movieID']
                movies = pd.Series(list(df))
                movies.name = "moviesID"
                #self.uim.data = self.uim.data[~self.uim.data.index.isin(movies)]
                self.uim.data = self.uim.data[~self.uim.data['movieID'].isin(movies)]
                    
            movies = np.array(self.uim.data["movieID"])
            numbers = np.random.randint(self.min, self.max+1, len(movies))
            res = {}
            for i in range(len(movies)):
                res[movies[i]] = numbers[i] 

            return res
        else:
            print("none")

md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

In [None]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n = 10, rec_seen = True):
        return dict(sorted(self.predictor.predict(userID, rec_seen).items(), key=lambda item: (item[1]), reverse=True)[:n])


md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

In [None]:
class AveragePredictor:
    def __init__(self, b):
        if b >= 0:
            self.b = b
        else:
            self.b = 0

    def fit(self, X):
        self.uim = X
        self.uim.data['n'] = self.uim.data.groupby('movieID')['movieID'].transform('size')
        self.uim.data['vs'] = self.uim.data.groupby('movieID')['rating'].transform('sum')
        self.g_avg = self.uim.data['rating'].mean()
        # avg = (vs + b * g_avg) / (n + b
        self.uim.data['avg'] = (self.uim.data['vs'] + (self.b * self.g_avg)) / (self.uim.data['n'] + self.b)

    def predict(self, userID, rec_seen=True):
        if not rec_seen:
            df = self.uim.data.loc[self.uim.data["userID"] == userID]['movieID']
            movies = pd.Series(list(df))
            movies.name = 'movieIDs'
            self.uim.data = self.uim.data[~self.uim.data['movieID'].isin(movies)]
            
        movies = list(self.uim.data["movieID"])
        avg = list(self.uim.data["avg"])

        res = {}
        for i in range(len(movies)):
            res[movies[i]] = avg[i] 

        return res


md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = AveragePredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=True)
for idmovie, val in rec_items.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

In [None]:
class ViewsPredictor:
    def __init__(self):
        pass

    def fit(self, X):
        self.uim = X

    def predict(self, userID, rec_seen=True):
        if not rec_seen:
            df = self.uim.data.loc[self.uim.data["userID"] == userID]["movieID"]
            movies = pd.Series(list(df))
            movies.name = "moviesID"
            #self.uim.data = self.uim.data[~self.uim.data.index.isin(movies)]
            self.uim.data = self.uim.data.loc[~self.uim.data['movieID'].isin(movies)]

        return dict(self.uim.data.groupby("movieID").count()['userID'])

md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = ViewsPredictor()
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
   print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

In [128]:
class ItemBasedPredictor:
    def __init__(self, min_values = 0, threshold = 0):
        self.min_values = min_values
        self.threshold = threshold

    def fit(self, X):
        self.uim = X
        movieIDs = list(self.uim.data['movieID'].unique())
        self.matrika = pd.DataFrame(0, index=np.arange(len(movieIDs)), columns=movieIDs)

        #self.g_avg = self.uim.data.groupby('userID')['rating'].transform(np.mean)
        self.g_avg = self.uim.data.groupby('userID')['rating'].mean()

        u1 = list(self.matrika.index)
        u2 = list(self.matrika.columns)
       
        for i in u1:
            for j in u2:
                df1 = self.uim.data.loc[(self.uim.data["movieID"] == i)]
                df2 = self.uim.data.loc[(self.uim.data["movieID"] == j)]

                df = pd.merge(df1, df2, how="inner", on="userID")
                df = pd.merge(df, self.g_avg, how="inner", on="userID")

                # Če imamo minimalno toliko uporabnikov
                if len(df) >= self.min_values:
                    Ru = list(df['rating'])
                    R1 = list(df['rating_x'])
                    R2 = list(df['rating_y'])

                    sub1 = np.subtract(R1, Ru)
                    sub2 = np.subtract(R2, Ru)
                    produkt1 = np.multiply(sub1, sub2)
                    vsota1 = np.sum(produkt1)

                    sqrt_vsota_square1 = np.sqrt(np.sum(np.square(sub1)))
                    sqrt_vsota_square2 = np.sqrt(np.sum(np.square(sub2)))
                    produkt_final = sqrt_vsota_square1 * sqrt_vsota_square2

                    result = vsota1 / produkt_final

                    if result < self.threshold:
                        result = 0
                else:
                    result = 0

                self.matrika.loc[i, j] = result

    def predict(self, userID):
        pass

    def similarity(self, p1, p2):
        return self.matrika.loc[p1, p2]

md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vsota1 / produkt_final
  result = vso

KeyError: 1580