In [5]:
import numpy as np
import pandas as pd
import itertools as IT
import datetime
# from tqdm import tqdm


In [6]:
class UserItemData:
    """User-movie ratings table loaded from a whitespace-separated file.

    The file must provide the columns ``userID``, ``movieID``, ``date_day``,
    ``date_month`` and ``date_year``; these are the columns this class reads.
    The filtered table is exposed as ``self.data`` (a pandas DataFrame) and
    its row count as ``self.len``.
    """

    def __init__(self, path, start=None, end=None, min_ratings=0):
        """Read the ratings file and apply the optional filters.

        Parameters
        ----------
        path : str or file-like
            Passed straight to ``pd.read_csv``.
        start : str, optional
            Earliest date to keep, formatted ``"day.month.year"``. Inclusive:
            rows dated exactly ``start`` are kept.
        end : str, optional
            Upper date bound, formatted ``"day.month.year"``. Exclusive: rows
            dated exactly ``end`` (or later) are dropped.
        min_ratings : int, optional
            Only movies with strictly more than ``min_ratings`` rows left
            after the date filters are kept.

        Raises
        ------
        ValueError
            If ``start`` or ``end`` is not three "."-separated integers.
        """
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        df = pd.read_csv(path, sep=r"\s+")  # read the .dat file

        # Keep only rows dated on or after `start`.
        if start is not None:
            s_day, s_month, s_year = self._parse_date(start)
            year, month, day = df["date_year"], df["date_month"], df["date_day"]
            earlier_month = (year == s_year) & (month < s_month)
            earlier_day = (year == s_year) & (month == s_month) & (day < s_day)
            df = df.loc[(year >= s_year) & ~earlier_month & ~earlier_day]

        # Keep only rows dated strictly before `end`.
        if end is not None:
            e_day, e_month, e_year = self._parse_date(end)
            year, month, day = df["date_year"], df["date_month"], df["date_day"]
            later_month = (year == e_year) & (month > e_month)
            # `>=` makes the end date itself exclusive.
            later_day = (year == e_year) & (month == e_month) & (day >= e_day)
            df = df.loc[(year <= e_year) & ~later_month & ~later_day]

        # Drop movies that do not have more than `min_ratings` remaining rows.
        counts = df["movieID"].value_counts()
        popular_ids = counts.loc[counts > min_ratings].index.values
        df = df.loc[df["movieID"].isin(popular_ids)]

        self.data = df
        self.len = len(self.data)

    @staticmethod
    def _parse_date(text):
        """Split a ``"day.month.year"`` string into three ints (day, month, year)."""
        day, month, year = (int(part) for part in text.strip().split("."))
        return day, month, year

    def nrows(self):
        """Return the number of rating rows left after filtering."""
        return self.len

    def movies_from_user(self, uid):
        """Return the ``movieID`` values of all rows whose ``userID`` equals ``uid``.

        The result is a pandas Series that keeps the original row labels as
        its index. Note that ``x in series`` tests those index labels, not
        the movie IDs; use ``.values`` or ``set(series)`` for membership tests.
        """
        user_mask = self.data["userID"] == uid
        return self.data["movieID"].loc[user_mask]

In [7]:
class MovieData:
    """Movie id -> title lookup backed by a tab-separated file."""

    def __init__(self, path):
        """Load only the ``id`` and ``title`` columns from ``path``."""
        wanted_columns = ["id", "title"]
        self.data = pd.read_csv(
            path,
            sep="\t+",
            usecols=wanted_columns,
            engine="python",
        )

    def get_title(self, id):
        """Return the title of the first row whose ``id`` column equals ``id``.

        Raises IndexError when no row matches.
        """
        matching_titles = self.data.loc[self.data["id"] == id, "title"]
        return matching_titles.values[0]

In [8]:
class RandomPredictor:
    """Predictor that assigns every known movie a uniformly random integer grade."""

    def __init__(self, min_grade=0, max_grade=5):
        """Store the grade range.

        ``self.max_grade`` holds ``max_grade + 1`` because it is used as the
        exclusive upper bound of ``np.random.randint``.
        """
        self.min_grade = min_grade
        self.max_grade = max_grade + 1

    def fit(self, X):
        """Remember the ratings table (``X.data``) to draw movie IDs from."""
        self.data = X.data

    def predict(self, uid):
        """Return ``{movieID: random grade}`` for every distinct movie.

        ``uid`` is not used. ``fit`` must have been called first, since this
        reads ``self.data``.
        """
        movie_ids = set(self.data["movieID"].values)
        grades = np.random.randint(self.min_grade, self.max_grade, len(movie_ids))
        return {movie_id: grade for movie_id, grade in zip(movie_ids, grades)}

In [47]:
class AveragePredictor:
    """Predictor that scores each movie by a smoothed average rating.

    For a movie with rating sum ``vs`` and rating count ``n`` the score is
    ``(vs + b * g_avg) / (n + b)``, where ``g_avg`` is the mean of all ratings
    and ``b`` is the constructor argument. With ``b == 0`` this is the plain
    per-movie mean.
    """

    def __init__(self, b):
        """Store the smoothing parameter ``b``."""
        self.b = b

    def fit(self, X):
        """Compute the smoothed average for every movie in ``X.data``.

        Reads the ``movieID`` and ``rating`` columns and stores the result in
        ``self.preds`` as ``{movieID: score}``. An empty ratings table yields
        an empty dict.
        """
        ratings = X.data[["movieID", "rating"]]
        g_avg = ratings["rating"].mean()

        # One vectorised pass instead of building a Series per movie via apply().
        stats = ratings.groupby("movieID")["rating"].agg(["sum", "count"])
        scores = (stats["sum"] + self.b * g_avg) / (stats["count"] + self.b)

        self.preds = dict(zip(scores.index, scores))

    def predict(self, uid):
        """Return all movie scores ordered from highest to lowest.

        ``uid`` is not used; every user gets the same predictions.
        """
        return dict(sorted(self.preds.items(), key=lambda x: x[1], reverse=True))

In [10]:
class ViewsPredictor:
    """Predictor that scores each movie by how many rating rows it has."""

    def __init__(self):
        """No configuration is needed."""

    def fit(self, X):
        """Count the rows per ``movieID`` in ``X.data`` and store them in ``self.predictions``."""
        views_per_movie = X.data["movieID"].value_counts()
        self.predictions = dict(zip(views_per_movie.index, views_per_movie.values))

    def predict(self, uid):
        """Return ``{movieID: count}`` ordered from most to least rated.

        ``uid`` is not used; every user gets the same predictions.
        """
        ranked = sorted(
            self.predictions.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return dict(ranked)

In [11]:
class STDPredictor:
    """Predictor that scores each movie by the standard deviation of its ratings."""

    def __init__(self, n):
        """Store ``n``; only movies with strictly more than ``n`` ratings are scored."""
        self.n = n

    def fit(self, X):
        """Compute the per-movie rating standard deviation.

        Reads the ``movieID`` and ``rating`` columns of ``X.data``, keeps the
        movies whose row count exceeds ``self.n``, and stores
        ``{movieID: std}`` in ``self.predictions``.
        """
        ratings = X.data[["movieID", "rating"]]

        per_movie = X.data["movieID"].value_counts()
        frequent_ids = per_movie.loc[per_movie > self.n].index

        kept = ratings.loc[X.data["movieID"].isin(frequent_ids)]
        spread = kept.groupby("movieID")["rating"].std()

        self.predictions = dict(zip(spread.index, spread.values))

    def predict(self, uid):
        """Return ``{movieID: std}`` ordered from largest to smallest deviation.

        ``uid`` is not used; every user gets the same predictions.
        """
        ranked = sorted(
            self.predictions.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return dict(ranked)

In [12]:
class Recommender:
    """Wraps a predictor and turns its scores into a top-n recommendation list."""

    def __init__(self, predictor):
        """Store the predictor; it must provide ``fit(X)`` and ``predict(userID)``."""
        self.predictor = predictor

    def fit(self, X):
        """Remember the ratings data ``X`` and fit the wrapped predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the ``n`` best-scored movies for ``userID``.

        Parameters
        ----------
        userID
            Forwarded to the predictor and used to look up seen movies.
        n : int
            Maximum number of recommendations to return.
        rec_seen : bool
            When False, movies returned by ``self.uim.movies_from_user(userID)``
            are excluded from the result.

        Returns
        -------
        dict
            ``{movieID: score}`` ordered from highest to lowest score.
        """
        preds = self.predictor.predict(userID)
        rec_movies = sorted(preds.items(), key=lambda item: item[1], reverse=True)
        if not rec_seen:
            # Materialise the movie IDs once: `x in Series` would test the
            # Series' index labels rather than its values, and re-querying
            # per candidate is needlessly slow.
            seen = set(self.uim.movies_from_user(userID))
            rec_movies = [m for m in rec_movies if m[0] not in seen]
        return dict(rec_movies[:n])

In [56]:
# Load the movie titles and the user-movie ratings. min_ratings=100 is passed to
# UserItemData, which keeps only movies with more than that many ratings.
md = MovieData('movielens/movies.dat')
uim = UserItemData('movielens/user_ratedmovies.dat', min_ratings=100) 

In [59]:
# Recommend 5 movies to user 78 using the rating standard deviation as the score;
# rec_seen=False asks the recommender to leave out movies the user already rated.
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
# Print each recommended title together with its score.
for idmovie, val in rec_items.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val)) 

Film: Plan 9 from Outer Space, ocena: 1.3449520951495717
Film: The Passion of the Christ, ocena: 1.281493459525735
Film: The Texas Chainsaw Massacre, ocena: 1.235349321908819
Film: Jackass Number Two, ocena: 1.2189769976366684
Film: White Chicks, ocena: 1.1899581424297319


In [106]:
# Sanity check: none of the recommended movies may already be rated by user 78.
# Compare against the movie ID values; `mid in <Series>` would test the Series'
# index labels instead, so the check could never detect a real failure.
seen_by_user = set(uim.movies_from_user(78))
for mid in rec_items.keys():
    if mid in seen_by_user:
        print("Fail")

In [125]:
# Show the movies rated by user 78. The result is a pandas Series: in the output
# the left column is the row index and the right column is the movieID.
uim.movies_from_user(78)

55        17
56        29
57        32
58        41
59        82
       ...  
518    46578
519    46976
520    48774
521    49272
522    50872
Name: movieID, Length: 468, dtype: int64

In [43]:
# Fit the average predictor with b=0 (the value passed to AveragePredictor).
# NOTE(review): fit() has no return statement, so this cell as written displays
# nothing; the dict recorded as its output appears to come from an earlier
# version of the cell — re-run to confirm.
p = AveragePredictor(0)
p.fit(uim)

{1: 3.7351543942992875, 2: 2.976470588235294, 3: 2.873015873015873, 4: 2.577777777777778, 5: 2.7533333333333334, 6: 3.8523890784982937, 7: 3.1826241134751774, 8: 2.717948717948718, 9: 2.3688524590163933, 10: 3.3124147339699865, 11: 3.4195710455764075, 12: 2.4895833333333335, 13: 2.742424242424242, 14: 3.408536585365854, 15: 2.685840707964602, 16: 3.821564885496183, 17: 3.875698324022346, 18: 3.375, 19: 2.5560165975103732, 20: 2.60077519379845, 21: 3.3922155688622753, 22: 3.1551020408163266, 23: 3.0760869565217392, 24: 3.0430622009569377, 25: 3.662921348314607, 26: 3.639705882352941, 27: 3.3152173913043477, 28: 4.0227272727272725, 29: 3.876854599406528, 30: 3.75, 31: 2.9897540983606556, 32: 4.009067357512953, 33: 3.25, 34: 3.286363636363636, 35: 3.1818181818181817, 36: 3.809133489461358, 37: 3.375, 38: 2.176470588235294, 39: 3.2415123456790123, 40: 3.4411764705882355, 41: 3.9794520547945207, 42: 2.917910447761194, 43: 3.265957446808511, 44: 2.4522292993630574, 45: 3.455223880597015, 46: