In [1]:
import numpy as np
import pandas as pd
import itertools as IT
import datetime
# from tqdm import tqdm


In [65]:
class UserItemData:
    """Ratings table read from a whitespace-separated file, optionally filtered.

    The file must provide a ``movieID`` column; ``userID`` is needed by
    ``movies_from_user`` and ``date_day`` / ``date_month`` / ``date_year`` are
    needed only when a date bound is given.

    Parameters
    ----------
    path : str
        Location of the ratings file.
    start : str, optional
        Lower date bound written as ``"day.month.year"``. Inclusive: ratings
        made on that day are kept.
    end : str, optional
        Upper date bound written as ``"day.month.year"``. Exclusive: ratings
        made on that day or later are dropped.
    min_ratings : int, default 0
        A film is kept only if it has strictly more than this many ratings,
        counted after the date filter has been applied.

    Raises
    ------
    ValueError
        If ``start`` or ``end`` is not three dot-separated integers.
    """

    def __init__(self, path, start=None, end=None, min_ratings=0):
        # Raw string: "\s" is not a valid escape sequence in a normal literal.
        df = pd.read_csv(path, sep=r"\s+")

        # Keep only rows dated on or after `start`.
        if start is not None:
            day, month, year = self._parse_date(start)
            same_year = df["date_year"] == year
            too_early = (same_year & (df["date_month"] < month)) | (
                same_year & (df["date_month"] == month) & (df["date_day"] < day)
            )
            df = df.loc[(df["date_year"] >= year) & ~too_early]

        # Keep only rows dated strictly before `end`.
        if end is not None:
            day, month, year = self._parse_date(end)
            same_year = df["date_year"] == year
            too_late = (same_year & (df["date_month"] > month)) | (
                same_year & (df["date_month"] == month) & (df["date_day"] >= day)
            )
            df = df.loc[(df["date_year"] <= year) & ~too_late]

        # Drop films that are not rated often enough (strict comparison).
        counts = df["movieID"].value_counts()
        frequent_ids = counts[counts > min_ratings].index
        df = df.loc[df["movieID"].isin(frequent_ids)]

        self.data = df
        self.len = len(self.data)

    @staticmethod
    def _parse_date(text):
        """Turn a ``"day.month.year"`` string into a ``(day, month, year)`` tuple of ints."""
        day, month, year = (int(part) for part in text.strip().split("."))
        return day, month, year

    def nrows(self):
        """Return the number of ratings left after filtering."""
        return self.len

    def movies_from_user(self, uid):
        """Return the ``movieID`` column (a Series) restricted to rows of user ``uid``."""
        user_mask = self.data["userID"] == uid
        return self.data["movieID"].loc[user_mask]

In [3]:
class MovieData:
    """Film catalogue holding the ``id`` and ``title`` columns of a tab-separated file."""

    def __init__(self, path):
        """Read only the ``id`` and ``title`` columns from the file at ``path``."""
        wanted_columns = ["id", "title"]
        # A regex separator (one or more tabs) requires the python parsing engine.
        self.data = pd.read_csv(path, sep="\t+", usecols=wanted_columns, engine="python")

    def get_title(self, id):
        """Return the title of the first row whose ``id`` matches.

        Raises ``IndexError`` when no row has the requested id.
        """
        matches = self.data["id"] == id
        titles = self.data["title"].loc[matches]
        return titles.values[0]

In [4]:
class RandomPredictor:
    """Baseline predictor that assigns every known film a random integer grade."""

    def __init__(self, min_grade=0, max_grade=5):
        self.min_grade = min_grade
        # Stored one above the requested maximum because np.random.randint
        # excludes its upper bound.
        self.max_grade = max_grade + 1

    def fit(self, X):
        """Remember the ratings table (``X.data``) whose films will be graded."""
        self.data = X.data

    def predict(self, uid):
        """Return ``{movieID: random grade}`` for every distinct film; ``uid`` is not used."""
        movie_ids = set(self.data["movieID"].values)
        grades = np.random.randint(self.min_grade, self.max_grade, len(movie_ids))
        return {movie: grade for movie, grade in zip(movie_ids, grades)}

In [124]:
class AveragePredictor:
    """Predictor built around per-film rating aggregates.

    NOTE(review): the class looks unfinished. ``b`` is stored but never read,
    ``fit`` keeps no state, and there is no ``predict`` method.
    """

    def __init__(self, b):
        self.b = b

    def fit(self, X):
        """Compute aggregates of ``X.data`` and print the per-film rating sums."""
        ratings = X.data[["movieID", "rating"]]
        per_movie = ratings.groupby("movieID")

        # Global mean and per-film counts are computed but not used yet.
        g_avg = X.data["rating"].mean()
        m_sum = per_movie.sum().rename(columns={"rating": "sum"})
        m_count = per_movie.count().rename(columns={"rating": "count"})

        print(m_sum)

In [5]:
class ViewsPredictor:
    """Scores each film by how many rating rows it has."""

    def __init__(self):
        pass

    def fit(self, X):
        """Count the rows per ``movieID`` in ``X.data`` and keep them as ``{movieID: count}``."""
        counts = X.data["movieID"].value_counts()
        self.predictions = {
            movie: views for movie, views in zip(counts.index, counts.values.flatten())
        }

    def predict(self, uid):
        """Return the stored counts ordered from most to least rated; ``uid`` is not used."""
        # sorted() is stable, so films with equal counts keep their stored order.
        ranking = sorted(self.predictions, key=self.predictions.get, reverse=True)
        return {movie: self.predictions[movie] for movie in ranking}

In [7]:
class STDPredictor:
    """Scores each film by the standard deviation of its ratings.

    Parameters
    ----------
    n : int
        Only films with strictly more than ``n`` ratings are scored.
    """

    def __init__(self, n):
        self.n = n

    def fit(self, X):
        """Compute the per-film rating standard deviation from ``X.data``.

        Films whose standard deviation is undefined (NaN, e.g. a single
        rating when ``n < 1``) are left out, because NaN scores cannot be
        ordered reliably in ``predict``.
        """
        ratings = X.data[["movieID", "rating"]]

        # Count once and reuse the result for the popularity filter.
        counts = ratings["movieID"].value_counts()
        rated_enough = counts[counts > self.n].index

        spread = (
            ratings.loc[ratings["movieID"].isin(rated_enough)]
            .groupby("movieID")["rating"]
            .std()
            .dropna()
        )
        self.predictions = dict(zip(spread.index, spread.values))

    def predict(self, uid):
        """Return ``{movieID: std}`` ordered from highest to lowest; ``uid`` is not used."""
        return dict(sorted(self.predictions.items(), key=lambda item: item[1], reverse=True))

In [90]:
class Recommender:
    """Wraps a predictor and turns its scores into a top-``n`` recommendation.

    The predictor must offer ``fit(X)`` and ``predict(userID)``, where
    ``predict`` returns a ``{movieID: score}`` mapping.
    """

    def __init__(self, predictor):
        self.predictor = predictor

    def fit(self, X):
        """Keep the ratings object ``X`` and fit the wrapped predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the ``n`` best-scored films for ``userID`` as ``{movieID: score}``.

        Parameters
        ----------
        userID
            User to recommend for.
        n : int, default 10
            Maximum number of films returned.
        rec_seen : bool, default True
            When False, films the user has already rated are left out.
        """
        preds = self.predictor.predict(userID)
        rec_movies = sorted(preds.items(), key=lambda item: item[1], reverse=True)
        if not rec_seen:
            # Build a set of the film IDs themselves, once. `in` applied to a
            # pandas Series tests its index labels, not its values.
            seen = set(self.uim.movies_from_user(userID))
            rec_movies = [m for m in rec_movies if m[0] not in seen]
        return dict(rec_movies[:n])

In [79]:
# Load the film catalogue and the ratings table (no date or min_ratings filter).
md = MovieData('movielens/movies.dat')
uim = UserItemData('movielens/user_ratedmovies.dat') 

In [105]:
# Recommend five films to user 78 using the ViewsPredictor scores, asking the
# recommender to leave out films this user has already rated (rec_seen=False).
rp = ViewsPredictor()
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
# The value printed after "ocena" is the predictor's score for the film.
for idmovie, val in rec_items.items():
    print("Film {}: {}, ocena: {}".format(idmovie, md.get_title(idmovie), val))  

Film 2571: The Matrix, ocena: 1670
Film 4993: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film 5952: The Lord of the Rings: The Two Towers, ocena: 1528
Film 2858: American Beauty, ocena: 1472
Film 7153: The Lord of the Rings: The Return of the King, ocena: 1457


In [106]:
# Sanity check: none of the recommended films may be one user 78 already rated.
# Compare against the film IDs themselves; `in` on a pandas Series would test
# its index labels, so the check could never detect a seen film.
seen_by_user = set(uim.movies_from_user(78))
for mid in rec_items.keys():
    if mid in seen_by_user:
        print("Fail")

In [125]:
# Display the movieID values of the rows belonging to user 78.
uim.movies_from_user(78)

55        17
56        29
57        32
58        41
59        82
       ...  
518    46578
519    46976
520    48774
521    49272
522    50872
Name: movieID, Length: 468, dtype: int64

In [126]:
# Fit the average predictor with b=0; fit() prints the per-film rating sums.
p = AveragePredictor(0)
p.fit(uim)

            sum
movieID        
1        4717.5
2        2277.0
3         724.0
4         116.0
5         619.5
...         ...
65088       3.5
65091       4.0
65126       6.5
65130       2.5
65133      12.0

[10109 rows x 1 columns]
