In [1]:
import numpy as np
import pandas as pd
import itertools as IT
import datetime
# from tqdm import tqdm


In [2]:
class UserItemData:
    
    def __init__(self,path,start=None,end=None,min_ratings=0):
        df = pd.read_csv(path,sep="\s+")        #branje .dat datoteke
        
        #selekcija po minimalnem datumu
        if start is not None:
            s_day , s_month, s_year = start.strip().split(".")  
            s_day = int(s_day)
            s_month = int(s_month)
            s_year = int(s_year)
            
            #glede na leto
            df = df.loc[ (df["date_year"]   >= s_year) ] 
            
            #glede na mesec
            tmp_mask = (df["date_year"] == s_year) & (df["date_month"] < s_month)
            df = df.loc[np.logical_not(tmp_mask)]
            
            #glede na dan
            tmp_mask = (df["date_year"] == s_year) & (df["date_month"] == s_month) & (df["date_day"] < s_day)
            df = df.loc[np.logical_not(tmp_mask)]
        
        if end is not None:
            e_day , e_month, e_year = end.strip().split(".")  
            e_day = int(e_day)
            e_month = int(e_month)
            e_year = int(e_year)
            
            #glede na leto
            df = df.loc[ (df["date_year"] <= int(e_year)) ] 
            
            #glede na mesec
            tmp_mask = (df["date_year"] == int(e_year)) & (df["date_month"] > int(e_month))
            df = df.loc[np.logical_not(tmp_mask)]
            
            #glede na dan
            tmp_mask = (df["date_year"] == e_year) & (df["date_month"] == e_month) & (df["date_day"] >= e_day)
            df = df.loc[np.logical_not(tmp_mask)]
        
        counts = df["movieID"].value_counts()
        mask = counts > min_ratings
        ids = counts.loc[mask].index.values
        df = df.loc[df["movieID"].isin(ids)]
        
        self.data = df
        self.len = len(self.data)
        
    def nrows(self):
        return self.len
    
    def movies_from_user(self, uid):
        user_mask = self.data["userID"] == uid
        return self.data["movieID"].loc[user_mask]

In [3]:
class MovieData:
    def __init__(self,path):
        self.data =  pd.read_csv(path,sep="\t+",usecols=["id","title"],engine="python")
    
    def get_title(self,id):
        return self.data["title"].loc[self.data["id"] == id].values[0]

In [4]:
class RandomPredictor:
    def __init__(self, min_grade=0, max_grade=5):
        self.min_grade = min_grade
        self.max_grade = max_grade + 1
    def predict(self,uid):
        mids = set(self.data["movieID"].values)
        N = len(mids)
        ratings = np.random.randint(self.min_grade, self.max_grade,N)
        items = zip(mids,ratings)
        return dict(items)
    def fit(self,X):
        self.data = X.data

In [5]:
class AveragePredictor:
    def __init__(self, b):
        self.b = b
    
    def fit(self, X):
        g_avg = X.data["rating"].mean()
        m_sum = X.data[["movieID", "rating"]].groupby("movieID").sum().rename(columns={"rating":"sum"})
        m_count = X.data[["movieID", "rating"]].groupby("movieID").count().rename(columns={"rating":"count"})
        data = m_sum.merge(right=m_count, how='inner', left_index=True, right_index=True)
        preds = data.apply(lambda x: pd.Series([(x['sum'] + self.b * g_avg)/(x['count']+self.b)], index=['ocena']), axis=1)
        self.preds = dict(zip(preds.index, preds.ocena))
        
    def predict(self, uid):
        return dict(sorted(self.preds.items(), key=lambda x: x[1], reverse=True))

In [6]:
class ViewsPredictor:
    def __init__(self):
        pass
    def fit(self,X):
        n = X.data["movieID"].value_counts()   
        vals = n.values.flatten()
        keys = n.index
        self.predictions = dict( zip(keys,vals) )
    def predict(self,uid):
        return dict(sorted(self.predictions.items(), key=lambda item: item[1], reverse=True))

In [7]:
class STDPredictor:
    def __init__(self, n):
        self.n = n
    
    def fit(self,X):
        mask = X.data["movieID"].value_counts() > self.n
        mids = X.data["movieID"].value_counts().loc[mask]
        mids = mids.index
        
        mask = X.data["movieID"].isin(mids)
        tmp = X.data[["movieID","rating"]].loc[mask].groupby("movieID").std()
        self.predictions = dict( zip(tmp.index, tmp.values.flatten()) ) 
        
    def predict(self,uid):
        return dict(sorted(self.predictions.items(), key=lambda item: item[1], reverse=True))

In [8]:
class ItemBasedPredictor:
    def __init__(self, min_ratings=0, threshold=0):
        self.mr = min_ratings
        self.thr = threshold
        
    def get_most_similar_movies(self, n=20):
        if not hasattr(self, 'sm'):
            return []
        else:
            sims = []
            for i, row in enumerate(self.sm.index[:-1]):
                for col in self.sm.columns[(i+1):]:
                    sims.append((col, row, self.sm[col][row]))
            return sorted(sims, key=lambda x: x[2], reverse=True)[:n]
        
    def similar_items(self, item, n):
        if not hasattr(self, 'sm'):
            return []
        else:
            try:
                return self.sm[item].sort_values(ascending=False)[1:(n+1)]
            except KeyError:
                return []
            
    def similarity(self, p1, p2):
        return self.sm[p1][p2]
        
    def build_similarity_matrix(self, m):
        sm = pd.DataFrame(pd.DataFrame(), columns=m.columns, index=m.columns)
        
        for c1 in m.columns:
            r1 = m[c1].dropna().rename("r1")
            for c2 in m.columns:
                # inner join
                r2 = m[c2].dropna().rename("r2")
                r12 = pd.merge(left=r1, right=r2, how='inner', left_index=True, right_index=True)
                
                if len(r12.index) < self.mr: # if not enough ratings then similarity is 0
                    sm.at[c1, c2] = 0
                    continue
                
                if c1 == c2: # if same columns then similarity is 1
                    sm.at[c1, c2] = 1
                    continue
                
                # similarity calculation
                dot_product = r12["r1"] @ r12["r2"]
                norm_1 = np.linalg.norm(r12["r1"])
                norm_2 = np.linalg.norm(r12["r2"])
                similarity = dot_product/(norm_1 * norm_2) if (norm_1 * norm_2) != 0 else 0
                
                # if below threshold then similarity is 0
                sm.at[c1, c2] = similarity if similarity >= self.thr else 0
        self.sm = sm
    
    def fit(self, X):
        # Pivot table and normalize by subtracting ratings by their users average rating
        mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")
        mr['avg'] = mr.mean(axis=1)
        self.urm = mr
        norm = mr.sub(mr['avg'], axis=0)
        del norm['avg']
        self.urm_norm = norm
        self.build_similarity_matrix(norm)   
    
    def predict(self, userID):
        ra = self.urm['avg'][userID]
        
        user_rated_movies = self.urm.loc[userID, :][:-1].dropna().index # movies rated by userID
        
        self.preds = dict()
        # Calculate prediction score for each movie
        for m in self.urm.columns[:-1]: # score(userID, m)
            s1 = 0
            s2 = 0
            for um in user_rated_movies:
                s = self.similarity(m, um)
                rating_scale = self.urm[um][userID] - ra
                s1 += s*rating_scale
                s2 += s
#                 print(s1, s2, rating_scale, s)
            score = (s1/s2)+ra
            self.preds[m] = score
        return self.preds

In [9]:
class SlopeOnePredictor:
    def __init__(self):
        pass
    
    def fit(self, X):
        # Pivot table and normalize by subtracting ratings by their users average rating
        self.mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")
    
    def predict(self, userID):
        user_rated_movies = self.mr.loc[userID, :].dropna()
        user_unrated_movies = pr.mr.loc[userID, ~pr.mr.loc[userID, :].notna()]
        
        for unrated_movie in user_unrated_movies: # Predicting scores for unrated movies ...
            score=0
            mr1 = self.mr[unrated_movie].dropna().rename("r1")
            for rated_movie in user_rated_movies: # ...with help of rated movies
                sub_score = self.mr.loc[userID, rated_movie]
                mr2 = self.mr[rated_movie].dropna().rename("r2")
                
                # Inner join on both movies -> only ratings from users who rated both movies
                mr12 = pd.merge(left=mr1, right=mr2, how='inner', left_index=True, right_index=True)
                diff = mr12["r1"] - mr12["r2"]
                sub_score += diff.mean()
                score += sub_score
                
            

In [10]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor
        
    def fit(self, X):
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        preds = self.predictor.predict(userID)
        rec_movies = sorted(preds.items(), key=lambda item: item[1], reverse=True)
        if not rec_seen:
            rec_movies = [m for m in rec_movies if m[0] not in self.uim.movies_from_user(userID)]
        return dict(rec_movies[:n])

In [11]:
md = MovieData('movielens/movies.dat')
uim = UserItemData('movielens/user_ratedmovies.dat', min_ratings=1000) 

In [12]:
my_ratings = [(20,4),(480,4.5),(1270,3.5),(6539,5),(1196,3.5),(260,3),(541,4),(2571,5),(8961,4),(1240,4),(589,5),(1036,5),
             (1721,5),(648,3.5),(5349,3.5),(2628,3),(597,2),(1291,3.5),(457,4.5)]

In [13]:
uim_my = UserItemData('movielens/user_ratedmovies.dat', min_ratings=1000) 

In [14]:
for movie, rating in my_ratings:
    uim_my.data = uim_my.data.append([{"userID":1, "movieID": movie, "rating":rating,
                                "date_day": 1, "date_month": 1, "date_year": 2021, "date_hour": 1,
                                "date_minute": 1, "date_second": 1}], ignore_index=True)

In [15]:
pr = ItemBasedPredictor()
rec = Recommender(pr)
rec.fit(uim_my)

In [16]:
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film {}: {}, ocena: {}".format(idmovie, md.get_title(idmovie), val))

Film 1704: Good Will Hunting, ocena: 5.0
Film 7438: Kill Bill: Vol. 2, ocena: 5.0
Film 6874: Kill Bill: Vol. 2, ocena: 4.810554185599607
Film 2762: The Sixth Sense, ocena: 4.7157258968696745
Film 20: Money Train, ocena: 4.545454545454545
Film 47: Shichinin no samurai, ocena: 4.419897285820535
Film 589: Terminator 2: Judgment Day, ocena: 4.384634226128983
Film 2028: Saving Private Ryan, ocena: 4.379178024873356
Film 1036: Die Hard, ocena: 4.340835904395168
Film 6539: Pirates of the Caribbean: The Curse of the Black Pearl, ocena: 4.323609422685804


In [17]:
pr = SlopeOnePredictor()
pr.fit(uim)
pr.predict(78)

KeyError: nan

In [24]:
pr.mr.loc[78, ~pr.mr.loc[78, :].notna()]

movieID
1       NaN
47      NaN
50      NaN
344     NaN
364     NaN
367     NaN
377     NaN
457     NaN
500     NaN
592     NaN
593     NaN
597     NaN
648     NaN
1036    NaN
1291    NaN
1527    NaN
1580    NaN
1704    NaN
1721    NaN
1961    NaN
2628    NaN
2683    NaN
3578    NaN
4306    NaN
4886    NaN
4963    NaN
4993    NaN
4995    NaN
5349    NaN
5418    NaN
5952    NaN
5989    NaN
6377    NaN
6539    NaN
7153    NaN
8961    NaN
32587   NaN
33794   NaN
Name: 78, dtype: float64

In [25]:
pr.mr[1].dropna().rename("r1")

userID
170      3.0
175      4.0
190      4.5
267      2.5
325      4.0
        ... 
71420    5.0
71483    4.0
71497    5.0
71509    4.0
71529    4.5
Name: r1, Length: 1263, dtype: float64

In [20]:
a = pd.Series([1,2,3,4,5])
b = pd.Series([0,1,2,3,4])

In [23]:
pr.mr[32]

userID
75       4.5
78       5.0
170      4.0
175      4.0
190      3.0
        ... 
71497    NaN
71509    3.0
71525    NaN
71529    NaN
71534    4.5
Name: 32, Length: 2083, dtype: float64