In [31]:
import numpy as np
import pandas as pd

In [2]:
class UserItemData:
    """User-movie ratings loaded from a whitespace-separated file.

    The ratings table is stored in ``self.data`` (a pandas DataFrame) and can
    be narrowed at load time by a date window and by how often each movie was
    rated.
    """

    def __init__(self,path,start_date=None,end_date=None,min_ratings=0):
        """Read the ratings file and apply the optional filters.

        Args:
            path: File passed to ``pd.read_csv``; fields are separated by runs
                of whitespace.
            start_date: Optional ``"day.month.year"`` string. Rows dated before
                it are dropped; the start day itself is kept.
            end_date: Optional ``"day.month.year"`` string. Rows dated on or
                after it are dropped (the end day is excluded).
            min_ratings: Only movies with strictly more than this many ratings
                (counted after the date filters) are kept.

        Raises:
            ValueError: If a date string is not three dot-separated integers.
        """
        # Raw string: "\s" is not a valid Python escape sequence, so the
        # non-raw spelling triggers an invalid-escape warning at compile time.
        self.data = pd.read_csv(path, sep=r"\s+")

        if start_date is not None: # Take into account start_date
            day, month, year = self._parse_date(start_date)
            years = self.data["date_year"]
            months = self.data["date_month"]
            days = self.data["date_day"]
            keep = (
                (years >= year)
                & ~((years == year) & (months < month))
                & ~((years == year) & (months == month) & (days < day))
            )
            self.data = self.data.loc[keep]

        if end_date is not None: # Take into account end_date
            day, month, year = self._parse_date(end_date)
            years = self.data["date_year"]
            months = self.data["date_month"]
            days = self.data["date_day"]
            keep = (
                (years <= year)
                & ~((years == year) & (months > month))
                & ~((years == year) & (months == month) & (days >= day))
            )
            self.data = self.data.loc[keep]

        # Keep only movies whose number of ratings is strictly greater than
        # min_ratings.
        # NOTE(review): a movie with exactly min_ratings ratings is dropped;
        # confirm that the strict comparison is intended.
        nr = self.data["movieID"].value_counts()
        self.data = self.data.loc[self.data["movieID"].isin(nr.loc[(nr > min_ratings)].index.values)]

    @staticmethod
    def _parse_date(date):
        """Split a ``"day.month.year"`` string into three integers."""
        day, month, year = (int(part) for part in date.strip().split("."))
        return day, month, year

    def nratings(self):
        """Return the number of rating rows left after filtering."""
        return len(self.data.index)

    def movies_from_user(self, uid): # Returns all movies from user
        """Return the ``movieID`` values (as a Series) of rows whose ``userID`` equals ``uid``."""
        user_mask = self.data["userID"] == uid
        return self.data["movieID"].loc[user_mask]

In [3]:
class MovieData:
    """Lookup table of movie ids and titles read from a tab-separated file."""

    def __init__(self,path):
        """Load only the ``id`` and ``title`` columns of the file at ``path``."""
        self.data = pd.read_csv(
            path,
            sep="\t+",
            usecols=["id","title"],
            engine="python",
        )

    def get_title(self,id): # Tries to find the title of the given movie id
        """Return the title of the first row whose ``id`` matches, or None if there is none."""
        titles = self.data.loc[self.data["id"] == id, "title"].values
        if len(titles) == 0:
            # No row carries this id.
            return None
        return titles[0]

In [10]:
class Recommender:
    """Turns a predictor's per-movie scores into a top-n recommendation list."""

    def __init__(self, predictor):
        """Store the predictor; it must provide ``fit(X)`` and ``predict(userID)``."""
        self.predictor = predictor

    def fit(self, X):
        """Remember the ratings object ``X`` and fit the predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the ``n`` best-scored movies for ``userID``.

        Args:
            userID: User passed on to ``predictor.predict``.
            n: Maximum number of movies to return.
            rec_seen: When False, movies returned by
                ``self.uim.movies_from_user(userID)`` are left out.

        Returns:
            dict mapping movie id to predicted score, in descending score order.
        """
        # Get predictions
        preds = self.predictor.predict(userID)
        # Sort predictions
        rec_movies = sorted(preds.items(), key=lambda x: x[1], reverse=True)
        if not rec_seen:
            # Exclude seen movies. The seen set is built once; looking it up
            # inside the loop would re-filter the ratings table per candidate.
            seen = set(self.uim.movies_from_user(userID).values)
            rec_movies = [m for m in rec_movies if m[0] not in seen]
        return dict(rec_movies[:n]) # Return a slice of first n predictions

In [4]:
class RandomPredictor:
    """Baseline predictor that assigns every movie a random integer grade."""

    def __init__(self, min_grade=0, max_grade=5):
        """Store the inclusive grade bounds ``min_grade`` and ``max_grade``."""
        self.min = min_grade
        self.max = max_grade

    def fit(self,X):
        """Collect the distinct movie ids found in ``X.data["movieID"]``."""
        # One entry per movie (in order of first appearance) instead of one
        # per rating row, so predict() draws a single grade per movie.
        self.movies = X.data["movieID"].unique()

    def predict(self,uid):
        """Return a dict mapping each movie id to a random grade in [min, max].

        ``uid`` is ignored; grades come from ``np.random.randint``.
        """
        # randint's upper bound is exclusive, hence the +1.
        return {m: np.random.randint(self.min, self.max+1) for m in self.movies}

In [27]:
class AveragePredictor:
    """Scores each movie by its average rating, pulled towards the global average.

    The score of a movie is ``(sum + b * g_avg) / (count + b)`` where ``sum``
    and ``count`` describe the movie's ratings and ``g_avg`` is the mean of all
    ratings.
    """

    def __init__(self, b):
        """Store ``b``, the weight given to the global average."""
        self.b = b

    def fit(self, X):
        """Keep a reference to the ratings table ``X.data``."""
        self.data = X.data

    def predict(self, uid):
        """Return a dict mapping movie id to its weighted average rating.

        ``uid`` is ignored: the scores are the same for every user. The scores
        are also stored in ``self.preds`` as a one-column (``ocena``) DataFrame.
        """
        # Get the overall average (global average)
        g_avg = self.data["rating"].mean()
        # Sum and number of ratings per movie in a single pass
        stats = self.data.groupby("movieID")["rating"].agg(["sum", "count"])
        # Vectorised weighted average; avoids building a Series per row
        scores = (stats["sum"] + self.b * g_avg) / (stats["count"] + self.b)
        self.preds = scores.to_frame("ocena")
        return dict(zip(self.preds.index, self.preds["ocena"]))

In [6]:
class ViewsPredictor:
    """Scores each movie by how many rating rows it has."""

    def fit(self,X):
        """Keep a reference to the ratings table ``X.data``."""
        self.data = X.data

    def predict(self,uid):
        """Return a Series mapping movie id to its number of ratings (``uid`` is ignored)."""
        movie_ids = self.data["movieID"]
        # value_counts gives one entry per movie id, most frequent first.
        views = movie_ids.value_counts()
        return views

In [29]:
class STDPredictor:
    """Scores each sufficiently rated movie by the standard deviation of its ratings."""

    def __init__(self, n):
        """Store ``n``; only movies with more than ``n`` ratings are scored."""
        self.n = n

    def fit(self,X):
        """Keep a reference to the ratings table ``X.data``."""
        self.data = X.data

    def predict(self,uid):
        """Return a Series mapping movie id to the std of its ratings (``uid`` is ignored).

        The result is also stored in ``self.preds``.
        """
        # Movies rated more than self.n times
        counts = self.data["movieID"].value_counts()
        frequent_movies = counts[counts > self.n].index
        rows = self.data["movieID"].isin(frequent_movies)
        # Per-movie standard deviation over the remaining rows
        selected = self.data.loc[rows, ["movieID","rating"]]
        self.preds = selected.groupby("movieID")["rating"].std()
        return self.preds

In [30]:
class ItemBasedPredictor:
    """Item-based collaborative filtering on user-mean-centred ratings.

    ``fit`` builds a movie-by-movie cosine similarity matrix (``self.sm``) from
    ratings that had each user's average subtracted; ``predict`` combines a
    user's own rating deviations, weighted by those similarities.
    """

    def __init__(self, min_ratings=0, threshold=0):
        """Store the similarity settings.

        Args:
            min_ratings: Minimum number of users who rated both movies; pairs
                with fewer common raters get similarity 0.
            threshold: Similarities below this value are stored as 0.
        """
        self.mr = min_ratings
        self.thr = threshold

    def get_most_similar_movies(self, n=20):
        """Return the ``n`` most similar movie pairs as ``(movie, movie, similarity)`` tuples.

        Returns an empty list when the similarity matrix has not been built yet.
        """
        if not hasattr(self, 'sm'):
            # Similarity matrix was not calculated yet
            return []
        rows = list(self.sm.index)
        cols = list(self.sm.columns)
        values = self.sm.to_numpy()
        # Upper triangle only (main diagonal excluded): every pair appears once.
        sims = [
            (cols[j], rows[i], values[i, j])
            for i in range(len(rows) - 1)
            for j in range(i + 1, len(cols))
        ]
        # Sort similarities and slice only first n
        return sorted(sims, key=lambda x: x[2], reverse=True)[:n]

    def similar_items(self, item, n):
        """Return a Series with the ``n`` movies most similar to ``item``.

        Returns an empty list when the similarity matrix has not been built
        yet or when ``item`` is not part of it.
        """
        if not hasattr(self, 'sm'):
            # Similarity matrix was not calculated yet
            return []
        try:
            sims = self.sm[item]
        except KeyError:
            # Movie not found in similarity matrix
            return []
        # Remove the item itself by label: it is not guaranteed to sort first
        # (ties at 1.0, or a zeroed diagonal when min_ratings is not met).
        others = sims.drop(item)
        return others.sort_values(ascending=False, kind="mergesort").iloc[:n]

    def similarity(self, m1, m2):
        """Return the stored similarity between movies ``m1`` and ``m2``.

        Raises:
            AttributeError: If the similarity matrix has not been built yet.
            KeyError: If either movie is not part of the matrix.
        """
        return self.sm.at[m2, m1]

    def build_similarity_matrix(self, m):
        """Compute the movie-by-movie cosine similarity matrix into ``self.sm``.

        Args:
            m: DataFrame with one column per movie and one row per user; NaN
                marks a missing rating. Rows are assumed to be distinct users.
        """
        movies = m.columns
        values = m.to_numpy(dtype=float)
        rated = ~np.isnan(values)
        count = len(movies)
        sim = np.zeros((count, count))

        for i in range(count):
            for j in range(i, count):
                # Users who rated both movies
                common = rated[:, i] & rated[:, j]
                if common.sum() < self.mr: # not enough ratings -> similarity stays 0
                    continue

                if i == j: # a movie is fully similar to itself
                    sim[i, j] = 1.0
                    continue

                # Cosine similarity over the common raters
                r1 = values[common, i]
                r2 = values[common, j]
                norms = np.linalg.norm(r1) * np.linalg.norm(r2)
                similarity = float(np.dot(r1, r2) / norms) if norms != 0 else 0.0

                # Below threshold -> similarity stays 0. The measure is
                # symmetric, so the value is mirrored rather than recomputed.
                if similarity >= self.thr:
                    sim[i, j] = similarity
                    sim[j, i] = similarity
        self.sm = pd.DataFrame(sim, index=movies, columns=movies)

    def fit(self, X):
        """Pivot ``X.data`` into a user-by-movie table and build the similarity matrix.

        Stores the pivot table with an extra ``avg`` column (per-user mean) in
        ``self.urm`` and the mean-centred ratings in ``self.urm_norm``.
        """
        # Pivot table and normalize by subtracting ratings by their users average rating
        mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")
        mr['avg'] = mr.mean(axis=1)
        self.urm = mr
        norm = mr.sub(mr['avg'], axis=0)
        del norm['avg']
        self.urm_norm = norm
        self.build_similarity_matrix(norm)

    def predict(self, userID):
        """Return a dict mapping every movie to its predicted rating for ``userID``.

        The score of movie ``m`` is the user's average plus the
        similarity-weighted mean of the user's rating deviations. When the
        similarities to all of the user's rated movies sum to zero, the user's
        average is returned instead of dividing by zero.

        Raises:
            KeyError: If ``userID`` has no row in the fitted table.
        """
        ra = self.urm.loc[userID, 'avg']

        # Ratings of this user, without the trailing 'avg' column
        user_ratings = self.urm.loc[userID].iloc[:-1].dropna()
        deviations = (user_ratings - ra).to_numpy(dtype=float)

        # Rows: movies rated by the user, columns: every movie
        sim_block = self.sm.loc[user_ratings.index].to_numpy(dtype=float)
        weighted = deviations @ sim_block
        totals = sim_block.sum(axis=0)

        self.preds = dict()
        for movie, s1, s2 in zip(self.sm.columns, weighted, totals):
            # No similarity mass -> no evidence; fall back to the user's average
            self.preds[movie] = (s1 / s2) + ra if s2 != 0 else ra
        return self.preds

In [9]:
class SlopeOnePredictor:
    """Weighted Slope One predictor over a user-by-movie ratings table."""

    def fit(self, X):
        """Pivot ``X.data`` into a user-by-movie table stored in ``self.mr``."""
        # Pivot table
        self.mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")

    def predict(self, userID):
        """Return a dict mapping every movie to a rating for ``userID``.

        Movies the user already rated keep their actual rating. Every other
        movie gets the average of ``user rating + mean difference`` over the
        user's rated movies, weighted by the number of users who rated both
        movies. Rated movies sharing no rater with the target are skipped; if
        none shares a rater the score is 0.

        Raises:
            KeyError: If ``userID`` has no row in the fitted table.
        """
        user_row = self.mr.loc[userID, :]
        # Get user rated and unrated movies
        user_rated_movies = user_row.dropna()
        unrated_index = user_row.index[~user_row.notna()]

        preds = []
        for unrated_movie in unrated_index: # Predicting scores for unrated movies ...
            score = 0
            n = 0
            target = self.mr[unrated_movie]
            for rated_movie, user_rating in user_rated_movies.items(): # ...with help of rated movies
                # Both columns share the table's index, so the difference is
                # NaN unless a user rated both movies.
                diff = (target - self.mr[rated_movie]).dropna()
                common = len(diff)
                if common == 0:
                    # No co-raters: the mean would be NaN and poison the sum
                    continue
                score += (user_rating + diff.mean()) * common
                n += common
            preds.append(score/n if n != 0 else 0)
        to_append = pd.Series(preds, index=unrated_index, dtype=float)
        # Series.append was removed in pandas 2.0; concat works everywhere
        self.preds = pd.concat([user_rated_movies, to_append])
        return dict(self.preds)

In [11]:
# Load the movie id/title table and the full, unfiltered ratings table.
# Both paths are relative to the notebook's working directory.
md = MovieData('movielens/movies.dat')
uim = UserItemData('movielens/user_ratedmovies.dat') 

In [28]:
# Top 5 not-yet-rated movies for user 78 according to the weighted average
# rating (b=100 pulls rarely rated movies towards the global average).
pr = AveragePredictor(100)
rec = Recommender(pr)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
print(rec_items)
for idmovie in rec_items:
    val = rec_items[idmovie]
    title = md.get_title(idmovie)
    print("Film: {}, ocena: {}".format(title, val))

{50: 4.225944245560473, 1221: 4.146907937910189, 6016: 4.116538340205236, 58559: 4.10413904093503, 1203: 4.103639627096175}
Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [None]:
# Re-create and re-fit the AveragePredictor-based recommender on the full
# ratings table (the disabled, commented-out printing code was removed).
pr = AveragePredictor(100)
rec = Recommender(pr)
rec.fit(uim)

In [None]:
# Hand-entered (movieID, rating) pairs for an additional user.
my_ratings = [
    (20, 4),
    (480, 4.5),
    (1270, 3.5),
    (6539, 5),
    (1196, 3.5),
    (260, 3),
    (541, 4),
    (2571, 5),
    (8961, 4),
    (1240, 4),
    (589, 5),
    (1036, 5),
    (1721, 5),
    (648, 3.5),
    (5349, 3.5),
    (2628, 3),
    (597, 2),
    (1291, 3.5),
    (457, 4.5),
]

In [None]:
# Separate copy of the ratings restricted to movies with more than 1000
# ratings; the hand-entered ratings are added to this copy, not to `uim`.
uim_my = UserItemData('movielens/user_ratedmovies.dat', min_ratings=1000) 

In [None]:
# Add the hand-entered ratings as user 1 with a fixed timestamp.
# All rows are built first and concatenated once: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row copies it on every step.
# NOTE: re-running this cell adds the same rows again; re-create `uim_my` first.
my_rows = pd.DataFrame([
    {"userID": 1, "movieID": movie, "rating": rating,
     "date_day": 1, "date_month": 1, "date_year": 2021, "date_hour": 1,
     "date_minute": 1, "date_second": 1}
    for movie, rating in my_ratings
])
uim_my.data = pd.concat([uim_my.data, my_rows], ignore_index=True)

In [None]:
# Fit the item-based recommender (default min_ratings=0, threshold=0) on the
# ratings table that includes the hand-entered user 1. Fitting builds the
# pairwise movie similarity matrix.
pr = ItemBasedPredictor()
rec = Recommender(pr)
rec.fit(uim_my)

In [None]:
# Ten best not-yet-rated movies for the hand-entered user 1.
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie in rec_items:
    val = rec_items[idmovie]
    title = md.get_title(idmovie)
    print("Film {}: {}, ocena: {}".format(idmovie, title, val))

In [None]:
# Slope One recommendations for user 78, fitted on the full ratings table.
pr = SlopeOnePredictor()
rec = Recommender(pr)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie in rec_items:
    val = rec_items[idmovie]
    title = md.get_title(idmovie)
    print("Film {}: {}, ocena: {}".format(idmovie, title, val))