In [2]:
import numpy as np
import pandas as pd
import itertools as IT
import datetime
# from tqdm import tqdm


In [3]:
class UserItemData:
    
    def __init__(self,path,start_date=None,end_date=None,min_ratings=0):
        self.data = pd.read_csv(path,sep="\s+")
        
        if start_date is not None: # Take into account start_date
            d , m, y = start_date.strip().split(".")
            self.data = self.data.loc[ (self.data["date_year"]   >= int(y)) ]
            self.data = self.data.loc[~((self.data["date_year"] == int(y)) & (self.data["date_month"] < int(m)))]
            self.data = self.data.loc[~((self.data["date_year"] == int(y)) & (self.data["date_month"] == int(m)) & (self.data["date_day"] < int(d)))]
        
        if end_date is not None:
            d , m, y = end_date.strip().split(".")
            self.data = self.data.loc[ (self.data["date_year"] <= int(y)) ]
            self.data = self.data.loc[~((self.data["date_year"] == int(y)) & (self.data["date_month"] > int(m)))]
            self.data = self.data.loc[~((self.data["date_year"] == int(y)) & (self.data["date_month"] == int(m)) & (self.data["date_day"] >= int(d)))]
        
        nr = self.data["movieID"].value_counts()
        self.data = self.data.loc[self.data["movieID"].isin(nr.loc[(nr > min_ratings)].index.values)]
        
    def nratings(self):
        return len(self.data.index)
    
    def movies_from_user(self, uid):
        user_mask = self.data["userID"] == uid
        return self.data["movieID"].loc[user_mask]

In [4]:
class MovieData:
    def __init__(self,path):
        self.data =  pd.read_csv(path,sep="\t+",usecols=["id","title"],engine="python")
    
    def get_title(self,id):
        try:
            return self.data["title"].loc[self.data["id"] == id].values[0]
        except IndexError:
            return None        

In [5]:
class RandomPredictor:
    def __init__(self, min_grade=0, max_grade=5):
        self.min = min_grade
        self.max = max_grade
        
    def fit(self,X):
        self.movies = X.data["movieID"].values
        
    def predict(self,uid):
        preds = dict()
        for m in self.movies:
            preds[m] = np.random.randint(self.min, self.max+1)
        return preds

In [6]:
class AveragePredictor:
    def __init__(self, b):
        self.b = b
    
    def fit(self, X):
        g_avg = X.data["rating"].mean()
        m_sum = X.data[["movieID", "rating"]].groupby("movieID").sum().rename(columns={"rating":"sum"})
        m_count = X.data[["movieID", "rating"]].groupby("movieID").count().rename(columns={"rating":"count"})
        data = m_sum.merge(right=m_count, how='inner', left_index=True, right_index=True)
        preds = data.apply(lambda x: pd.Series([(x['sum'] + self.b * g_avg)/(x['count']+self.b)], index=['ocena']), axis=1)
        self.preds = dict(zip(preds.index, preds.ocena))
        
    def predict(self, uid):
        return self.preds

In [7]:
class ViewsPredictor:
    def fit(self,X):
        self.data = X.data
        
    def predict(self,uid):
        return self.data["movieID"].value_counts()

In [8]:
class STDPredictor:
    def __init__(self, n):
        self.n = n
    
    def fit(self,X):
        self.data = X.data
        
    def predict(self,uid):
        nr = self.data["movieID"].value_counts()
        movies = nr.loc[(nr > self.n)].index
        
        mask = self.data["movieID"].isin(movies)
        self.preds = self.data[["movieID","rating"]].loc[mask].groupby("movieID").std()["rating"]
        return self.preds

In [14]:
class ItemBasedPredictor:
    def __init__(self, min_ratings=0, threshold=0):
        self.mr = min_ratings
        self.thr = threshold
        
    def get_most_similar_movies(self, n=20):
        if not hasattr(self, 'sm'):
            return []
        else:
            sims = []
            for i, row in enumerate(self.sm.index[:-1]):
                for col in self.sm.columns[(i+1):]:
                    sims.append((col, row, self.sm[col][row]))
            return sorted(sims, key=lambda x: x[2], reverse=True)[:n]
        
    def similar_items(self, item, n):
        if not hasattr(self, 'sm'):
            return []
        else:
            try:
                return self.sm[item].sort_values(ascending=False)[1:(n+1)]
            except KeyError:
                return []
            
    def similarity(self, p1, p2):
        return self.sm[p1][p2]
        
    def build_similarity_matrix(self, m):
        sm = pd.DataFrame(pd.DataFrame(), columns=m.columns, index=m.columns)
        
        for c1 in m.columns:
            r1 = m[c1].dropna().rename("r1")
            for c2 in m.columns:
                # inner join
                r2 = m[c2].dropna().rename("r2")
                r12 = pd.merge(left=r1, right=r2, how='inner', left_index=True, right_index=True)
                
                if len(r12.index) < self.mr: # if not enough ratings then similarity is 0
                    sm.at[c1, c2] = 0
                    continue
                
                if c1 == c2: # if same columns then similarity is 1
                    sm.at[c1, c2] = 1
                    continue
                
                # similarity calculation
                dot_product = r12["r1"] @ r12["r2"]
                norm_1 = np.linalg.norm(r12["r1"])
                norm_2 = np.linalg.norm(r12["r2"])
                similarity = dot_product/(norm_1 * norm_2) if (norm_1 * norm_2) != 0 else 0
                
                # if below threshold then similarity is 0
                sm.at[c1, c2] = similarity if similarity >= self.thr else 0
        self.sm = sm
    
    def fit(self, X):
        # Pivot table and normalize by subtracting ratings by their users average rating
        mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")
        mr['avg'] = mr.mean(axis=1)
        self.urm = mr
        norm = mr.sub(mr['avg'], axis=0)
        del norm['avg']
        self.urm_norm = norm
        self.build_similarity_matrix(norm)   
    
    def predict(self, userID):
        ra = self.urm['avg'][userID]
        
        user_rated_movies = self.urm.loc[userID, :][:-1].dropna().index # movies rated by userID
        
        self.preds = dict()
        # Calculate prediction score for each movie
        for m in self.urm.columns[:-1]: # score(userID, m)
            s1 = 0
            s2 = 0
            for um in user_rated_movies:
                s = self.similarity(m, um)
                rating_scale = self.urm[um][userID] - ra
                s1 += s*rating_scale
                s2 += s
            score = (s1/s2)+ra
            self.preds[m] = score
        return self.preds

In [15]:
class SlopeOnePredictor:
    def fit(self, X):
        # Pivot table and normalize by subtracting ratings by their users average rating
        self.mr = X.data.pivot_table(index="userID", columns="movieID", values="rating")
    
    def predict(self, userID):
        user_rated_movies = self.mr.loc[userID, :].dropna()
        user_unrated_movies = self.mr.loc[userID, ~self.mr.loc[userID, :].notna()]
        
        preds = []
        
        for unrated_movie in user_unrated_movies.index: # Predicting scores for unrated movies ...
            score = 0
            n = 0
            mr1 = self.mr[unrated_movie].dropna().rename("r1")
            for rated_movie in user_rated_movies.index: # ...with help of rated movies
                sub_score = self.mr.loc[userID, rated_movie]
                mr2 = self.mr[rated_movie].dropna().rename("r2")
                
                # Inner join on both movies -> only ratings from users who rated both movies
                mr12 = pd.merge(left=mr1, right=mr2, how='inner', left_index=True, right_index=True)
                diff = mr12["r1"] - mr12["r2"]
                sub_score += diff.mean()
                sub_score *= len(mr12.index)
                n += len(mr12.index)
                score += sub_score
            score  = score/n if n != 0 else 0
            preds.append(score)
        to_append = pd.Series(preds, index=user_unrated_movies.index)
        self.preds = user_rated_movies.append(to_append)
        return dict(self.preds)

In [11]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor
        
    def fit(self, X):
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        preds = self.predictor.predict(userID)
        rec_movies = sorted(preds.items(), key=lambda x: x[1], reverse=True)
        if not rec_seen:
            rec_movies = [m for m in rec_movies if m[0] not in self.uim.movies_from_user(userID).values]
        return dict(rec_movies[:n])

In [12]:
md = MovieData('movielens/movies.dat')
uim = UserItemData('movielens/user_ratedmovies.dat') 

In [13]:
pr = AveragePredictor(100)
rec = Recommender(pr)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [36]:
pr = AveragePredictor(100)
rec = Recommender(pr)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Plan 9 from Outer Space, ocena: 1.3449520951495717
Film: The Passion of the Christ, ocena: 1.281493459525735
Film: The Texas Chainsaw Massacre, ocena: 1.235349321908819
Film: Jackass Number Two, ocena: 1.2189769976366684
Film: White Chicks, ocena: 1.1899581424297319


In [16]:
my_ratings = [(20,4),(480,4.5),(1270,3.5),(6539,5),(1196,3.5),(260,3),(541,4),(2571,5),(8961,4),(1240,4),(589,5),(1036,5),
             (1721,5),(648,3.5),(5349,3.5),(2628,3),(597,2),(1291,3.5),(457,4.5)]

In [17]:
uim_my = UserItemData('movielens/user_ratedmovies.dat', min_ratings=1000) 

In [18]:
for movie, rating in my_ratings:
    uim_my.data = uim_my.data.append([{"userID":1, "movieID": movie, "rating":rating,
                                "date_day": 1, "date_month": 1, "date_year": 2021, "date_hour": 1,
                                "date_minute": 1, "date_second": 1}], ignore_index=True)

In [19]:
pr = ItemBasedPredictor()
rec = Recommender(pr)
rec.fit(uim_my)

In [20]:
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film {}: {}, ocena: {}".format(idmovie, md.get_title(idmovie), val))

Film 1704: Good Will Hunting, ocena: 5.0
Film 7438: Kill Bill: Vol. 2, ocena: 5.0
Film 6874: Kill Bill: Vol. 2, ocena: 4.810554185599607
Film 2762: The Sixth Sense, ocena: 4.7157258968696745
Film 47: Shichinin no samurai, ocena: 4.419897285820535
Film 2028: Saving Private Ryan, ocena: 4.379178024873356
Film 2858: American Beauty, ocena: 4.320662461778854
Film 593: The Silence of the Lambs, ocena: 4.291815661708156
Film 2959: Fight Club, ocena: 4.286300980895842
Film 32587: Sin City, ocena: 4.285364107974284


In [21]:
pr = SlopeOnePredictor()
rec = Recommender(pr)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Film {}: {}, ocena: {}".format(idmovie, md.get_title(idmovie), val))

Predictions for 78: 
Film 50: The Usual Suspects, ocena: 4.325079182263173
Film 4993: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.155293229840448
Film 7153: The Lord of the Rings: The Return of the King, ocena: 4.153135076202185
Film 593: The Silence of the Lambs, ocena: 4.127978169643881
Film 47: Shichinin no samurai, ocena: 4.119790444913598
Film 5952: The Lord of the Rings: The Two Towers, ocena: 4.083325894849594
Film 1291: Indiana Jones and the Last Crusade, ocena: 3.9670398355464194
Film 8961: The Incredibles, ocena: 3.9664496674557546
Film 1704: Good Will Hunting, ocena: 3.963362387354114
Film 32587: Sin City, ocena: 3.942619137615212
Film 33794: Batman Begins, ocena: 3.9375326640077017
Film 4995: A Beautiful Mind, ocena: 3.9140940935239508
Film 1961: Rain Man, ocena: 3.9107819079644943
Film 4886: Monsters, Inc., ocena: 3.8819375978658006
Film 6377: Finding Nemo, ocena: 3.8807711131654794


In [22]:
uim = UserItemData('movielens/user_ratedmovies.dat', start_date='12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

72784


In [27]:
md.get_title(65089)

IndexError: index 0 is out of bounds for axis 0 with size 0