In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
class UserItemData:
    """Ratings table read from a tab-separated file, optionally filtered.

    Parameters
    ----------
    path : str
        Location of the ratings file. It is read as tab-separated text in
        windows-1250 encoding; malformed lines are skipped.
    start_date, end_date : str or None
        Date bounds in ``'%d.%m.%Y'`` format. Ratings dated on or after
        ``start_date`` and strictly before ``end_date`` are kept. Either bound
        may be omitted independently of the other.
    min_ratings : int or None
        Keep only movies that have at least this many ratings. The count is
        taken after the date filter has been applied.

    Attributes
    ----------
    data : pandas.DataFrame
        The (possibly filtered) ratings. When any filter is active it also
        carries an ``Allratings`` column (ratings per movie) and, when a date
        bound was given, a ``date`` column.
    """

    def __init__(self, path, start_date=None, end_date=None, min_ratings=0):
        self.path = path
        self.start_date = start_date
        self.end_date = end_date
        self.min_ratings = min_ratings
        self.data = pd.read_csv(path, encoding="windows-1250", on_bad_lines='skip', sep='\t', skiprows=0)

        # Each filter is honoured on its own: a lone date bound or a lone
        # min_ratings must not be ignored just because the others are missing.
        has_date_filter = start_date is not None or end_date is not None
        if has_date_filter or min_ratings:
            self.data = self.uredi_ratings()

    def nratings(self):
        """Return the number of rating rows currently held in ``data``."""
        return len(self.data)

    def uredi_ratings(self):
        """Apply the configured date and popularity filters to ``data``.

        Builds a ``date`` column from ``date_day``/``date_month``/``date_year``
        (only when a date bound is set), drops ratings outside the requested
        range, adds ``Allratings`` (number of remaining ratings per
        ``movieID``) and finally keeps the movies with at least
        ``min_ratings`` ratings.

        Returns
        -------
        pandas.DataFrame
            The filtered frame, which is also stored back in ``self.data``.
        """
        data = self.data

        if self.start_date is not None or self.end_date is not None:
            # Vectorised date assembly; astype(int) mirrors the int() cast the
            # per-row version used (and likewise fails on missing values).
            parts = data[['date_year', 'date_month', 'date_day']].astype(int)
            parts.columns = ['year', 'month', 'day']
            data = data.assign(date=pd.to_datetime(parts))

            if self.start_date is not None:
                start_d = datetime.strptime(self.start_date, '%d.%m.%Y')
                data = data[data['date'] >= start_d]
            if self.end_date is not None:
                end_d = datetime.strptime(self.end_date, '%d.%m.%Y')
                # The upper bound is exclusive.
                data = data[data['date'] < end_d]

        # assign() returns a new frame, so no column is written onto a slice.
        data = data.assign(Allratings=data.groupby('movieID')['movieID'].transform('size'))

        if self.min_ratings is not None:
            data = data[data['Allratings'] >= self.min_ratings]

        self.data = data
        return self.data


# Load the ratings file with the default arguments and report its row count.
uim = UserItemData(path='data/user_ratedmovies.dat')
print(uim.nratings())

# Same file, restricted to a date window and to movies with >= 100 ratings.
uim = UserItemData(
    path='data/user_ratedmovies.dat',
    start_date='12.1.2007',
    end_date='16.2.2008',
    min_ratings=100,
)
print(uim.nratings())

855598
73584


In [4]:
class MovieData:
    """Movie table read from a tab-separated file (windows-1250 encoded)."""

    def __init__(self, path):
        self.path = path
        self.movies = pd.read_csv(
            path,
            sep='\t',
            encoding="windows-1250",
            on_bad_lines='skip',
            skiprows=0,
        )

    def get_title(self, movieID):
        """Return the ``title`` of the first row whose ``id`` equals ``movieID``.

        Raises ``IndexError`` when no row matches.
        """
        is_match = self.movies["id"] == movieID
        titles = self.movies.loc[is_match, "title"]
        return titles.values[0]

# Load the movie table and print the title stored for id 1.
md = MovieData(path='data/movies.dat')
print(md.get_title(movieID=1))

Toy story


In [256]:
class RandomPredictor:
    """Predictor that assigns every movie a uniformly random integer score.

    Parameters
    ----------
    min, max : int
        Inclusive bounds of the generated scores.
    """

    def __init__(self, min, max):
        self.min = min
        self.max = max
        self.uim = None

    def fit(self, X):
        """Remember the ratings container ``X`` (an object with a ``data`` frame)."""
        self.uim = X

    def predict(self, userID, rec_seen=True):
        """Return ``{movieID: random score}`` for the movies in the fitted data.

        Parameters
        ----------
        userID
            User the prediction is made for.
        rec_seen : bool
            When False, movies this user has already rated are left out.

        Returns
        -------
        dict or None
            One entry per distinct movie. ``None`` (after printing ``"none"``)
            when ``fit`` has not been called.

        The fitted data is only read, never modified, so repeated calls and
        calls for different users all see the complete ratings table.
        """
        if self.uim is None:
            print("none")
            return None

        ratings = self.uim.data
        movie_ids = ratings["movieID"]
        if not rec_seen:
            seen = ratings.loc[ratings["userID"] == userID, "movieID"]
            movie_ids = movie_ids[~movie_ids.isin(seen)]

        # One score per distinct movie rather than one per rating row.
        movie_ids = movie_ids.unique()
        scores = np.random.randint(self.min, self.max + 1, len(movie_ids))
        return dict(zip(movie_ids.tolist(), scores.tolist()))

# Fit a random predictor on the full ratings table and score movies for user 78.
md = MovieData(path='data/movies.dat')
uim = UserItemData(path='data/user_ratedmovies.dat')
rp = RandomPredictor(min=1, max=5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))

# Show the predicted score for a handful of movie ids.
items = [1, 3, 20, 50, 100]
for item in items:
    title = md.get_title(item)
    score = pred[item]
    print("Film: {}, ocena: {}".format(title, score))

<class 'dict'>
Film: Toy story, ocena: 5
Film: Grumpy Old Men, ocena: 5
Film: Money Train, ocena: 4
Film: The Usual Suspects, ocena: 1
Film: City Hall, ocena: 5


In [197]:
class Recommender:
    """Ranks a predictor's scores and returns the best-scoring movies."""

    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        """Store ``X`` and forward it to the wrapped predictor's ``fit``."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the ``n`` highest-scored ``{movieID: score}`` pairs.

        Scores come from ``predictor.predict(userID, rec_seen)``. The result is
        ordered from highest to lowest score; equal scores keep the order in
        which the predictor returned them.
        """
        scores = self.predictor.predict(userID, rec_seen)
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        top = ranked[:n]
        return dict(top)


# Recommend five movies to user 78 using random scores.
md = MovieData(path='data/movies.dat')
uim = UserItemData(path='data/user_ratedmovies.dat')
rp = RandomPredictor(min=1, max=5)
rec = Recommender(predictor=rp)
rec.fit(uim)

rec_items = rec.recommend(userID=78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    title = md.get_title(idmovie)
    print("Film: {}, ocena: {}".format(title, val))

Film: Ransom, ocena: 5
Film: Gone with the Wind, ocena: 5
Film: Das Boot, ocena: 5
Film: Armageddon, ocena: 5
Film: The General's Daughter, ocena: 5


In [257]:
class AveragePredictor:
    """Predictor that scores each movie by its (damped) mean rating.

    For a movie with ``n`` ratings summing to ``vs`` and a global mean rating
    ``g_avg`` the score is::

        avg = (vs + b * g_avg) / (n + b)

    Parameters
    ----------
    b : number
        Damping weight; negative values are replaced by 0.
    """

    def __init__(self, b):
        self.b = b if b >= 0 else 0
        self.uim = None
        self.g_avg = None
        self._avg = None

    def fit(self, X):
        """Store ``X`` and compute the per-movie scores from ``X.data``.

        The statistics are derived once here, without adding columns to or
        removing rows from ``X.data``.
        """
        self.uim = X
        ratings = X.data
        self.g_avg = ratings['rating'].mean()
        # sort=False keeps movies in order of first appearance in the data.
        per_movie = ratings.groupby('movieID', sort=False)['rating'].agg(['sum', 'size'])
        self._avg = (per_movie['sum'] + self.b * self.g_avg) / (per_movie['size'] + self.b)

    def predict(self, userID, rec_seen=True):
        """Return ``{movieID: avg}`` for the fitted movies.

        Parameters
        ----------
        userID
            User the prediction is made for.
        rec_seen : bool
            When False, movies this user has already rated are left out.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._avg is None:
            raise RuntimeError("AveragePredictor.fit must be called before predict")

        scores = self._avg
        if not rec_seen:
            ratings = self.uim.data
            seen = ratings.loc[ratings['userID'] == userID, 'movieID']
            scores = scores[~scores.index.isin(seen)]
        return scores.to_dict()


# Recommend five movies to user 78 using the damped average rating (b = 100).
md = MovieData(path='data/movies.dat')
uim = UserItemData(path='data/user_ratedmovies.dat')
rp = AveragePredictor(b=100)
rec = Recommender(predictor=rp)
rec.fit(uim)

rec_items = rec.recommend(userID=78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    title = md.get_title(idmovie)
    print("Film: {}, ocena: {}".format(title, val))

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [258]:
class ViewsPredictor:
    """Predictor that scores each movie by how many ratings it received."""

    def __init__(self):
        self.uim = None

    def fit(self, X):
        """Remember the ratings container ``X`` (an object with a ``data`` frame)."""
        self.uim = X

    def predict(self, userID, rec_seen=True):
        """Return ``{movieID: number of ratings}`` for the fitted data.

        Parameters
        ----------
        userID
            User the prediction is made for.
        rec_seen : bool
            When False, movies this user has already rated are left out.
            Defaults to True, matching the other predictors in this file.

        The fitted data is only read, never modified.
        """
        ratings = self.uim.data
        views = ratings.groupby("movieID")["userID"].count()
        if not rec_seen:
            seen = ratings.loc[ratings["userID"] == userID, "movieID"]
            views = views[~views.index.isin(seen)]
        return views.to_dict()

# Recommend five movies to user 78 using the number of ratings per movie.
md = MovieData(path='data/movies.dat')
uim = UserItemData(path='data/user_ratedmovies.dat')
rp = ViewsPredictor()
rec = Recommender(predictor=rp)
rec.fit(uim)

rec_items = rec.recommend(userID=78, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    title = md.get_title(idmovie)
    print("Film: {}, ocena: {}".format(title, val))

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404
