In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
class UserItemData:
    """Ratings table read from a tab-separated file, optionally filtered.

    Parameters
    ----------
    path : str
        Location of the ratings file. It is read as tab-separated text in the
        ``windows-1250`` encoding; lines pandas cannot parse are skipped.
    start_date, end_date : str or None
        Date bounds written as ``'%d.%m.%Y'`` (for example ``'12.1.2007'``).
        A rating is kept when ``start_date <= date < end_date``. A bound left
        as ``None`` is simply not applied.
    min_ratings : int or None
        Keep only movies that have at least this many ratings among the rows
        that survived the date filter.

    Attributes
    ----------
    data : pandas.DataFrame
        The (possibly filtered) ratings. When any filter is requested the frame
        gains an ``Allratings`` column (ratings per movie); when a date bound is
        requested it also gains a ``date`` column.
    """

    def __init__(self, path, start_date=None, end_date=None, min_ratings=0):
        self.path = path
        self.start_date = start_date
        self.end_date = end_date
        self.min_ratings = min_ratings
        self.data = pd.read_csv(path, encoding="windows-1250", on_bad_lines='skip', sep='\t', skiprows=0)

        # Filter as soon as ANY criterion is supplied. Requiring both dates here
        # would silently ignore `min_ratings` (or a single date bound).
        if self.start_date is not None or self.end_date is not None or self.min_ratings:
            self.data = self.uredi_ratings()

    def nratings(self):
        """Return the number of rating rows currently held in ``self.data``."""
        return len(self.data)

    @staticmethod
    def _parse_bound(value):
        """Turn a ``'%d.%m.%Y'`` string into a ``datetime``; pass ``None`` through."""
        if value is None:
            return None
        return datetime.strptime(value, '%d.%m.%Y')

    def uredi_ratings(self):
        """Apply the date window and the minimum-ratings threshold to ``self.data``.

        Steps:
          1. If a date bound is set, assemble a ``date`` column from the
             ``date_day`` / ``date_month`` / ``date_year`` columns and keep rows
             with ``start_date <= date < end_date`` (each bound is optional).
          2. Count, per ``movieID``, how many rows remain and store the count in
             an ``Allratings`` column.
          3. If ``min_ratings`` is not ``None``, keep only rows whose movie has
             at least that many remaining ratings.

        Returns
        -------
        pandas.DataFrame
            The filtered frame, which is also stored back into ``self.data``.
        """
        ratings = self.data
        lower = self._parse_bound(self.start_date)
        upper = self._parse_bound(self.end_date)

        if lower is not None or upper is not None:
            # Vectorized date assembly instead of one strptime call per row.
            # The int64 cast mirrors the per-value int() conversion and makes
            # to_datetime see whole-number year/month/day components.
            parts = (
                ratings[['date_year', 'date_month', 'date_day']]
                .astype('int64')
                .rename(columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'})
            )
            ratings = ratings.assign(date=pd.to_datetime(parts))

            if lower is not None:
                ratings = ratings[ratings['date'] >= lower]   # start is inclusive
            if upper is not None:
                ratings = ratings[ratings['date'] < upper]    # end is exclusive

        # assign() builds a new frame, so no column is written onto a filtered
        # slice (which is what triggers pandas' SettingWithCopyWarning).
        ratings = ratings.assign(
            Allratings=ratings.groupby('movieID')['movieID'].transform('size')
        )

        if self.min_ratings is not None:
            ratings = ratings.loc[ratings['Allratings'] >= self.min_ratings]

        self.data = ratings
        return self.data


# Load the complete ratings file with no filtering and print the number of rows.
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

# Reload with a date window (start inclusive, end exclusive) and keep only movies
# that have at least 100 ratings inside that window, then print the row count.
uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73584


In [4]:
class MovieData:
    """Movie table read from a tab-separated file.

    Parameters
    ----------
    path : str
        Location of the movies file. It is read as tab-separated text in the
        ``windows-1250`` encoding; lines pandas cannot parse are skipped.

    Attributes
    ----------
    movies : pandas.DataFrame
        The loaded table. ``get_title`` relies on its ``id`` and ``title`` columns.
    """

    def __init__(self, path):
        self.path = path
        self.movies = pd.read_csv(
            path,
            sep='\t',
            encoding="windows-1250",
            on_bad_lines='skip',
            skiprows=0,
        )

    def get_title(self, movieID):
        """Return the ``title`` of the first row whose ``id`` equals ``movieID``.

        Raises
        ------
        IndexError
            If no row has the requested id (indexing an empty result).
        """
        is_requested = self.movies["id"] == movieID
        matching_titles = self.movies.loc[is_requested, "title"]
        return matching_titles.values[0]

# Load the movie table and print the title stored for movie id 1.
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


In [24]:
class RandomPredictor:
    """Baseline predictor that gives every movie a uniformly random integer rating.

    Parameters
    ----------
    min, max : int
        Inclusive bounds of the generated ratings.
    """

    def __init__(self, min, max):
        self.min = min
        self.max = max
        self.uim = None  # set by fit(); an object exposing a `data` table with a "movieID" column

    def fit(self, X):
        """Remember the rating data ``X``; nothing is learned from it."""
        self.uim = X

    def predict(self, user_id):
        """Return ``{movieID: random rating}`` for every distinct movie in the fitted data.

        The ratings come from ``np.random.randint`` and do not depend on
        ``user_id``; the argument is accepted only to satisfy the predictor
        interface. Call ``np.random.seed`` beforehand for reproducible output.

        Returns
        -------
        dict or None
            The mapping described above. If ``fit`` has not been called, prints
            ``"none"`` and returns ``None``.
        """
        if self.uim is None:
            print("none")
            return None

        # One draw per distinct movie: drawing per rating row would generate a
        # value for every duplicate id only for the dict to overwrite it.
        movie_ids = pd.unique(np.asarray(self.uim.data["movieID"]))
        ratings = np.random.randint(self.min, self.max + 1, len(movie_ids))
        return dict(zip(movie_ids, ratings))

# Load the movie table and the unfiltered ratings, then fit a predictor that
# draws integer ratings between 1 and 5 (both inclusive).
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
# predict() returns a dict keyed by movie id. No random seed is set in the
# visible cells, so the printed ratings are expected to change between runs.
pred = rp.predict(78)
print(type(pred))
# Show the predicted rating for a handful of movie ids.
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 4
Film: Grumpy Old Men, ocena: 2
Film: Money Train, ocena: 4
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 1


In [27]:
class Recommender:
    """Wrapper that pairs a predictor with rating data.

    Parameters
    ----------
    predictor : object
        Stored on the instance as ``self.predictor``.
    """

    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None  # set by fit(); an object exposing a `data` table with a "userID" column

    def fit(self, X):
        """Remember the rating data ``X`` for later calls to ``recommend``."""
        self.uim = X

    def recommend(self, userID, n = 10, rec_seen = True):
        """Return the rows of the fitted rating data that belong to ``userID``.

        NOTE(review): ``n``, ``rec_seen`` and ``self.predictor`` are accepted
        but not used yet, so this returns the user's existing ratings rather
        than a ranked list of recommendations.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called. Without this guard the method fails
            with an ``UnboundLocalError`` on its return statement.
        """
        if self.uim is None:
            raise RuntimeError("Recommender.fit() must be called before recommend()")

        ratings = self.uim.data
        return ratings.loc[ratings["userID"] == userID]


# Load the movie table and the unfiltered ratings, then wrap a random predictor
# (integer ratings 1..5) in a Recommender and hand it the rating data.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
# The loop below unpacks (movie id, value) pairs; it is disabled because
# recommend() currently returns a DataFrame of user 78's rating rows instead.
#for idmovie, val in rec_items:
#    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

# Display the returned DataFrame as the cell output.
rec_items

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
55,78,17,4.0,10,10,2004,15,34,55
56,78,29,4.5,16,4,2007,4,57,48
57,78,32,5.0,7,5,2004,23,32,18
58,78,41,4.5,10,10,2004,15,56,39
59,78,82,4.0,7,5,2004,23,20,36
...,...,...,...,...,...,...,...,...,...
518,78,46578,5.0,16,4,2007,2,32,25
519,78,46976,4.5,16,4,2007,2,29,47
520,78,48774,4.5,16,4,2007,2,30,18
521,78,49272,3.0,16,4,2007,2,30,48
