In [41]:
import os
import numpy as np  
import pandas as pd

from surprise import ( 
  Dataset,
  Reader,
  accuracy, 
  SVD,
  AlgoBase,
  BaselineOnly
) 

from surprise.model_selection import (
  train_test_split
)



In [42]:
class DataLoader: 
  
  def __init__(self, data_path: str, item_path: str, user_path: str) -> None:
    
    current = os.getcwd () [ 0 : os.getcwd ().rfind( '\\' ) ]
    self.DATA_PATH = current + data_path
    self.ITEM_PATH = current + item_path
    self.USER_PATH = current + user_path

    self.data_set = self.load_set ( 'DATA' )
    self.item_set = self.load_set ( 'USER' )
    self.user_set = self.load_set ( 'ITEM' )

  def load_set (self, name: str ) -> pd.DataFrame:

    if name == 'DATA':
      columns = [ 'userID', 'itemID', 'rating', 'timestamp' ]
      df = pd.read_csv ( 
        self.DATA_PATH, 
        names=columns, 
        sep='\t', 
        encoding='latin-1', 
        skipinitialspace=True 
      )
      # df.drop ( columns= [ 'timestamp' ] )
      return df
  
    if name == 'USER':
      columns = [ 'userID', 'age', 'gender', 'occupation', 'zipCode' ]
      df = pd.read_csv ( 
        self.USER_PATH, 
        names=columns, 
        sep='|', 
        encoding='latin-1', 
        skipinitialspace=True 
      )
      # df.drop ( columns= [ 'zipCode' ] )
      return df
  
    if name == 'ITEM':
      columns = [ 
        'itemID', 
        'name', 
        'releaseDate', 
        'videoReleaseDate', 
        'IMDbURL', 
        'gender_unknown', 
        'gender_action', 
        'gender_adventure', 
        'gender_animation', 
        'gender_children', 
        'gender_comedy',
        'gender_crime',
        'gender_documentary',
        'gender_drama',
        'gender_fantasy',
        'gender_film_noir',
        'gender_horror',
        'gender_musical',
        'gender_mystery',
        'gender_romance',
        'gender_scifi',
        'gender_thriller',
        'gender_war',
        'gender_western',
      ]
      df = pd.read_csv ( 
        self.ITEM_PATH, 
        names=columns, 
        sep='|', 
        encoding='latin-1', 
        skipinitialspace=True 
      )
      # df.drop ( columns= [ 'zipCode' ] )
      return df

  def load_dataset ( self ) -> Dataset:
    reader = Reader ( rating_scale= ( 1,5 ) )
    data = Dataset.load_from_df ( self.data_set [ [ 'userID', 'itemID', 'rating' ] ], reader )
    return data

  def get_user_by_id ( self, id: int ):
    pass

  def get_item_by_id ( self, id: int ):
    pass

In [43]:
DATA_PATH = '\\dataset\\data.csv'
ITEM_PATH = '\\dataset\\item.csv'
USER_PATH = '\\dataset\\user.csv'

In [44]:
loader = DataLoader( 
  data_path=DATA_PATH,
  item_path=ITEM_PATH,
  user_path=USER_PATH 
)


In [45]:
loader.data_set

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [46]:
class Metrics: 
  def __init__(self, predictions) -> None:
    self.predictions = predictions
    self.metrics = { }

  def compute_metrics ( self, *args ) -> dict :
    if 'MAE' in args: self.MAE ( )
    if 'RMSE' in args: self.RMSE ( )
    return self.metrics

  def MAE  ( self ):
    self.metrics [ 'MAE' ] = accuracy.mae ( self.predictions ) 

  def RMSE ( self ): 
    self.metrics [ 'RMSE' ] = accuracy.rmse ( self.predictions ) 

In [47]:
class DataGenerator: 
  def __init__(self, data, percentage = 0.25) -> None:
    """Data Generator

    Build a 75/25 train/test split for measuring accuracy

    Args:
        percentage (float, optional): _description_. Defaults to 0.25.
    """
    self.trainset, self.testset = train_test_split ( data, test_size=percentage, random_state=1 )

  def get_trainset( self ):
    return self.trainset
  
  def get_testset( self ):
    return self.testset

In [48]:
class Model: 
  def __init__(self, model, name) -> None:
    self.model = model
    self.name = name
  
  def __str__(self) -> str:
    return f'Model: { self.name }'
  
  def evaluate ( self, data: DataGenerator ): 
    predictions = self.model.fit ( data.get_trainset() ).test ( data.get_testset() )
    metrics = Metrics ( predictions ).compute_metrics( 'MAE', 'RMSE' )
    return metrics

In [49]:
class Factory:
  def __init__(self, dataset) -> None:
    self.dataset = DataGenerator ( dataset )
    self.models: list[ Model ] = [ ]
  
  def add_model ( self, model: Model ):
    self.models.append ( model )
  
  def evaluate ( self ):
    results = { }
    for model in self.models:
      print ( f'Evaluating { model.name }' )
      results [ model.name ] = model.evaluate( self.dataset )

  def clean_models ( self ):
    self.models = [] 


In [66]:
model_svd = Model ( model=SVD(), name='SVD' )
factory = Factory( loader.load_dataset( ) )

factory.add_model( model_svd )
factory.evaluate()

Evaluating SVD
MAE:  0.7402
RMSE: 0.9411


In [67]:
model_baseline = Model ( model=BaselineOnly(), name='Baseline Only' )
factory = Factory( loader.load_dataset( ) )

factory.add_model( model_baseline )
factory.evaluate()

Evaluating Baseline Only
Estimating biases using als...
MAE:  0.7501
RMSE: 0.9485


In [72]:
class HybridModel ( AlgoBase ):

  def __init__ (self, models, weights, **kwargs):
    super().__init__(**kwargs) 
    self.models: list[ Model ] = models
    self.weights = weights
  
  def fit (self, trainset):
    AlgoBase.fit ( self, trainset )
    for model in self.models:
      model.model.fit ( trainset )
    return self
  
  def estimate ( self, user_id, item_id ):
    scores = 0 
    weight = 0
    for i in range ( len( self.models ) ):
      scores += self.models[i].model.predict ( user_id, item_id ).est * self.weights[i]
      weight += self.weights[i]

    return scores/weight

In [71]:
models = [ 
  model_svd,
  model_baseline
]
weights = [ 
  0.5,
  0.5
]

hybrid = HybridModel ( models, weights )

factory = Factory ( loader.load_dataset() )

model = Model ( hybrid, 'Hybrid: SVD - Baseline' )
factory.add_model ( model )

factory.evaluate()

Evaluating Hybrid: SVD - Baseline
Estimating biases using als...
MAE:  0.9949
RMSE: 1.2280
