In [1]:
import pandas as pd

## Import dataset

In [2]:
df = pd.read_csv('dataset/electronics.csv')
df.head()

Unnamed: 0,item_id,user_id,rating,category,brand,year,count,mean
0,372,7509,4.0,Camera & Photo,Philips,2006,28530.0,4.400911
1,119,1963,5.0,Computers & Accessories,Toshiba,2004,9393.0,4.418184
2,73,1530,5.0,Camera & Photo,Canon,2002,8622.0,4.218743
3,121,2203,3.0,Computers & Accessories,Linksys,2003,6278.0,4.168844
4,145,2399,1.0,Camera & Photo,Canon,2004,5810.0,4.306368


## Data engineering

In [3]:
df.drop(columns=['item_id', 'category', 'year', 'count', 'mean'], inplace=True)
df.head()

Unnamed: 0,user_id,rating,brand
0,7509,4.0,Philips
1,1963,5.0,Toshiba
2,1530,5.0,Canon
3,2203,3.0,Linksys
4,2399,1.0,Canon


In [4]:
df1 = df[['user_id', 'brand', 'rating']]
df1.head(20)

Unnamed: 0,user_id,brand,rating
0,7509,Philips,4.0
1,1963,Toshiba,5.0
2,1530,Canon,5.0
3,2203,Linksys,3.0
4,2399,Canon,1.0
5,2258,Linksys,5.0
6,5583,,5.0
7,1665,Sony,3.0
8,1103,,1.0
9,1826,Sony,5.0


## Training

In [5]:
from surprise import Reader, Dataset, NMF
from surprise.model_selection import cross_validate

In [6]:
data = Dataset.load_from_df(df1, Reader())
trainset = data.build_full_trainset() # generate data train

In [7]:
model = NMF()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x10c8298d808>

In [8]:
model.predict(7509, "Philips")

Prediction(uid=7509, iid='Philips', r_ui=None, est=3.649609180036814, details={'was_impossible': False})

## All item

In [9]:
df.head()

Unnamed: 0,user_id,rating,brand
0,7509,4.0,Philips
1,1963,5.0,Toshiba
2,1530,5.0,Canon
3,2203,3.0,Linksys
4,2399,1.0,Canon


In [10]:
all_item = df['brand'].unique()
all_item

array(['Philips', 'Toshiba', 'Canon', 'Linksys', nan, 'Sony', 'Kodak',
       'Polaroid', 'Olympus', 'Nikon', 'Apple', 'Pyle', 'Panasonic',
       'Sennheiser', 'Fujifilm', 'EldHus', 'Logitech', 'Garmin', 'Bose',
       'JVC', 'HP', 'Kensington', 'Vivitar', 'Samsung', 'Archos',
       'Savage', 'ViewSonic', 'Etre Jeune', 'Jabra', 'Gary Fong',
       'Uniden', 'ebasy', 'Generic', 'JLAB', 'Skullcandy', 'TaoTronics',
       'Neewer', 'Koolertron', 'DURAGADGET', 'iRULU', 'Tiamat', 'DBPOWER',
       'Fintie', 'Plemo', 'EINCAR', 'Cooper Cases', 'LSS', 'Mpow',
       'XShields', 'IRULU', 'Funlux'], dtype=object)

In [11]:
print('Number of user:',trainset.n_users)
print('Number of items:',trainset.n_items)

Number of user: 1157633
Number of items: 51


In [12]:
user_id = 7509

In [13]:
test_set = [[user_id,iid,4] for iid in all_item]
test_set

[[7509, 'Philips', 4],
 [7509, 'Toshiba', 4],
 [7509, 'Canon', 4],
 [7509, 'Linksys', 4],
 [7509, nan, 4],
 [7509, 'Sony', 4],
 [7509, 'Kodak', 4],
 [7509, 'Polaroid', 4],
 [7509, 'Olympus', 4],
 [7509, 'Nikon', 4],
 [7509, 'Apple', 4],
 [7509, 'Pyle', 4],
 [7509, 'Panasonic', 4],
 [7509, 'Sennheiser', 4],
 [7509, 'Fujifilm', 4],
 [7509, 'EldHus', 4],
 [7509, 'Logitech', 4],
 [7509, 'Garmin', 4],
 [7509, 'Bose', 4],
 [7509, 'JVC', 4],
 [7509, 'HP', 4],
 [7509, 'Kensington', 4],
 [7509, 'Vivitar', 4],
 [7509, 'Samsung', 4],
 [7509, 'Archos', 4],
 [7509, 'Savage', 4],
 [7509, 'ViewSonic', 4],
 [7509, 'Etre Jeune', 4],
 [7509, 'Jabra', 4],
 [7509, 'Gary Fong', 4],
 [7509, 'Uniden', 4],
 [7509, 'ebasy', 4],
 [7509, 'Generic', 4],
 [7509, 'JLAB', 4],
 [7509, 'Skullcandy', 4],
 [7509, 'TaoTronics', 4],
 [7509, 'Neewer', 4],
 [7509, 'Koolertron', 4],
 [7509, 'DURAGADGET', 4],
 [7509, 'iRULU', 4],
 [7509, 'Tiamat', 4],
 [7509, 'DBPOWER', 4],
 [7509, 'Fintie', 4],
 [7509, 'Plemo', 4],
 [7509, '

In [14]:
pred = model.test(test_set)

In [15]:
rec = pd.DataFrame(pred).sort_values(by='est',ascending=False)
rec.head(5)

Unnamed: 0,uid,iid,r_ui,est,details
45,7509,Cooper Cases,4,3.853797,{'was_impossible': False}
25,7509,Savage,4,3.759804,{'was_impossible': False}
0,7509,Philips,4,3.649609,{'was_impossible': False}
41,7509,DBPOWER,4,3.635074,{'was_impossible': False}
2,7509,Canon,4,3.577077,{'was_impossible': False}


In [16]:
# df.drop(columns='details', inplace=True)
rec.drop(columns='details', inplace=True)

In [17]:
rec.head()

Unnamed: 0,uid,iid,r_ui,est
45,7509,Cooper Cases,4,3.853797
25,7509,Savage,4,3.759804
0,7509,Philips,4,3.649609
41,7509,DBPOWER,4,3.635074
2,7509,Canon,4,3.577077


In [18]:
rec = rec.rename(columns={'uid':'user_id', 'iid':'item', 'r_ui':'rating', 'est':'pred_score'})
rec.head()

Unnamed: 0,user_id,item,rating,pred_score
45,7509,Cooper Cases,4,3.853797
25,7509,Savage,4,3.759804
0,7509,Philips,4,3.649609
41,7509,DBPOWER,4,3.635074
2,7509,Canon,4,3.577077


### 5 barang yang dapat direkomendasikan untuk user 7509

## Export dataframe to csv

In [54]:
csv_data = df1.to_csv('collaborative_electronics.csv', index =False)

## Evaluasi Model

In [20]:
from surprise.model_selection import train_test_split
from surprise import NMF
[trainset,testset] = train_test_split(data,test_size=0.3)
model = NMF()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x10c9a251408>

In [21]:
test_pred= model.test(test_set)

In [22]:
tp = pd.DataFrame(test_pred)
tp.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,7509,Philips,4,4.052342,"{'was_impossible': True, 'reason': 'User and i..."
1,7509,Toshiba,4,4.052342,"{'was_impossible': True, 'reason': 'User and i..."
2,7509,Canon,4,4.052342,"{'was_impossible': True, 'reason': 'User and i..."
3,7509,Linksys,4,4.052342,"{'was_impossible': True, 'reason': 'User and i..."
4,7509,,4,4.052342,"{'was_impossible': True, 'reason': 'User and i..."


In [23]:
tp.drop(columns='details', inplace=True)

In [24]:
tp = tp.rename(columns={'uid':'user_id', 'iid':'item', 'r_ui':'rating', 'est':'pred_score'})
tp.head(6)

Unnamed: 0,user_id,item,rating,pred_score
0,7509,Philips,4,4.052342
1,7509,Toshiba,4,4.052342
2,7509,Canon,4,4.052342
3,7509,Linksys,4,4.052342
4,7509,,4,4.052342
5,7509,Sony,4,4.052342


In [25]:
tp = tp.drop(4)

In [26]:
tp.head()

Unnamed: 0,user_id,item,rating,pred_score
0,7509,Philips,4,4.052342
1,7509,Toshiba,4,4.052342
2,7509,Canon,4,4.052342
3,7509,Linksys,4,4.052342
5,7509,Sony,4,4.052342


## ML Engineering

In [57]:
class RecommenderSystem:
    def __init__(self, data):
        self.df1 = pd.read_csv(data)
        self.all_item = self.df1['brand'].unique()
        self.model = None
        
    def fit(self):
        data = Dataset.load_from_df(self.df1, Reader())
        trainset = data.build_full_trainset() # generate data train
        
        self.model = NMF()
        self.model.fit(trainset)
        
    def recommend(self, user_id, topk=10):
        test_set = [[user_id,iid,4] for iid in all_item]
        pred = model.test(test_set)
        
        rec = pd.DataFrame(pred).sort_values(by='est',ascending=False)
        rec = rec.rename(columns={'uid':'user_id', 'iid':'item', 'r_ui':'rating', 'est':'pred_score'})
        rec.drop(columns=['rating', 'details'], inplace=True)
        rec.sort_values("pred_score", ascending=False, inplace=True)
        return rec.head(topk)

In [58]:
recsys = RecommenderSystem("collaborative_electronics.csv")
recsys.fit()

In [59]:
recsys.recommend(user_id=7509)

Unnamed: 0,user_id,item,pred_score
0,7509,Philips,4.052342
13,7509,Sennheiser,4.052342
2,7509,Canon,4.052342
3,7509,Linksys,4.052342
4,7509,,4.052342
5,7509,Sony,4.052342
6,7509,Kodak,4.052342
7,7509,Polaroid,4.052342
8,7509,Olympus,4.052342
9,7509,Nikon,4.052342
