# Import Library 

In [31]:
# Untuk pengolahan data
import pandas as pd 
import numpy as np 
from datetime import datetime, timedelta

# Untuk Format Data
from surprise import Reader
from surprise import Dataset

# Untuk Model 
from surprise import SVD
from surprise import BaselineOnly

# Untuk Metric
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV 

# Result Model 
import pickle


# Import Dataset 

In [2]:
prod = pd.read_csv(r"D:\Belajar\Skilvul\Dataset\product_details.csv", sep = ';')
prod

Unnamed: 0,product_id,category,price,ratings,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,101,Electronics,500,4.5,,,
1,102,Clothing,50,3.8,,,
2,103,Home & Kitchen,200,4.2,,,
3,104,Beauty,30,4.0,,,
4,105,Electronics,800,4.8,,,


In [3]:
cus = pd.read_csv(r"D:\Belajar\Skilvul\Dataset\customer_interactions.csv")
cus

Unnamed: 0,customer_id,page_views,time_spent
0,1,25,120
1,2,20,90
2,3,30,150
3,4,15,80
4,5,22,110


In [4]:
pur = pd.read_csv(r"D:\Belajar\Skilvul\Dataset\purchase_history.csv", sep=';')
pur

Unnamed: 0,customer_id,product_id,purchase_date,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,101,2023-01-01,,,,
1,1,105,2023-01-05,,,,
2,2,102,2023-01-02,,,,
3,3,103,2023-01-03,,,,
4,4,104,2023-01-04,,,,
5,5,101,2023-01-05,,,,


# Data Understanding 

In [5]:
pur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customer_id    6 non-null      int64  
 1   product_id     6 non-null      int64  
 2   purchase_date  6 non-null      object 
 3   Unnamed: 3     0 non-null      float64
 4   Unnamed: 4     0 non-null      float64
 5   Unnamed: 5     0 non-null      float64
 6   Unnamed: 6     0 non-null      float64
dtypes: float64(4), int64(2), object(1)
memory usage: 464.0+ bytes


In [6]:
cus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   customer_id  5 non-null      int64
 1   page_views   5 non-null      int64
 2   time_spent   5 non-null      int64
dtypes: int64(3)
memory usage: 248.0 bytes


In [7]:
prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product_id  5 non-null      int64  
 1   category    5 non-null      object 
 2   price       5 non-null      int64  
 3   ratings     5 non-null      float64
 4   Unnamed: 4  0 non-null      float64
 5   Unnamed: 5  0 non-null      float64
 6   Unnamed: 6  0 non-null      float64
dtypes: float64(4), int64(2), object(1)
memory usage: 408.0+ bytes


In [8]:
prod.ratings.unique()

array([4.5, 3.8, 4.2, 4. , 4.8])

# Data Preprocessing

In [9]:
# removing NAN values

pur_clean = pur.drop(columns=['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'])
prod_clean = prod.drop(columns=['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'])

In [10]:
# Convert to datetime 

pur_clean['purchase_date'] = pd.to_datetime(pur_clean['purchase_date'])
pur_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    6 non-null      int64         
 1   product_id     6 non-null      int64         
 2   purchase_date  6 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 272.0 bytes


In [12]:
# Membuat Data Dummy untuk menambah informasi

# Fungsi untuk menghasilkan tanggal acak dalam rentang tertentu
def random_date(start_date, end_date):
    delta = end_date - start_date
    random_days = np.random.randint(delta.days + 1)
    return start_date + timedelta(days=random_days)

# input rentang tanggal
start_date = datetime(2023, 1, 6)
end_date = datetime(2023, 3, 31)

# data dummy random
data = {
    'customer_id': np.random.randint(1, 6, size=85), 
    'product_id': np.random.randint(101, 106, size=85),  
    'purchase_date': [random_date(start_date, end_date) for _ in range(85)]  
}

pur_dummy = pd.DataFrame(data).sort_values(by='purchase_date').reset_index(drop='first')
pur_dummy 

Unnamed: 0,customer_id,product_id,purchase_date
0,2,104,2023-01-06
1,5,103,2023-01-06
2,4,101,2023-01-07
3,1,105,2023-01-08
4,1,103,2023-01-09
...,...,...,...
80,4,104,2023-03-26
81,3,105,2023-03-26
82,4,104,2023-03-28
83,5,104,2023-03-28


In [13]:
# Joining Data 

purchase_dummy = (pd.concat([pur_dummy,pur_clean],axis=0)).sort_values(by=['customer_id','purchase_date']).reset_index(drop='first')

df_all = purchase_dummy.copy()
df_all = pd.merge(df_all,prod_clean, how='left', on='product_id')
df_all = pd.merge(df_all,cus, how='left', on='customer_id')

df_all


Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
0,1,101,2023-01-01,Electronics,500,4.5,25,120
1,1,105,2023-01-05,Electronics,800,4.8,25,120
2,1,105,2023-01-08,Electronics,800,4.8,25,120
3,1,103,2023-01-09,Home & Kitchen,200,4.2,25,120
4,1,104,2023-01-30,Beauty,30,4.0,25,120
...,...,...,...,...,...,...,...,...
86,5,104,2023-03-18,Beauty,30,4.0,22,110
87,5,101,2023-03-19,Electronics,500,4.5,22,110
88,5,101,2023-03-21,Electronics,500,4.5,22,110
89,5,101,2023-03-24,Electronics,500,4.5,22,110


In [14]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    91 non-null     int64         
 1   product_id     91 non-null     int64         
 2   purchase_date  91 non-null     datetime64[ns]
 3   category       91 non-null     object        
 4   price          91 non-null     int64         
 5   ratings        91 non-null     float64       
 6   page_views     91 non-null     int64         
 7   time_spent     91 non-null     int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 5.8+ KB


In [15]:
df_all.columns

Index(['customer_id', 'product_id', 'purchase_date', 'category', 'price',
       'ratings', 'page_views', 'time_spent'],
      dtype='object')

In [16]:
df_all.shape

(91, 8)

In [17]:
reader = Reader(rating_scale=(1.0,5.0))
data = Dataset.load_from_df(df_all[['customer_id', 'product_id','ratings']],reader)

In [18]:
df_all

Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
0,1,101,2023-01-01,Electronics,500,4.5,25,120
1,1,105,2023-01-05,Electronics,800,4.8,25,120
2,1,105,2023-01-08,Electronics,800,4.8,25,120
3,1,103,2023-01-09,Home & Kitchen,200,4.2,25,120
4,1,104,2023-01-30,Beauty,30,4.0,25,120
...,...,...,...,...,...,...,...,...
86,5,104,2023-03-18,Beauty,30,4.0,22,110
87,5,101,2023-03-19,Electronics,500,4.5,22,110
88,5,101,2023-03-21,Electronics,500,4.5,22,110
89,5,101,2023-03-24,Electronics,500,4.5,22,110


# Modeling

In [19]:
# splitting data 

train_set, test_set = train_test_split(data, test_size= 0.20, random_state= 100)

- SVD (Singular Value Decomposition) 

In [20]:
svd = SVD() 

svd.fit(train_set) 
predictions = svd.test(test_set) 

In [21]:
# Evaluasi 
accuracy.rmse(predictions) 

RMSE: 0.0933


0.093280891640921

- ALS (Alternating Least Squares)

In [22]:
als_options = {'method':'als'} 
als = BaselineOnly(bsl_options = als_options)

als.fit(train_set)
predictions = als.test(test_set)

Estimating biases using als...


In [23]:
# Evaluasi 
accuracy.rmse(predictions) 

RMSE: 0.1517


0.1517100022190939

# Cross Validation

- SVD (Singular Value Decomposition) 

In [24]:
cv_std = cross_validate(svd, data, measures= ['RMSE','MSE'], cv = 10, verbose = True)

Evaluating RMSE, MSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.1023  0.0656  0.0742  0.0626  0.0974  0.0928  0.0643  0.0820  0.0273  0.0876  0.0756  0.0210  
MSE (testset)     0.0105  0.0043  0.0055  0.0039  0.0095  0.0086  0.0041  0.0067  0.0007  0.0077  0.0062  0.0028  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


- ALS (Alternating Least Squares)

In [25]:
cv_std = cross_validate(als, data, measures= ['RMSE','MSE'], cv = 10, verbose = True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MSE of algorithm BaselineOnly on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.1403  0.1086  0.1904  0.1289  0.1188  0.1365  0.1618  0.1403  0.1444  0.1289  0.1399  0.0217  
MSE (testset)     0.0197  0.0118  0.0363  0.0166  0.0141  0.0186  0.0262  0.0197  0.0209  0.0166  0.0200  0.0066  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


The best model based on the mean cross validation is `SVD (Singular Value Decomposition)`, so then the model will be pickled.

# Testing Predict 

In [63]:
df_test = df_all[['customer_id','product_id']].drop_duplicates().reset_index(drop='first')
df_test

Unnamed: 0,customer_id,product_id
0,1,101
1,1,105
2,1,103
3,1,104
4,1,102
5,2,102
6,2,104
7,2,105
8,2,103
9,3,103


In [64]:
# prediction
pred = []

for _, row in  df_test.iterrows():
    est = svd.predict(row['customer_id'], row['product_id'])
    pred.append(est[3])

In [65]:
pred

[4.547640258948783,
 4.785674202720607,
 4.248890380025774,
 3.9601031094830774,
 3.7806138838151,
 3.8689356632167224,
 4.023602325432276,
 4.616880725568187,
 4.154447689430673,
 4.150310588469435,
 4.727743036793979,
 4.0296134646490565,
 4.473697193524554,
 3.9121124658621964,
 4.03329428403458,
 4.310345720760168,
 4.7660139525141,
 3.7883620591635636,
 4.242626400615625,
 4.4811491231728064,
 4.163023380370127,
 3.852915440011798,
 4.672505821769908,
 4.140433977282365]

In [69]:
df_test['y_pred'] = pred
df_test.sort_values(by=['customer_id','y_pred'],ascending=False)

Unnamed: 0,customer_id,product_id,y_pred
22,5,105,4.672506
19,5,101,4.481149
20,5,103,4.163023
23,5,104,4.140434
21,5,102,3.852915
16,4,105,4.766014
15,4,101,4.310346
18,4,103,4.242626
14,4,104,4.033294
17,4,102,3.788362


In [71]:
df_test[df_test['customer_id'] == 2]

Unnamed: 0,customer_id,product_id,y_pred
5,2,102,3.868936
6,2,104,4.023602
7,2,105,4.616881
8,2,103,4.154448


# Pickle Model 

In [32]:
with open('svd_model.pkl', 'wb') as  file:
    pickle.dump(svd, file)

In [72]:
with open('svd_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [34]:
loaded_model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2951cdf48b0>

In [56]:
# Contoh penggunaan model untuk melakukan prediksi pada data baru
customer_id = '2'
item_id = '104'
prediction = loaded_model.predict(int(customer_id), int(item_id))

# Menampilkan hasil prediksi
print("Prediksi rating untuk customer dengan ID:", customer_id, "pada produk dengan ID:", item_id, "adalah:", round(prediction.est))


Prediksi rating untuk customer dengan ID: 2 pada produk dengan ID: 104 adalah: 4


In [89]:
# Sales Dummy
df_all['month'] = df_all['purchase_date'].dt.strftime('%B')
df_all['month_int'] = df_all['purchase_date'].dt.strftime('%m')

df_all.to_csv('Dataset/sales_dummy.csv')