In [54]:
# Import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Load Data

In [55]:
# Load the raw Amazon Electronics ratings CSV from the Kaggle input directory.
# NOTE(review): the preview in the next cell shows data values (a user id, a product
# id, 5.0 and a timestamp-like integer) as the column names, so the file appears to
# have no header row and its first record is being consumed as the header — confirm,
# and consider header=None with explicit names (the later rename/drop cells would
# then have to be updated to match).
data=pd.read_csv("/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv")

### Data Preprocessing

In [56]:
# Preview the first rows to inspect the column layout
data.head()

Unnamed: 0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
0,A2CX7LUOHB2NDG,321732944,5.0,1341100800
1,A2NWSAGRHCP8N5,439886341,1.0,1367193600
2,A2WNBOD3WNDNKT,439886341,3.0,1374451200
3,A1GI0U4ZRJA8WN,439886341,1.0,1334707200
4,A1QGNMC6O1VW39,511189877,5.0,1397433600


In [57]:
# Give the user, product and rating columns descriptive names
column_names = {
    'AKM1MP6P0OYPR': 'userid',
    '0132793040': 'productid',
    '5.0': 'rating',
}
data = data.rename(columns=column_names)

In [58]:
# Keep only the first 200,000 rows because the full dataset is very large
data = data.iloc[:200000]

In [59]:
# Count missing values across the whole frame (0 means there are none)
data.isnull().sum().sum()

0

In [60]:
# Count fully duplicated rows (0 means there are none)
data.duplicated().sum()

0

In [61]:
# Summarise column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   userid      200000 non-null  object 
 1   productid   200000 non-null  object 
 2   rating      200000 non-null  float64
 3   1365811200  200000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


In [62]:
# Drop the fourth column (presumably a Unix timestamp — confirm); only userid,
# productid and rating are passed to the recommender.
# Reassigning instead of using inplace=True avoids mutating a frame that was
# produced by slicing, a pattern pandas can flag with SettingWithCopyWarning.
data = data.drop(columns="1365811200")

In [63]:
# Confirm that only the userid, productid and rating columns remain
data.head(2)

Unnamed: 0,userid,productid,rating
0,A2CX7LUOHB2NDG,321732944,5.0
1,A2NWSAGRHCP8N5,439886341,1.0


### Modifying the dataset to be accepted by the SVD model

## Surprise
Surprise is a specialized library for building recommender systems,
     so it provides good modules for data preprocessing and for recommendation models such as SVD.

In [64]:
from surprise import Dataset, Reader

# Surprise reads the frame as (user, item, rating) columns, in that order
rating_columns = ['userid', 'productid', 'rating']
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(data[rating_columns], reader)

### Splitting dataset

In [65]:
 from surprise.model_selection import train_test_split
trainset, testset = train_test_split(surprise_data, test_size=0.2)

# Create and train the model

In [66]:
from surprise import SVD

# reg_all sets one regularization strength for all model parameters; random_state
# fixes the random initialisation so repeated runs train the same model.
model = SVD(reg_all=.001, random_state=42)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7933b1b23220>

In [67]:
# Predict a rating for every (user, product, rating) entry held out in the test set
predictions = model.test(testset)

In [68]:
from surprise.accuracy import rmse

# Evaluate the model using RMSE (lower is better).
# rmse() prints the score itself by default, which duplicated the line printed
# below; verbose=False leaves a single, full-precision report.
accuracy = rmse(predictions, verbose=False)
print(f"RMSE: {accuracy}")

RMSE: 1.2917
RMSE: 1.2916645313172062


### Save the model to a file for deployment

In [69]:
from surprise.dump import dump
# Save the trained model to a file so it can be reloaded later for deployment
# NOTE(review): the .pkl name suggests a pickle-based format — only load such files
# from trusted sources.
filename = 'svd_model.pkl'
dump(filename, algo=model)

In [70]:
# Inspect one held-out entry of the test set: (user id, product id, true rating)
testset[2]

('AWAGR51EAHGM4', 'B00005T3G0', 5.0)

In [71]:
# Estimated rating for the user/product pair of the test entry inspected above
sample_user, sample_product = testset[2][:2]
model.predict(sample_user, sample_product).est

4.6462065136891955

# Create product recommendations

In [72]:
def recommend_products(user_id, product_id=None, top_n=5, products=None, algo=None):
    """Return the highest-scoring products for a user as (product id, estimate) pairs.

    Every candidate product is scored with ``algo.predict(user_id, pid).est`` and the
    ``top_n`` best are returned in descending order of estimate. Candidates with
    equal estimates keep the order in which they were supplied.

    Parameters
    ----------
    user_id
        Raw user id handed to ``algo.predict``.
    product_id
        Accepted only for backward compatibility. It has never influenced the
        ranking, so it is optional and ignored.
    top_n
        Maximum number of recommendations to return (default 5).
    products
        Iterable of candidate product ids. Defaults to the unique values of the
        notebook-level ``data['productid']``.
    algo
        Fitted object exposing ``predict(user_id, product_id)`` whose result has an
        ``est`` attribute. Defaults to the notebook-level ``model``.

    Returns
    -------
    list of tuple
        ``(product id, estimated rating)`` pairs, at most ``top_n`` long.

    Notes
    -----
    Candidates are not filtered, so products the user has already rated can be
    returned.
    """
    import heapq  # local import keeps this cell self-contained

    # Fall back to the notebook-level objects only when the caller supplies none.
    if products is None:
        products = data['productid'].unique()
    if algo is None:
        algo = model

    scored = ((pid, algo.predict(user_id, pid).est) for pid in products)
    # Equivalent to sorted(scored, key=..., reverse=True)[:top_n], without having
    # to sort every candidate just to keep a handful.
    return heapq.nlargest(top_n, scored, key=lambda pair: pair[1])

## Top 5 products recommended

In [73]:
# Print the top recommendations for the sample user, one per line
top_products = recommend_products(testset[2][0], testset[1000][1])
for pid, estimate in top_products:
    print("Recommended product:", pid, ", Predcted rate", estimate)

Recommended product: B000053HC5 , Predcted rate 4.919286425202842
Recommended product: B00000JBHE , Predcted rate 4.907731759611343
Recommended product: B00004TDLD , Predcted rate 4.862434719914043
Recommended product: B000053HH5 , Predcted rate 4.838191596009058
Recommended product: B00005UKBD , Predcted rate 4.8365022243421665


In [74]:
# The eight products with the most ratings
data['productid'].value_counts().head(8)

B00004ZCJE    2547
B00001P4ZH    2075
B000065BP9    1714
B00004T8R2    1692
B00001WRSJ    1586
B000065BPB    1304
B00005N6KG    1296
B00005T3G0    1287
Name: productid, dtype: int64

In [75]:
# The eight users with the most ratings
data['userid'].value_counts().head(8)

A231WM2Z2JL0U3    192
A5JLAU2ARJ0BO      86
A25HBO5V8S8SEA     72
A2BGZ52M908MJY     54
AT2J7H5TRZM8Z      53
A1MJMYLRTZ76ZX     48
A2AEZQ3DGBBLPR     48
A6FIAB28IS79       46
Name: userid, dtype: int64

In [76]:
# Unique product ids collected into a plain Python list
prodid = list(data['productid'].unique())
prodid[2]

'0511189877'

In [77]:
# Unique user ids collected into a plain Python list
userid = list(data['userid'].unique())
userid[2]

'A2WNBOD3WNDNKT'

In [78]:
# File names for the exported id lists
file_name1 = 'user_id.csv'
file_name2 = 'product_id.csv'

# Wrap each id list in a single-column DataFrame
df1 = pd.DataFrame({'userid': userid})
df2 = pd.DataFrame({'productid': prodid})

# Write both frames to CSV without the index column
df1.to_csv(file_name1, index=False)
df2.to_csv(file_name2, index=False)

In [79]:
# Read the exported file back to check that it was written as expected
pd.read_csv("user_id.csv")

Unnamed: 0,userid
0,A2CX7LUOHB2NDG
1,A2NWSAGRHCP8N5
2,A2WNBOD3WNDNKT
3,A1GI0U4ZRJA8WN
4,A1QGNMC6O1VW39
...,...
173344,A14PZ54R9FN8XX
173345,A161MTE56L17RK
173346,A2T4QNWVSKY9XB
173347,A1FVCT5TMNE75W
