In [1]:
import pandas as pd
import numpy as np

num_users = 1000
num_products = 500

In [7]:
# step 1 : Data generation
user_data = {
    'user_id': np.arange(1, num_users + 1),
    'age': np.random.randint(18, 70, size=num_users),
    'gender': np.random.choice(['M', 'F'], size=num_users),
    'location': np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_users)
}

users_df = pd.DataFrame(user_data)

product_data = {
    'product_id': np.arange(1, num_products+1),
    'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books'], size=num_products),
    'gender': np.round(np.random.uniform(5, 500, size=num_products), 2),
    'rating': np.round(np.random.uniform(1, 5, size=num_products), 1)
} 

products_df = pd.DataFrame(product_data)

interaction_data = {
    'user_id': np.random.choice(users_df['user_id'], size=5000),
    'product_id': np.random.choice(products_df['product_id'], size=5000),
    'rating': np.random.randint(1,6, size=5000),
    'timestamp': pd.date_range(start = '2023-01-01', periods=5000, freq='T')
}

interactions_df = pd.DataFrame(interaction_data)

users_df.head(), products_df.head(), interactions_df.head()

(   user_id  age gender  location
 0        1   47      M     Urban
 1        2   65      M     Rural
 2        3   54      F     Rural
 3        4   62      F     Rural
 4        5   31      F  Suburban,
    product_id     category  gender  rating
 0           1  Electronics  233.17     1.1
 1           2         Home  263.97     2.2
 2           3        Books  256.43     3.1
 3           4         Home  121.41     3.3
 4           5     Clothing  493.56     3.2,
    user_id  product_id  rating           timestamp
 0      887         322       5 2023-01-01 00:00:00
 1      799         232       5 2023-01-01 00:01:00
 2      957         394       1 2023-01-01 00:02:00
 3      725         155       1 2023-01-01 00:03:00
 4      206         235       2 2023-01-01 00:04:00)

In [9]:
# step 2 - data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#handling missing values
print('Missing valkues in users data:\n', users_df.isnull().sum())
print('Missing values in products\n', products_df.isnull().sum())
print('Missing values in interactions data:\n', interactions_df.isnull().sum())

# encoding categorical values
label_encoder = LabelEncoder()
users_df['gender_encoded'] = label_encoder.fit_transform(users_df['gender'])
users_df['location_encoded'] = label_encoder.fit_transform(users_df['location'])
products_df['category_encoded'] = label_encoder.fit_transform(products_df['category'])

# creating user-product rating matrix
user_product_matrix = interactions_df.pivot_table(index='user_id', columns='product_id', values='rating').fillna(0)

# train-test split
train_data, test_data = train_test_split(interactions_df, test_size = 0.2, random_state = 42)

# display some rows
print("User-Product Matrix:\n", user_product_matrix.head())
print("Train Data Sample:\n", train_data.head())
print("Test Data Sample:\n", test_data.head())

Missing valkues in users data:
 user_id     0
age         0
gender      0
location    0
dtype: int64
Missing values in products
 product_id    0
category      0
gender        0
rating        0
dtype: int64
Missing values in interactions data:
 user_id       0
product_id    0
rating        0
timestamp     0
dtype: int64
User-Product Matrix:
 product_id  1    2    3    4    5    6    7    8    9    10   ...  491  492  \
user_id                                                       ...             
1           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
5           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

product_id  493  494  495  496  497  498  499  500  
user_id                                           

In [12]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy

# preparing the data for surprise
reader = Reader(rating_scale = (1,5))
data = Dataset.load_from_df(interactions_df[["user_id", "product_id", "rating"]], reader)

# train test split
trainset, testset = surprise_train_test_split(data, test_size = 0.2)

# training the SVD model
model = SVD()
model.fit(trainset)

# predictions
predictions = model.test(testset)

# evaluating the model (root mean square error)
rmse = accuracy.rmse(predictions)

RMSE: 1.4620


In [13]:
# saving the model
import pickle

# saving the model to a file
model_filename = 'svd_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved to { model_filename }")


Model saved to svd_model.pkl


In [15]:
# evaluating the model
mae = accuracy.mae(predictions)

# generate report
performance_report = {
    "RMSE": rmse,
    "MAE": mae
}

# display the performance report
print("Model Performance Report:")
for metric, score in performance_report.items():
    print(f"{metric}: {score: .4f}")

MAE:  1.2614
Model Performance Report:
RMSE:  1.4620
MAE:  1.2614
