## Preprocessing

In [2]:
import sys
sys.path.append('../')
from utils.imports import *

# Download the Amazon ratings dataset via kagglehub and load the Beauty
# ratings CSV. The first row of the file is skipped and the four columns are
# given explicit names instead.
path = kagglehub.dataset_download("skillsmuggler/amazon-ratings")
csv_file = os.path.join(path, "ratings_Beauty.csv")
df_customers = pd.read_csv(
    csv_file,
    skiprows=1,
    names=['userId', 'productId', 'rating', 'timestamp'],
    low_memory=False,
)
print("Dataset shape:", df_customers.shape)

  from .autonotebook import tqdm as notebook_tqdm


Dataset shape: (2023070, 4)


In [3]:
# Parse `timestamp` as seconds since the Unix epoch, then expose the calendar
# components as separate columns (added in the order year, month, day).
df_customers['timestamp'] = pd.to_datetime(df_customers['timestamp'], unit='s')
for date_part in ('year', 'month', 'day'):
    df_customers[date_part] = getattr(df_customers['timestamp'].dt, date_part)

In [4]:
# Force `rating` to a numeric dtype; values that cannot be parsed become NaN
# because of errors='coerce'.
df_customers['rating'] = pd.to_numeric(df_customers['rating'], errors='coerce')

# Min-max scale the ratings: (rating - min) / (max - min).
rating_min = df_customers['rating'].min()
rating_max = df_customers['rating'].max()
df_customers['rating_normalized'] = (df_customers['rating'] - rating_min) / (rating_max - rating_min)

In [5]:
# Replace the raw user / product identifiers with integer codes, keeping one
# fitted encoder per column.
# NOTE(review): the encoders are not saved anywhere in the visible cells, so
# mapping the codes back to the original IDs requires this kernel session —
# confirm whether they should be persisted.
user_encoder, product_encoder = LabelEncoder(), LabelEncoder()

for id_column, encoder in (('userId', user_encoder), ('productId', product_encoder)):
    df_customers[id_column] = encoder.fit_transform(df_customers[id_column])


In [6]:
# Write the encoded ratings to disk, creating the output directory if needed.
# NOTE(review): a later cell writes to this same path again after the
# aggregate features are merged in, so on a full run this snapshot is
# overwritten.
processed_csv = 'data/processed/customers_cleaned.csv'
os.makedirs('data/processed', exist_ok=True)
df_customers.to_csv(processed_csv, index=False)

In [7]:
# Per-user aggregates. Each output column is named next to the source column
# and aggregation it comes from, so the names cannot drift out of order.
# The standard deviation is NaN for users with a single rating.
user_stats = (
    df_customers
    .groupby('userId')
    .agg(
        user_avg_rating=('rating', 'mean'),
        user_rating_std=('rating', 'std'),
        user_total_ratings=('rating', 'count'),
        user_unique_products=('productId', 'nunique'),
        user_avg_normalized_rating=('rating_normalized', 'mean'),
    )
    .reset_index()
)

print("User statistics:")
display(user_stats.head())


User statistics:


Unnamed: 0,userId,user_avg_rating,user_rating_std,user_total_ratings,user_unique_products,user_avg_normalized_rating
0,0,5.0,,1,1,1.0
1,1,5.0,,1,1,1.0
2,2,3.0,,1,1,0.5
3,3,5.0,,1,1,1.0
4,4,5.0,,1,1,1.0


In [8]:
# Per-product aggregates. Each output column is named next to the source
# column and aggregation it comes from, so the names cannot drift out of order.
# The standard deviation is NaN for products with a single rating.
product_stats = (
    df_customers
    .groupby('productId')
    .agg(
        product_avg_rating=('rating', 'mean'),
        product_rating_std=('rating', 'std'),
        product_total_ratings=('rating', 'count'),
        product_unique_users=('userId', 'nunique'),
        product_avg_normalized_rating=('rating_normalized', 'mean'),
    )
    .reset_index()
)

print("Product statistics:")
display(product_stats.head())


Product statistics:


Unnamed: 0,productId,product_avg_rating,product_rating_std,product_total_ratings,product_unique_users,product_avg_normalized_rating
0,0,5.0,,1,1,1.0
1,1,4.0,1.414,2,2,0.75
2,2,4.0,,1,1,0.75
3,3,1.0,,1,1,0.0
4,4,5.0,,1,1,1.0


In [9]:
# Attach the per-user and per-product aggregates to every rating row.
#
# Aggregate columns left over from an earlier run of this cell are dropped
# first; without that, re-running the cell would merge them a second time and
# produce `_x` / `_y` suffixed duplicates. On a first run the drop is a no-op.
stat_columns = [
    column
    for stats, key in ((user_stats, 'userId'), (product_stats, 'productId'))
    for column in stats.columns
    if column != key
]

# validate='many_to_one' makes pandas raise if a stats table ever contains a
# duplicated key, instead of silently multiplying rating rows.
df_customers = (
    df_customers
    .drop(columns=stat_columns, errors='ignore')
    .merge(user_stats, on='userId', how='left', validate='many_to_one')
    .merge(product_stats, on='productId', how='left', validate='many_to_one')
)

print("Columns after merge:")
print(df_customers.columns.tolist())

Columns after merge:
['userId', 'productId', 'rating', 'timestamp', 'year', 'month', 'day', 'rating_normalized', 'user_avg_rating', 'user_rating_std', 'user_total_ratings', 'user_unique_products', 'user_avg_normalized_rating', 'product_avg_rating', 'product_rating_std', 'product_total_ratings', 'product_unique_users', 'product_avg_normalized_rating']


In [10]:
# Persist the ratings together with the merged user/product aggregates,
# creating the output directory if needed.
processed_csv = 'data/processed/customers_cleaned.csv'
os.makedirs('data/processed', exist_ok=True)
df_customers.to_csv(processed_csv, index=False)

In [11]:
# Quick health check of the processed frame: size, per-column null counts,
# dtypes, and memory footprint in MiB.
missing_per_column = df_customers.isna().sum()
memory_mb = df_customers.memory_usage().sum() / 1024**2

print("Dataset shape:", df_customers.shape)
print("\nMissing values:\n", missing_per_column)
print("\nData types:\n", df_customers.dtypes)
print("\nMemory usage:", memory_mb, "MB")

Dataset shape: (2023070, 18)

Missing values:
 userId                                0
productId                             0
rating                                0
timestamp                             0
year                                  0
month                                 0
day                                   0
rating_normalized                     0
user_avg_rating                       0
user_rating_std                  887401
user_total_ratings                    0
user_unique_products                  0
user_avg_normalized_rating            0
product_avg_rating                    0
product_rating_std               103484
product_total_ratings                 0
product_unique_users                  0
product_avg_normalized_rating         0
dtype: int64

Data types:
 userId                                    int64
productId                                 int64
rating                                  float64
timestamp                        datetime64[ns]
year          

In [12]:
# List the final feature columns and preview the first rows.
final_features = df_customers.columns.tolist()
print("Final features:", final_features)

print("\nSample of processed data:")
display(df_customers.head())

Final features: ['userId', 'productId', 'rating', 'timestamp', 'year', 'month', 'day', 'rating_normalized', 'user_avg_rating', 'user_rating_std', 'user_total_ratings', 'user_unique_products', 'user_avg_normalized_rating', 'product_avg_rating', 'product_rating_std', 'product_total_ratings', 'product_unique_users', 'product_avg_normalized_rating']

Sample of processed data:


Unnamed: 0,userId,productId,rating,timestamp,year,month,day,rating_normalized,user_avg_rating,user_rating_std,user_total_ratings,user_unique_products,user_avg_normalized_rating,product_avg_rating,product_rating_std,product_total_ratings,product_unique_users,product_avg_normalized_rating
0,725046,0,5.0,2013-05-28,2013,5,28,1.0,4.25,0.957,4,4,0.812,5.0,,1,1,1.0
1,814606,1,3.0,2012-12-14,2012,12,14,0.5,3.5,0.707,2,2,0.625,4.0,1.414,2,2,0.75
2,313101,1,5.0,2014-07-07,2014,7,7,1.0,5.0,,1,1,1.0,4.0,1.414,2,2,0.75
3,291075,2,4.0,2013-10-24,2013,10,24,0.75,4.0,,1,1,0.75,4.0,,1,1,0.75
4,802842,3,1.0,2010-05-19,2010,5,19,0.0,1.0,0.0,9,9,0.0,1.0,,1,1,0.0
