## Preprocessing

In [68]:
# Setup and data loading
import sys
sys.path.append('../')
from src.utils.imports import *

DATA_DIR = './data/processed'

df_customers = pd.read_csv(os.path.join(DATA_DIR, 'customers_cleaned.csv'))
df_customers['timestamp'] = pd.to_datetime(df_customers['timestamp'])

print("Data loaded successfully!")
print("Shape:", df_customers.shape)

Data loaded successfully!
Shape: (313823, 19)


In [69]:
# Data cleaning and filtering
df_customers.columns = df_customers.columns.str.lower()

min_user_interactions = 5
min_product_interactions = 5
df_customers = df_customers.groupby('userid').filter(lambda x: len(x) >= min_user_interactions)
df_customers = df_customers.groupby('productid').filter(lambda x: len(x) >= min_product_interactions)

In [70]:
# ID encoding
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

df_customers['userid'] = user_encoder.fit_transform(df_customers['userid'])
df_customers['productid'] = product_encoder.fit_transform(df_customers['productid'])


In [71]:
# Save intermediate results
os.makedirs('data/processed', exist_ok=True)
df_customers.to_csv('data/processed/customers_cleaned.csv', index=False)

In [73]:
# Feature engineering

# Process ratings
df_customers['rating'] = pd.to_numeric(df_customers['rating'], errors='coerce')
df_customers['rating_normalized'] = (df_customers['rating'] - df_customers['rating'].min()) / (df_customers['rating'].max() - df_customers['rating'].min())

# Create time features
df_customers['timestamp'] = pd.to_datetime(df_customers['timestamp'], unit='s')
df_customers['year'] = df_customers['timestamp'].dt.year
df_customers['month'] = df_customers['timestamp'].dt.month
df_customers['day_of_week'] = df_customers['timestamp'].dt.dayofweek
df_customers['hour'] = df_customers['timestamp'].dt.hour

In [74]:
# Calculate user statistics
user_stats = df_customers.groupby('userid').agg({
    'rating': ['mean', 'std', 'count'],
    'productid': 'nunique',
    'rating_normalized': 'mean'
}).reset_index()

user_stats.columns = ['userid', 'user_avg_rating', 'user_rating_std', 'user_total_ratings', 'user_unique_products', 'user_avg_normalized_rating']

print("User statistics:")
display(user_stats.head())


User statistics:


Unnamed: 0,userid,user_avg_rating,user_rating_std,user_total_ratings,user_unique_products,user_avg_normalized_rating
0,0,3.167,1.602,6,6,0.542
1,1,3.333,0.816,6,6,0.583
2,2,5.0,0.0,10,10,1.0
3,3,4.875,0.354,8,8,0.969
4,4,4.125,0.991,8,8,0.781


In [75]:
# Calculate product statistics
product_stats = df_customers.groupby('productid').agg({
    'rating': ['mean', 'std', 'count'],
    'userid': 'nunique',
    'rating_normalized': 'mean'
}).reset_index()

product_stats.columns = ['productid', 'product_avg_rating', 'product_rating_std', 'product_total_ratings', 'product_unique_users', 'product_avg_normalized_rating']

print("Product statistics:")
display(product_stats.head())


Product statistics:


Unnamed: 0,productid,product_avg_rating,product_rating_std,product_total_ratings,product_unique_users,product_avg_normalized_rating
0,0,2.444,1.424,9,9,0.361
1,1,3.091,1.514,11,11,0.523
2,2,3.8,1.643,5,5,0.7
3,3,5.0,0.0,5,5,1.0
4,4,4.333,1.033,6,6,0.833


In [76]:
# Merge statistics
df_customers = df_customers.merge(user_stats, on='userid', how='left')
df_customers = df_customers.merge(product_stats, on='productid', how='left')

print("Columns after merge:")
print(df_customers.columns.tolist())

Columns after merge:
['userid', 'productid', 'rating', 'timestamp', 'rating_normalized', 'year', 'month', 'day_of_week', 'hour', 'user_avg_rating_x', 'user_rating_std_x', 'user_total_ratings_x', 'user_unique_products_x', 'user_avg_normalized_rating_x', 'product_avg_rating_x', 'product_rating_std_x', 'product_total_ratings_x', 'product_unique_users_x', 'product_avg_normalized_rating_x', 'user_avg_rating_y', 'user_rating_std_y', 'user_total_ratings_y', 'user_unique_products_y', 'user_avg_normalized_rating_y', 'product_avg_rating_y', 'product_rating_std_y', 'product_total_ratings_y', 'product_unique_users_y', 'product_avg_normalized_rating_y']


In [77]:
# Save processed data
os.makedirs('data/processed', exist_ok=True)
df_customers.to_csv('data/processed/customers_cleaned.csv', index=False)

In [78]:
# Final data quality check
print("Dataset shape:", df_customers.shape)
print("\nMissing values:\n", df_customers.isnull().sum())
print("\nData types:\n", df_customers.dtypes)
print("\nMemory usage:", df_customers.memory_usage().sum() / 1024**2, "MB")

Dataset shape: (224229, 29)

Missing values:
 userid                              0
productid                           0
rating                              0
timestamp                           0
rating_normalized                   0
year                                0
month                               0
day_of_week                         0
hour                                0
user_avg_rating_x                   0
user_rating_std_x                   0
user_total_ratings_x                0
user_unique_products_x              0
user_avg_normalized_rating_x        0
product_avg_rating_x                0
product_rating_std_x                0
product_total_ratings_x             0
product_unique_users_x              0
product_avg_normalized_rating_x     0
user_avg_rating_y                   0
user_rating_std_y                  15
user_total_ratings_y                0
user_unique_products_y              0
user_avg_normalized_rating_y        0
product_avg_rating_y                0
prod

In [79]:
# Document final features
print("Final features:", df_customers.columns.tolist())
print("\nSample of processed data:")
display(df_customers.head())

Final features: ['userid', 'productid', 'rating', 'timestamp', 'rating_normalized', 'year', 'month', 'day_of_week', 'hour', 'user_avg_rating_x', 'user_rating_std_x', 'user_total_ratings_x', 'user_unique_products_x', 'user_avg_normalized_rating_x', 'product_avg_rating_x', 'product_rating_std_x', 'product_total_ratings_x', 'product_unique_users_x', 'product_avg_normalized_rating_x', 'user_avg_rating_y', 'user_rating_std_y', 'user_total_ratings_y', 'user_unique_products_y', 'user_avg_normalized_rating_y', 'product_avg_rating_y', 'product_rating_std_y', 'product_total_ratings_y', 'product_unique_users_y', 'product_avg_normalized_rating_y']

Sample of processed data:


Unnamed: 0,userid,productid,rating,timestamp,rating_normalized,year,month,day_of_week,hour,user_avg_rating_x,user_rating_std_x,user_total_ratings_x,user_unique_products_x,user_avg_normalized_rating_x,product_avg_rating_x,product_rating_std_x,product_total_ratings_x,product_unique_users_x,product_avg_normalized_rating_x,user_avg_rating_y,user_rating_std_y,user_total_ratings_y,user_unique_products_y,user_avg_normalized_rating_y,product_avg_rating_y,product_rating_std_y,product_total_ratings_y,product_unique_users_y,product_avg_normalized_rating_y
0,6869,0,1.0,2014-01-30,0.0,2014,1,3,0,3.667,2.066,6,6,0.667,2.769,1.589,13,13,0.442,3.4,2.191,5,5,0.6,2.444,1.424,9,9,0.361
1,21519,0,3.0,2014-04-18,0.5,2014,4,4,0,4.429,0.787,7,7,0.857,2.769,1.589,13,13,0.442,4.429,0.787,7,7,0.857,2.444,1.424,9,9,0.361
2,17856,0,4.0,2013-09-06,0.75,2013,9,4,0,4.4,0.843,10,10,0.85,2.769,1.589,13,13,0.442,4.4,0.843,10,10,0.85,2.444,1.424,9,9,0.361
3,17124,0,1.0,2014-05-09,0.0,2014,5,4,0,3.8,1.789,5,5,0.7,2.769,1.589,13,13,0.442,3.8,1.789,5,5,0.7,2.444,1.424,9,9,0.361
4,5125,0,2.0,2013-12-08,0.25,2013,12,6,0,2.833,1.472,6,6,0.458,2.769,1.589,13,13,0.442,2.833,1.472,6,6,0.458,2.444,1.424,9,9,0.361
