# Exploratory Data Analysis for Personalized Customer Recommendation System

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the pre-processed user-product interaction table (relative to this notebook)
INTERACTIONS_CSV = '../dataset/processed/user_product_interactions_processed.csv'

# Read the table into a DataFrame
data = pd.read_csv(INTERACTIONS_CSV)

# Preview the leading rows as this cell's output
data.head()

### 1. Basic Dataset Overview

In [2]:
# Column dtypes and non-null counts (info() writes its report directly to stdout)
data.info()

# Summary statistics.
# Only the final expression of a cell is rendered automatically, so this
# mid-cell result must be printed explicitly or it is silently discarded.
print(data.describe())

# Number of rows for each distinct value of the 'interaction' column
interaction_count = data['interaction'].value_counts()
print(interaction_count)

# Missing values per column (final expression, so it is shown as the cell output)
data.isnull().sum()

### 2. Visualizing Interaction Distribution

In [3]:
# Bar chart of the row count for each interaction value
# (0 = No interaction, 1 = Interaction)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='interaction', palette='viridis', ax=ax)
ax.set_title('Distribution of Interactions (User-Product)', fontsize=14)
ax.set_xlabel('Interaction')
ax.set_ylabel('Count')
plt.show()

### 3. User Activity Distribution

In [4]:
# Sum the 'interaction' column within each user_id group
user_interaction_count = (
    data
    .groupby('user_id')['interaction']
    .sum()
)

# Histogram (50 bins) of the per-user totals with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(user_interaction_count, bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of User Activity (Number of Interactions)', fontsize=14)
ax.set_xlabel('Number of Interactions')
ax.set_ylabel('Number of Users')
plt.show()

### 4. Top Products by Interaction Count

In [5]:
# Sum the 'interaction' column within each product_id group, largest total first
product_interaction_count = (
    data
    .groupby('product_id')['interaction']
    .sum()
    .sort_values(ascending=False)
)

# The ten products with the highest totals, already in descending order
top_10_products = product_interaction_count.head(10)

plt.figure(figsize=(10, 6))
# Pass `order` explicitly: when the category values are numeric, seaborn sorts
# them by value, which would lay the bars out by product ID rather than by
# rank. With an explicit order the bars always follow the descending totals.
sns.barplot(
    x=top_10_products.index,
    y=top_10_products.values,
    order=top_10_products.index.tolist(),
    palette='magma',
)
plt.title('Top 10 Products by Interaction Count', fontsize=14)
plt.xlabel('Product ID')
plt.ylabel('Number of Interactions')
plt.show()

### 5. Heatmap of User-Product Interaction Matrix

In [6]:
# Reshape the long table into a grid with one row per user_id and one column
# per product_id; cells that end up without a value are set to 0
interaction_wide = data.pivot(index='user_id', columns='product_id', values='interaction')
interaction_matrix = interaction_wide.fillna(0)

# Heatmap of the top-left corner only: first 100 rows by first 50 columns
matrix_preview = interaction_matrix.iloc[:100, :50]
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(matrix_preview, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('User-Product Interaction Matrix (First 100 Users and 50 Products)', fontsize=14)
plt.show()

### 6. Correlation Between Products

In [7]:
# Pairwise correlation between the product columns of the interaction matrix
# (each column holds the per-user interaction values for one product)
product_correlation = interaction_matrix.corr()

# Render the full product-by-product correlation grid as a heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(product_correlation, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation Between Products Based on User Interactions', fontsize=14)
plt.show()

### 7. User Interaction Sparsity

In [8]:
# Sparsity = 1 - (non-zero cells / all cells) of the interaction matrix
total_elements = np.prod(interaction_matrix.shape)
non_zero_elements = np.count_nonzero(interaction_matrix)
sparsity = 1 - (non_zero_elements / total_elements)
print(f"Sparsity of the interaction matrix: {sparsity:.4f}")

# Two-slice pie: share of zero cells versus share of non-zero cells
pie_values = [sparsity, 1 - sparsity]
pie_labels = ['Sparsity', 'Interactions']
pie_colors = ['#FF9999', '#66B2FF']

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(pie_values, labels=pie_labels, autopct='%1.1f%%', startangle=140, colors=pie_colors)
ax.set_title('Sparsity of User-Product Interaction Matrix', fontsize=14)
plt.show()

### 8. Product Popularity Distribution

In [9]:
# Histogram (50 bins) with a KDE curve of the per-product interaction totals
# held in product_interaction_count
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(product_interaction_count, bins=50, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Product Popularity (Number of Interactions)', fontsize=14)
ax.set_xlabel('Number of Interactions per Product')
ax.set_ylabel('Count')
plt.show()

### 9. Conclusion and Next Steps

- The interaction data is highly sparse, meaning that most users have interacted with only a few products.
- The distribution of user activity shows that a small number of users are highly active, while the majority have limited interactions.
- Product popularity is skewed, with a few products receiving significantly more interactions than others.
- Correlation analysis shows potential patterns in how products are related based on user preferences.

#### Next Steps
1. Leverage collaborative filtering techniques to build a personalized recommendation model.
2. Explore content-based filtering by incorporating metadata such as product categories and descriptions.
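3. Test model performance using evaluation metrics suited to the data: ranking metrics such as Precision@K for the binary interaction labels used here, and RMSE if explicit ratings become available.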