Load Data

In [None]:
# Section 1: Load Data
import pandas as pd

# Path of the CSV file to read.
file_path = '/content/data.csv'

# Parse the CSV with latin1 decoding into the DataFrame used by the later cells.
data = pd.read_csv(
    file_path,
    encoding='latin1',
)

# Show the leading rows as plain text to sanity-check the columns.
preview_rows = data.head()
print(preview_rows)


Check Missing Values: Per-Column Counts and Heatmap

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for the figure drawn below.
sns.set_style("whitegrid")

# Boolean mask of missing cells, computed once and shared by the table and the plot.
null_mask = data.isnull()

# Per-column count of missing entries.
missing_values = null_mask.sum()
print(missing_values)

# Heatmap of the mask: one row per record, one column per field, no colour bar.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    null_mask,
    cbar=False,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='none',
    ax=ax,
)
ax.set_title('Missing Values Heatmap', fontsize=18)
ax.set_xlabel('Columns', fontsize=14)
ax.set_ylabel('Rows', fontsize=14)
plt.show()


Use Matrix Factorization Techniques

In [None]:
# Section 3: Use Matrix Factorization Techniques
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics.pairwise import cosine_similarity

# Drop every row whose CustomerID is missing.
# NOTE: this edits `data` in place, so the frame loaded earlier is modified.
data.dropna(subset=['CustomerID'], inplace=True)

# Total quantity per (CustomerID, StockCode) pair, flattened back into columns.
pair_totals = data.groupby(['CustomerID', 'StockCode'])['Quantity'].sum()
aggregated_data = pair_totals.reset_index()

# Wide user-item table: one row per CustomerID, one column per StockCode.
# Pairs that never occur come out of the pivot as NaN and are replaced by 0.
pivoted_quantities = aggregated_data.pivot(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
)
interaction_matrix = pivoted_quantities.fillna(0)

# CSR copy of the table's values for the decomposition and the later split.
interaction_matrix_sparse = csr_matrix(interaction_matrix.values)

# Truncated SVD down to 50 components (fixed seed); one latent row per customer.
svd = TruncatedSVD(n_components=50, random_state=42)
decomposed_matrix = svd.fit_transform(interaction_matrix_sparse)

# Pairwise cosine similarity between the latent customer rows,
# labelled on both axes with the CustomerID index of the interaction table.
similarity_matrix = cosine_similarity(decomposed_matrix)
customer_ids = interaction_matrix.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print('Matrix Factorization Completed')


Matrix Factorization Completed


Split Data into Training, Validation, and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 30% of the rows; the remaining 70% form the training set.
train_data, temp_data = train_test_split(
    interaction_matrix_sparse,
    test_size=0.3,
    random_state=42,
)

# Second cut: one third of the hold-out (10% of the original) becomes the test
# set; the other two thirds (20% of the original) become the validation set.
val_data, test_data = train_test_split(
    temp_data,
    test_size=1/3,
    random_state=42,
)

print('Data Splitting Completed')


Train the Model

In [None]:
# Fit a fresh 50-component truncated SVD (fixed seed) on the training rows only.
# fit() returns the estimator itself, so construction and fitting are chained.
# This rebinds `svd`, replacing the estimator that was fitted on the full matrix.
svd = TruncatedSVD(n_components=50, random_state=42).fit(train_data)

print('Model Training Completed')


Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from collections import defaultdict

# Stage 1: project each held-out split onto the components learned by `svd`.
val_decomposed = svd.transform(val_data)
test_decomposed = svd.transform(test_data)

# Stage 2: map the low-rank projections back to the original item space.
val_reconstructed = svd.inverse_transform(val_decomposed)
test_reconstructed = svd.inverse_transform(test_decomposed)

# Stage 3: densify the sparse targets and flatten everything to 1-D so the
# sklearn metrics compare cell against cell.
val_data_flat = val_data.toarray().flatten()
val_reconstructed_flat = val_reconstructed.flatten()
test_data_flat = test_data.toarray().flatten()
test_reconstructed_flat = test_reconstructed.flatten()

# Stage 4: error metrics over every cell of each split (zero cells included).
mse_val = mean_squared_error(val_data_flat, val_reconstructed_flat)
mse_test = mean_squared_error(test_data_flat, test_reconstructed_flat)

rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(val_data_flat, val_reconstructed_flat)

rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(test_data_flat, test_reconstructed_flat)

# Function to evaluate Precision@k and Recall@k
def precision_recall_at_k(predictions, k=10, threshold=4):
    """Return the mean Precision@k and Recall@k over all users.

    Args:
        predictions: Iterable of ``(user_id, item_id, true_rating,
            estimated_rating)`` tuples. The item id is ignored.
        k: Number of top-ranked items (by estimated rating) considered per user.
        threshold: Minimum rating for an item to count as relevant (true
            rating) or recommended (estimated rating). Defaults to 4, the
            cut-off that used to be hard-coded.

    Returns:
        ``(precision, recall)`` as floats in ``[0, 1]``, each averaged over
        the users present in ``predictions``. ``(0.0, 0.0)`` is returned when
        ``predictions`` is empty.

    Notes:
        When a user has no recommended item in the top k (precision
        undefined) or no relevant item at all (recall undefined), the metric
        for that user is 0 rather than 1, so users the ranking says nothing
        about do not inflate the averages.
    """
    # Group (estimate, truth) pairs per user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    # No users at all: avoid dividing by zero when averaging below.
    if not user_est_true:
        return 0.0, 0.0

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Rank the user's items by estimated rating, best first.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's items ...
        n_rel = sum(true_r >= threshold for _, true_r in ranked)
        # ... recommended items only within the top k.
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            true_r >= threshold and est >= threshold for est, true_r in top_k
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0.0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0.0

    precision = sum(precisions.values()) / len(precisions)
    recall = sum(recalls.values()) / len(recalls)
    return precision, recall

# Hand-written sample tuples: (user_id, item_id, actual_rating, predicted_rating).
# NOTE(review): these are literals, not outputs of `svd`, so the Precision/Recall
# figures printed below describe this toy sample only, not the fitted model.
predictions = [
    (1, 101, 4.0, 4.5),
    (1, 102, 3.0, 2.5),
    (2, 101, 5.0, 4.8),
    (2, 103, 2.0, 2.2),
]

# Score the sample with the top-10 ranking metrics.
precision, recall = precision_recall_at_k(predictions, k=10)

# Express both fractions as percentages for display.
precision_percentage, recall_percentage = precision * 100, recall * 100

# Report the reconstruction errors first, then the ranking metrics.
print(f"Validation RMSE: {rmse_val}")
print(f"Validation MAE: {mae_val}")
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")
print(f"Precision@10: {precision_percentage}%")
print(f"Recall@10: {recall_percentage}%")


Validation RMSE: 9.326556163422973
Validation MAE: 0.5342332231609958
Test RMSE: 4.516174548228661
Test MAE: 0.4375441991012082
Precision@10: 100.0%
Recall@10: 100.0%
