In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
import os
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score

In [2]:
df = pd.read_csv(r"C:\Users\ADMIN\Downloads\E-commerce_Customer_Behavior.csv")

In [3]:
# Handle missing values if any
df.fillna({'Satisfaction Level': 'Neutral'}, inplace=True)

In [4]:
# Remove Customer ID if present
if 'Customer ID' in df.columns:
    df = df.drop(columns=['Customer ID'])

print("Dataset shape:", df.shape)
print("\nDataset columns:")
print(df.columns.tolist())
print("\nFirst few rows:")
df.head()

Dataset shape: (350, 10)

Dataset columns:
['Gender', 'Age', 'City', 'Membership Type', 'Total Spend', 'Items Purchased', 'Average Rating', 'Discount Applied', 'Days Since Last Purchase', 'Satisfaction Level']

First few rows:


Unnamed: 0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
0,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
1,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
2,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
3,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
4,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied


In [5]:
#Define synthetic product categories and their products
product_categories = ['Electronics', 'Fashion', 'Home', 'Sports', 'Beauty']
products = {
    'Electronics': ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Smartwatch', 
                   'Camera', 'Router', 'Monitor'],
    'Fashion': ['T-Shirt', 'Jeans', 'Sneakers', 'Jacket', 'Hat', 
               'Belt', 'Sunglasses', 'Scarf'],
    'Home': ['Blender', 'Vacuum', 'Lamp', 'Toaster', 'Microwave', 
            'Kettle', 'Fan', 'AirPurifier'],
    'Sports': ['Football', 'TennisRacket', 'RunningShoes', 'BaseballBat', 
              'Helmet', 'YogaMat', 'Dumbbells', 'CyclingGloves'],
    'Beauty': ['Lipstick', 'Perfume', 'Moisturizer', 'Foundation', 
              'Eyeliner', 'FaceWash', 'NailPolish', 'HairSerum']
}

print("Product categories defined:")
for category, items in products.items():
    print(f"{category}: {len(items)} products")

Product categories defined:
Electronics: 8 products
Fashion: 8 products
Home: 8 products
Sports: 8 products
Beauty: 8 products


In [6]:
## Cell 4: Assign Product Categories and Products to Customers
np.random.seed(42)
df_copy = df.copy()

def assign_products(row):
    """Assign product category based on customer profile"""
    if row['Membership Type'] == 'Gold':
        if row['Age'] < 35:
            return np.random.choice(['Electronics', 'Home', 'Fashion'])
        else:
            return np.random.choice(['Home', 'Sports', 'Fashion'])
    elif row['Membership Type'] == 'Silver':
        if row['Age'] < 30:
            return np.random.choice(['Fashion', 'Sports', 'Home'])
        else:
            return np.random.choice(['Fashion', 'Beauty'])
    else:
        return np.random.choice(['Fashion', 'Beauty', 'Home'])

# Assign product categories
df_copy['Assigned Product Category'] = df_copy.apply(assign_products, axis=1)

def assign_products_list(row):
    """Assign multiple products from the assigned category"""
    category = row['Assigned Product Category'] 
    # Randomly select 5-8 products from the category
    n = np.random.randint(5, 9)
    return np.random.choice(products[category], n, replace=False).tolist()

# Assign product lists
df_copy['Assigned Products'] = df_copy.apply(assign_products_list, axis=1)

print("Product assignment completed!")
print("\nSample assignments:")
print(df_copy[['Age', 'Membership Type', 'Assigned Product Category', 'Assigned Products']].head())


Product assignment completed!

Sample assignments:
   Age Membership Type Assigned Product Category  \
0   29            Gold                   Fashion   
1   34          Silver                    Beauty   
2   43          Bronze                   Fashion   
3   30            Gold                   Fashion   
4   27          Silver                      Home   

                                   Assigned Products  
0          [Sneakers, Jeans, Scarf, Hat, Sunglasses]  
1  [NailPolish, Eyeliner, Perfume, HairSerum, Fac...  
2  [Sunglasses, Jeans, Scarf, T-Shirt, Jacket, Sn...  
3       [Sunglasses, Jeans, Scarf, Jacket, Sneakers]  
4  [Fan, Kettle, Vacuum, Toaster, Lamp, Blender, ...  


In [7]:
##Define Preprocessing Components

# Define a proper tokenizer function
def product_tokenizer(text):
    """Tokenize product strings by splitting on spaces"""
    return text.split()

# Define column groups
text_col = "Assigned Products"
cat_cols = ["Gender", "City", "Membership Type"]
num_cols = ["Age", "Total Spend", "Items Purchased", "Average Rating", 
           "Discount Applied", "Days Since Last Purchase"]

# Add Satisfaction Level to categorical if it exists
if 'Satisfaction Level' in df_copy.columns:
    cat_cols.append('Satisfaction Level')

# Convert product lists to space-separated strings
df_copy["Assigned Products"] = df_copy["Assigned Products"].apply(lambda x: " ".join(x))

print("Column groups defined:")
print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")
print(f"Text column: {text_col}")

Column groups defined:
Categorical columns: ['Gender', 'City', 'Membership Type', 'Satisfaction Level']
Numerical columns: ['Age', 'Total Spend', 'Items Purchased', 'Average Rating', 'Discount Applied', 'Days Since Last Purchase']
Text column: Assigned Products


In [8]:
## Cell 6: Create Preprocessing Pipeline
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("products", CountVectorizer(tokenizer=product_tokenizer), text_col),
        ("num", StandardScaler(), num_cols)
    ]
)

# Apply preprocessing
print("Applying preprocessing...")
X_processed = preprocessor.fit_transform(df_copy)

# Create multi-label target
df_copy['product_list'] = df_copy['Assigned Products'].str.split()
mlb = MultiLabelBinarizer()
y_multilabel = mlb.fit_transform(df_copy['product_list'])

print(f"Processed features shape: {X_processed.shape}")
print(f"Multi-label target shape: {y_multilabel.shape}")
print(f"Number of unique products: {len(mlb.classes_)}")

Applying preprocessing...
Processed features shape: (350, 60)
Multi-label target shape: (350, 40)
Number of unique products: 40




In [9]:
##Create Feature DataFrame
cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
prod_names = preprocessor.named_transformers_['products'].get_feature_names_out()
num_names = num_cols

# Merge all feature names
all_names = list(cat_names) + list(prod_names) + list(num_names)

# Create DataFrame with transformed values
X_arr = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
final_df = pd.DataFrame(X_arr, columns=all_names)

print(f"Total number of features: {len(all_names)}")
print(f"Feature types breakdown:")
print(f"  - Categorical features: {len(cat_names)}")
print(f"  - Product features: {len(prod_names)}")
print(f"  - Numerical features: {len(num_names)}")

Total number of features: 60
Feature types breakdown:
  - Categorical features: 14
  - Product features: 40
  - Numerical features: 6


In [10]:
##Prepare Features and Split Data
feature_columns = [col for col in final_df.columns if not col.startswith('Satisfaction Level_')]
X = final_df[feature_columns].values
y = y_multilabel

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of classes (products): {y.shape[1]}")

Training set shape: (280, 57)
Test set shape: (70, 57)
Target shape: (350, 40)
Number of classes (products): 40


In [11]:
##Train Random Forest Model
print("Training Random Forest...")
rf_multilabel = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        class_weight='balanced'
    )
)

rf_multilabel.fit(X_train, y_train)
print("Random Forest training completed!")

Training Random Forest...
Random Forest training completed!


In [12]:
##Train KNN model
print("Training KNN...")
from sklearn.neighbors import KNeighborsClassifier
knn_multilabel = MultiOutputClassifier(
    KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',
        algorithm='auto'
    )
)
knn_multilabel.fit(X_train, y_train)
print("KNN training completed!")
##Evaluate Models
def evaluate_model(model, X_test, y_test):
    """Evaluate multi-label model and print precision and accuracy"""
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='samples', zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    return precision, accuracy
rf_precision, rf_accuracy = evaluate_model(rf_multilabel, X_test, y_test)
knn_precision, knn_accuracy = evaluate_model(knn_multilabel, X_test, y_test)                        




Training KNN...
KNN training completed!


In [13]:
## Cell 11: Evaluate Both Models
print(f"Random Forest - Precision: {rf_precision:.4f}, Accuracy: {rf_accuracy:.4f}")
print(f"KNN - Precision: {knn_precision:.4f}, Accuracy: {knn_accuracy:.4f}")


Random Forest - Precision: 0.9252, Accuracy: 0.6571
KNN - Precision: 0.8318, Accuracy: 0.2571


In [14]:
## Cell 12: Select Best Model
best_model = rf_multilabel if rf_accuracy > knn_accuracy else knn_multilabel
best_model_name = "Random Forest" if rf_accuracy > knn_accuracy else "KNN"
best_model_precision = rf_precision if rf_accuracy > knn_accuracy else knn_precision
best_model_accuracy = max(rf_accuracy, knn_accuracy)
print(f"Best model selected: {best_model_name}")


Best model selected: Random Forest


In [15]:

import pickle
import os
import numpy as np

# Prepare model data for saving
model_data = {
    'model': best_model,
    'preprocessor': preprocessor,
    'mlb': mlb,
    'feature_columns': feature_columns,
    'product_categories': product_categories,
    'products': products,
    'user_profiles': df_copy,
    'model_metadata': {
        'model_type': best_model_name,
        'precision': best_model_precision,
        'n_users': len(df_copy),
        'n_products': len(mlb.classes_),
        'n_interactions': np.sum(y_multilabel)
    }
}

# Create directories if they don't exist
os.makedirs('models', exist_ok=True)

# Save using pickle
with open('models/recommendation_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("✅ Model saved successfully (as .pkl)!")
print("📊 Model Summary:")
print(f"  - Best model: {best_model_name}")
print(f"  - Precision: {best_model_precision:.4f}")
print(f"  - Number of products: {len(mlb.classes_)}")
print(f"  - Number of users: {len(df_copy)}")
print(f"  - Total interactions: {np.sum(y_multilabel)}")

✅ Model saved successfully (as .pkl)!
📊 Model Summary:
  - Best model: Random Forest
  - Precision: 0.9252
  - Number of products: 40
  - Number of users: 350
  - Total interactions: 2274


In [16]:
## Create Recommendation Functions
def recommend_products(customer_data, model_data, top_k=5):
    """
    Recommend top-k products for a customer
    
    Parameters:
    customer_data: dict with customer features
    model_data: loaded model data
    top_k: number of recommendations to return
    """
    # Create DataFrame from customer data
    customer_df = pd.DataFrame([customer_data])
    
    # Add dummy products column for preprocessing
    customer_df['Assigned Products'] = ""
    
    # Apply same preprocessing
    X_processed = model_data['preprocessor'].transform(customer_df)
    
    # Get feature array
    X_arr = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
    X_final = pd.DataFrame(X_arr, columns=all_names)[model_data['feature_columns']].values
    
    # Get predictions (probabilities)
    predictions = model_data['model'].predict_proba(X_final)
    
    # Get top-k recommendations
    if hasattr(predictions[0], 'shape') and len(predictions[0].shape) > 1:
        # For binary classifiers, take the positive class probability
        probs = np.array([pred[:, 1] if pred.shape[1] > 1 else pred[:, 0] for pred in predictions])
    else:
        probs = np.array(predictions)
    
    # Get top-k indices
    top_k_indices = np.argsort(probs)[-top_k:][::-1]
    
    # Get product names
    recommended_products = model_data['mlb'].classes_[top_k_indices]
    recommendation_scores = probs[top_k_indices]
    
    return recommended_products, recommendation_scores

print("Recommendation functions created!")

Recommendation functions created!


In [17]:
## Cell 15: Test Recommendations
test_customer = {
    'Gender': 'Male',
    'Age': 28,
    'City': 'New York',
    'Membership Type': 'Gold',
    'Total Spend': 1500,
    'Items Purchased': 12,
    'Average Rating': 4.2,
    'Discount Applied': 1,
    'Days Since Last Purchase': 15
}

# Add Satisfaction Level if it exists in the data
if 'Satisfaction Level' in df_copy.columns:
    test_customer['Satisfaction Level'] = 'Satisfied'

# Get recommendations
try:
    recommended_products, scores = recommend_products(test_customer, model_data, top_k=5)
    
    print("🛍️  TOP 5 PRODUCT RECOMMENDATIONS")
    print("="*40)
    for i, (product, score) in enumerate(zip(recommended_products, scores), 1):
        print(f"{i}. {product} (Score: {score:.3f})")
        
except Exception as e:
    print(f"Error in recommendation: {e}")
    # Fallback: simple recommendation based on user profile
    print("Using fallback recommendation...")
    
    # Simple rule-based recommendation
    if test_customer['Age'] < 30 and test_customer['Membership Type'] == 'Gold':
        sample_recs = ['Smartphone', 'Laptop', 'Headphones', 'Sneakers', 'T-Shirt']
    else:
        sample_recs = ['Blender', 'Vacuum', 'Perfume', 'Moisturizer', 'Scarf']
    
    print("📱 Sample Recommendations:")
    for i, product in enumerate(sample_recs, 1):
        print(f"{i}. {product}")

🛍️  TOP 5 PRODUCT RECOMMENDATIONS
Error in recommendation: unsupported format string passed to numpy.ndarray.__format__
Using fallback recommendation...
📱 Sample Recommendations:
1. Smartphone
2. Laptop
3. Headphones
4. Sneakers
5. T-Shirt


In [18]:
##Model Performance Summary
print("="*60)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*60)
print(f"📊 Dataset Information:")
print(f"  - Total customers: {len(df_copy)}")
print(f"  - Total unique products: {len(mlb.classes_)}")
print(f"  - Product categories: {len(product_categories)}")
print(f"  - Average products per customer: {np.mean([len(x.split()) for x in df_copy['Assigned Products']]):.1f}")

print(f"\n🤖 Model Information:")
print(f"  - Best model type: {best_model_name}")
print(f"  - Training samples: {X_train.shape[0]}")
print(f"  - Test samples: {X_test.shape[0]}")
print(f"  - Number of features: {X_train.shape[1]}")

print(f"\n📈 Performance Metrics:")
print(f"  - Precision: {best_model_precision:.4f}")
if best_model_name == "Random Forest":
    print(f"  - Micro Precision: {rf_precision:.4f}")
else:
    print(f"  - Micro Precision: {knn_precision:.4f}")

print(f"\n💾 Model saved to: recommendation_model.pkl")
print("="*60)

FINAL MODEL PERFORMANCE SUMMARY
📊 Dataset Information:
  - Total customers: 350
  - Total unique products: 40
  - Product categories: 5
  - Average products per customer: 6.5

🤖 Model Information:
  - Best model type: Random Forest
  - Training samples: 280
  - Test samples: 70
  - Number of features: 57

📈 Performance Metrics:
  - Precision: 0.9252
  - Micro Precision: 0.9252

💾 Model saved to: recommendation_model.pkl


In [19]:
df_copy.head()

Unnamed: 0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level,Assigned Product Category,Assigned Products,product_list
0,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied,Fashion,Sneakers Jeans Scarf Hat Sunglasses,"[Sneakers, Jeans, Scarf, Hat, Sunglasses]"
1,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral,Beauty,NailPolish Eyeliner Perfume HairSerum FaceWash...,"[NailPolish, Eyeliner, Perfume, HairSerum, Fac..."
2,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied,Fashion,Sunglasses Jeans Scarf T-Shirt Jacket Sneakers,"[Sunglasses, Jeans, Scarf, T-Shirt, Jacket, Sn..."
3,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied,Fashion,Sunglasses Jeans Scarf Jacket Sneakers,"[Sunglasses, Jeans, Scarf, Jacket, Sneakers]"
4,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied,Home,Fan Kettle Vacuum Toaster Lamp Blender AirPuri...,"[Fan, Kettle, Vacuum, Toaster, Lamp, Blender, ..."


In [20]:
final_df.head()

Unnamed: 0,Gender_Female,Gender_Male,City_Chicago,City_Houston,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Membership Type_Bronze,Membership Type_Gold,...,tennisracket,toaster,vacuum,yogamat,Age,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,-0.945152,0.76013,0.337346,1.001981,1.0,-0.118359
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.082826,-0.179459,-0.385538,0.139479,-1.0,-0.639907
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.933185,-0.92557,-0.867461,-1.068024,1.0,1.148256
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,-0.739557,1.756144,1.542153,1.174482,-1.0,-1.086947
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-1.356343,-0.345692,0.096385,-0.033022,1.0,2.116844


In [21]:
final_df.columns

Index(['Gender_Female', 'Gender_Male', 'City_Chicago', 'City_Houston',
       'City_Los Angeles', 'City_Miami', 'City_New York', 'City_San Francisco',
       'Membership Type_Bronze', 'Membership Type_Gold',
       'Membership Type_Silver', 'Satisfaction Level_Neutral',
       'Satisfaction Level_Satisfied', 'Satisfaction Level_Unsatisfied',
       'airpurifier', 'baseballbat', 'belt', 'blender', 'camera',
       'cyclinggloves', 'dumbbells', 'eyeliner', 'facewash', 'fan', 'football',
       'foundation', 'hairserum', 'hat', 'headphones', 'helmet', 'jacket',
       'jeans', 'kettle', 'lamp', 'laptop', 'lipstick', 'microwave',
       'moisturizer', 'monitor', 'nailpolish', 'perfume', 'router',
       'runningshoes', 'scarf', 'smartphone', 'smartwatch', 'sneakers',
       'sunglasses', 't-shirt', 'tablet', 'tennisracket', 'toaster', 'vacuum',
       'yogamat', 'Age', 'Total Spend', 'Items Purchased', 'Average Rating',
       'Discount Applied', 'Days Since Last Purchase'],
      dtype='o

In [22]:
y_multilabel

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(350, 40))

In [23]:
# Updated model saving with proper tokenizer handling
import pickle
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define the tokenizer function that will be saved with the model
def product_tokenizer(text):
    """Tokenize product strings by splitting on spaces"""
    return text.split()

# Recreate the preprocessor with the tokenizer function properly referenced
cat_cols = ["Gender", "City", "Membership Type", "Satisfaction Level"]
num_cols = ["Age", "Total Spend", "Items Purchased", "Average Rating", 
           "Discount Applied", "Days Since Last Purchase"]
text_col = "Assigned Products"

# Create new preprocessor
new_preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("products", CountVectorizer(tokenizer=product_tokenizer), text_col),
        ("num", StandardScaler(), num_cols)
    ]
)

# Fit the new preprocessor on the data
new_preprocessor.fit(df_copy)

# Prepare updated model data for saving
updated_model_data = {
    'model': best_model,
    'preprocessor': new_preprocessor,  # Use the new preprocessor
    'mlb': mlb,
    'feature_columns': feature_columns,
    'product_categories': product_categories,
    'products': products,
    'user_profiles': df_copy,
    'product_tokenizer': product_tokenizer,  # Explicitly save the tokenizer function
    'model_metadata': {
        'model_type': best_model_name,
        'precision': best_model_precision,
        'n_users': len(df_copy),
        'n_products': len(mlb.classes_),
        'n_interactions': np.sum(y_multilabel)
    }
}

# Create directories if they don't exist
os.makedirs('models', exist_ok=True)

# Save the updated model
with open('models/recommendation_model.pkl', 'wb') as f:
    pickle.dump(updated_model_data, f)

print("✅ Model re-saved successfully with proper tokenizer!")
print("📊 Model Summary:")
print(f"  - Best model: {best_model_name}")
print(f"  - Precision: {best_model_precision:.4f}")
print(f"  - Number of products: {len(mlb.classes_)}")
print(f"  - Number of users: {len(df_copy)}")
print(f"  - Total interactions: {np.sum(y_multilabel)}")

✅ Model re-saved successfully with proper tokenizer!
📊 Model Summary:
  - Best model: Random Forest
  - Precision: 0.9252
  - Number of products: 40
  - Number of users: 350
  - Total interactions: 2274


