In [1]:
import sklearn
print(sklearn.__version__)


1.6.1


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load dataset
print("Loading online shoppers data...")
shoppers_data = pd.read_csv('processed_online_shoppers_intention.csv')

print(f"Original dataset shape: {shoppers_data.shape}")
orig_shape = shoppers_data.shape

Loading online shoppers data...
Original dataset shape: (12205, 45)


In [3]:
# Browsing efficiency ratio
shoppers_data['browsing_efficiency'] = shoppers_data.apply(
    lambda row: row['ProductRelated'] / row['ProductRelated_Duration'] 
    if row['ProductRelated_Duration'] > 0 else 0,
    axis=1
)

In [4]:
# Content-to-product ratio
shoppers_data['content_product_ratio'] = shoppers_data.apply(
    lambda row: (row['Informational'] + row['Administrative']) / row['ProductRelated'] 
    if row['ProductRelated'] > 0 else 0,
    axis=1
)

In [5]:
# Engagement depth score
shoppers_data['engagement_depth'] = (
    shoppers_data['ProductRelated'] * 1.0 +
    shoppers_data['Informational'] * 0.7 +
    shoppers_data['Administrative'] * 0.5
) / shoppers_data[['ProductRelated', 'Informational', 'Administrative']].sum(axis=1).replace(0, 1)

In [6]:
# Visitor Type Enhancement
# Create binary features for visitor types
visitor_dummies = pd.get_dummies(shoppers_data['VisitorType'], prefix='visitor')
shoppers_data = pd.concat([shoppers_data, visitor_dummies], axis=1)

# Loyalty indicator (higher for returning)
loyalty_map = {
    'Returning_Visitor': 3,
    'New_Visitor': 1,
    'Other': 2
}
shoppers_data['loyalty_score'] = shoppers_data['VisitorType'].map(loyalty_map)

In [7]:
# Page Value Context
# Total time spent on site
shoppers_data['total_duration'] = (
    shoppers_data['Administrative_Duration'] + 
    shoppers_data['Informational_Duration'] + 
    shoppers_data['ProductRelated_Duration']
)

# Total pages viewed
shoppers_data['total_pages_viewed'] = (
    shoppers_data['Administrative'] + 
    shoppers_data['Informational'] + 
    shoppers_data['ProductRelated']
)

# Normalize page values
max_page_value = shoppers_data['PageValues'].max()
if max_page_value > 0:
    shoppers_data['page_value_norm'] = shoppers_data['PageValues'] / max_page_value
else:
    shoppers_data['page_value_norm'] = 0

# Page value to visit ratio
shoppers_data['value_per_page'] = shoppers_data['PageValues'] / shoppers_data['total_pages_viewed'].replace(0, 1)

# Value potential (inverse relationship with bounce rate)
shoppers_data['value_potential'] = shoppers_data['page_value_norm'] * (1 - shoppers_data['BounceRates'])

In [8]:
# Behavior Clustering
# Select features for clustering
cluster_features = [
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues',
    'browsing_efficiency', 'engagement_depth'
]

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(shoppers_data[cluster_features])

# Apply K-means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
shoppers_data['behavior_cluster'] = kmeans.fit_predict(scaled_features)

# Create cluster descriptions
cluster_means = shoppers_data.groupby('behavior_cluster')[cluster_features + ['Revenue']].mean()

# Assign meaningful names based on cluster characteristics
cluster_names = {}
cluster_names[cluster_means['Revenue'].idxmax()] = 'High_Converters'
cluster_names[cluster_means['ProductRelated'].idxmax()] = 'Product_Browsers'
cluster_names[cluster_means['BounceRates'].idxmax()] = 'Quick_Bouncers'

# Fill in any unnamed clusters
for i in range(kmeans.n_clusters):
    if i not in cluster_names:
        cluster_names[i] = f'Cluster_{i}'

shoppers_data['behavior_segment'] = shoppers_data['behavior_cluster'].map(cluster_names)

In [9]:
# Interest Category Inference
# Create interest categories based on browsing behavior patterns
def infer_interest_category(row):
    admin_ratio = row['Administrative'] / row['total_pages_viewed'] if row['total_pages_viewed'] > 0 else 0
    info_ratio = row['Informational'] / row['total_pages_viewed'] if row['total_pages_viewed'] > 0 else 0
    product_ratio = row['ProductRelated'] / row['total_pages_viewed'] if row['total_pages_viewed'] > 0 else 0
    
    if admin_ratio > 0.4:
        return 'Service_Oriented'
    elif info_ratio > 0.4:
        return 'Research_Oriented'
    elif product_ratio > 0.8:
        return 'Product_Focused'
    elif row['BounceRates'] > 0.4:
        return 'Casual_Browser'
    else:
        return 'Balanced_Shopper'

shoppers_data['interest_category'] = shoppers_data.apply(infer_interest_category, axis=1)

In [10]:
# Time-based metrics
# Average time per page
shoppers_data['avg_time_per_page'] = shoppers_data['total_duration'] / shoppers_data['total_pages_viewed'].replace(0, 1)

# Time efficiency score (higher means more efficient browsing)
shoppers_data['time_efficiency'] = shoppers_data['total_pages_viewed'] / shoppers_data['total_duration'].replace(0, 0.1)

# Admin page time per page
shoppers_data['admin_time_per_page'] = shoppers_data.apply(
    lambda x: x['Administrative_Duration'] / x['Administrative'] if x['Administrative'] > 0 else 0, 
    axis=1
)

# Info page time per page
shoppers_data['info_time_per_page'] = shoppers_data.apply(
    lambda x: x['Informational_Duration'] / x['Informational'] if x['Informational'] > 0 else 0, 
    axis=1
)

# Product page time per page
shoppers_data['product_time_per_page'] = shoppers_data.apply(
    lambda x: x['ProductRelated_Duration'] / x['ProductRelated'] if x['ProductRelated'] > 0 else 0, 
    axis=1
)

In [11]:
# Engagement score
# Create a comprehensive engagement score
shoppers_data['engagement_score'] = (
    shoppers_data['total_duration'] * 0.4 + 
    shoppers_data['total_pages_viewed'] * 0.4 + 
    (1 - shoppers_data['BounceRates']) * 0.1 + 
    (1 - shoppers_data['ExitRates']) * 0.1
)

# Create bounce categories
def categorize_bounce(rate):
    if rate < 0.2:
        return 'Low'
    elif rate < 0.6:
        return 'Medium'
    else:
        return 'High'
        
shoppers_data['bounce_category'] = shoppers_data['BounceRates'].apply(categorize_bounce)

# Create exit rate categories
def categorize_exit(rate):
    if rate < 0.2:
        return 'Low'
    elif rate < 0.6:
        return 'Medium'
    else:
        return 'High'
        
shoppers_data['exit_category'] = shoppers_data['ExitRates'].apply(categorize_exit)

# Display summary of enrichment
print(f"Enriched dataset shape: {shoppers_data.shape}")
print(f"New features added: {shoppers_data.shape[1] - orig_shape[1]}")

# Display list of new features
new_features = [
    'weekend_multiplier', 'browsing_efficiency', 'content_product_ratio',
    'engagement_depth', 'visitor_Returning_Visitor', 'visitor_New_Visitor',
    'visitor_Other', 'loyalty_score', 'total_duration', 'total_pages_viewed',
    'page_value_norm', 'value_per_page', 'value_potential', 'behavior_cluster',
    'behavior_segment', 'interest_category', 'avg_time_per_page',
    'time_efficiency', 'admin_time_per_page', 'info_time_per_page',
    'product_time_per_page', 'engagement_score', 'bounce_category',
    'exit_category'
]
print("New features added:")
print(", ".join(new_features))

# Save to file
shoppers_data.to_csv('enriched_shoppers_data.csv', index=False)
print("Enriched dataset saved to 'enriched_shoppers_data.csv'")

Enriched dataset shape: (12205, 68)
New features added: 23
New features added:
weekend_multiplier, browsing_efficiency, content_product_ratio, engagement_depth, visitor_Returning_Visitor, visitor_New_Visitor, visitor_Other, loyalty_score, total_duration, total_pages_viewed, page_value_norm, value_per_page, value_potential, behavior_cluster, behavior_segment, interest_category, avg_time_per_page, time_efficiency, admin_time_per_page, info_time_per_page, product_time_per_page, engagement_score, bounce_category, exit_category
Enriched dataset saved to 'enriched_shoppers_data.csv'
