In [None]:
# Legacy User Analytics Pipeline

This notebook demonstrates a **legacy** approach to analyzing user analytics data using pandas. 

⚠️ **Warning**: This code contains several performance bottlenecks and inefficient patterns that should be optimized for production use.

## Dataset Overview
- **User Data**: Basic user information and demographics
- **Session Data**: User session information with device and location data
- **Event Data**: Individual user events and interactions

Let's start by loading and exploring our data...


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time

print("Loading user analytics data...")
start_time = time.time()

# Read the three CSV exports (users, sessions, events) in that order
users_df, sessions_df, events_df = (
    pd.read_csv(csv_path)
    for csv_path in ('user_data.csv', 'user_sessions.csv', 'user_events.csv')
)

# Parse every timestamp column up front so later cells can do date arithmetic
users_df = users_df.assign(
    **{
        date_column: pd.to_datetime(users_df[date_column])
        for date_column in ('signup_date', 'first_seen', 'last_seen')
    }
)
sessions_df = sessions_df.assign(session_start=pd.to_datetime(sessions_df['session_start']))
events_df = events_df.assign(event_time=pd.to_datetime(events_df['event_time']))

# Report how long loading took and how many rows each table holds
load_time = time.time() - start_time
print(f"Data loaded in {load_time:.2f} seconds")
print(f"Users: {len(users_df):,}")
print(f"Sessions: {len(sessions_df):,}")
print(f"Events: {len(events_df):,}")


In [None]:
def calculate_engagement_score_slow(users_df):
    """
    Add an ``engagement_score`` column to ``users_df``, scoring one row at a time.

    INEFFICIENT (on purpose): rows are visited with iterrows(), which is much
    slower than the equivalent vectorized column arithmetic.

    For each user the score is
    ``0.6 * (total_sessions / days) + 0.4 * (total_revenue / total_sessions)``
    where ``days`` is the whole-day gap between ``signup_date`` and ``last_seen``.
    The score is 0 when that gap is not positive, and the revenue term is 0 when
    the user has no sessions.

    Args:
        users_df: DataFrame with ``signup_date``, ``last_seen``,
            ``total_sessions`` and ``total_revenue`` columns.

    Returns:
        The same ``users_df`` object, modified in place with the new column.
    """
    print("Calculating engagement scores (inefficient method)...")
    started_at = time.time()

    def score_for(user):
        # Whole days between signup and the most recent activity
        active_days = (user['last_seen'] - user['signup_date']).days
        if not active_days > 0:
            return 0

        sessions = user['total_sessions']
        sessions_per_day = sessions / active_days
        revenue_per_session = user['total_revenue'] / sessions if sessions > 0 else 0
        return (sessions_per_day * 0.6) + (revenue_per_session * 0.4)

    # BAD: iterrows() builds a Series per row - very slow for large datasets
    users_df['engagement_score'] = [score_for(user) for _, user in users_df.iterrows()]

    elapsed_time = time.time() - started_at
    print(f"Engagement scores calculated in {elapsed_time:.2f} seconds")
    return users_df

# Score every user with the row-by-row implementation, then report the mean
users_df = calculate_engagement_score_slow(users_df)
mean_engagement_score = users_df['engagement_score'].mean()
print(f"Average engagement score: {mean_engagement_score:.4f}")


In [None]:
def categorize_email_domain_slow(email_domain):
    """
    Classify a single email domain as 'Personal', 'Corporate' or 'Other'.

    INEFFICIENT (on purpose): written to be called once per value through
    apply(); vectorized string operations would handle the whole column at once.

    The value is stringified, lower-cased and stripped, then matched by
    substring. Personal providers are checked before the corporate marker.

    Args:
        email_domain: Any value; non-strings are converted with str().

    Returns:
        'Personal' if the domain contains gmail, yahoo, hotmail or outlook;
        'Corporate' if it contains 'company'; otherwise 'Other'.
    """
    normalized = str(email_domain).lower().strip()

    personal_markers = ('gmail', 'yahoo', 'hotmail', 'outlook')
    if any(marker in normalized for marker in personal_markers):
        return 'Personal'
    if 'company' in normalized:
        return 'Corporate'
    return 'Other'

print("Categorizing email domains (inefficient method)...")
start_time = time.time()

# BAD: element-wise apply() where vectorized .str operations would be faster
users_df['email_category'] = users_df['email_domain'].apply(categorize_email_domain_slow)

# Provider is the upper-cased text before the first dot; values without a dot
# (including missing values, which stringify to 'nan') become 'UNKNOWN'
users_df['email_provider'] = users_df['email_domain'].apply(
    lambda domain: str(domain).partition('.')[0].upper() if '.' in str(domain) else 'UNKNOWN'
)

elapsed_time = time.time() - start_time
print(f"Email categorization completed in {elapsed_time:.2f} seconds")

# Show how many users landed in each category
print("\nEmail category distribution:")
print(users_df['email_category'].value_counts())


In [None]:
def calculate_user_metrics_slow(users_df, sessions_df, events_df):
    """
    Compute per-user session and event metrics with explicit Python loops.

    INEFFICIENT (on purpose): for every user the full sessions and events
    tables are re-scanned with iterrows(). This should be replaced with
    groupby operations; the loops are kept as the demonstrated anti-pattern.

    Args:
        users_df: DataFrame with a ``user_id`` column; one output row is
            produced per unique ``user_id``.
        sessions_df: DataFrame with ``user_id`` and ``session_duration``.
        events_df: DataFrame with ``user_id`` and ``event_type``.

    Returns:
        DataFrame with columns ``user_id``, ``avg_session_duration``,
        ``total_events``, ``purchases`` (events whose ``event_type`` is
        'purchase') and ``conversion_rate`` (purchases / total_events, 0 when
        the user has no events). The columns are present even when there are
        no users, so callers can always select them.
    """
    print("Calculating user metrics (inefficient nested loops)...")
    start_time = time.time()

    # Fixed output schema: without it an empty input yields a column-less frame
    # and downstream lookups such as result['conversion_rate'] raise KeyError.
    metric_columns = ['user_id', 'avg_session_duration', 'total_events',
                      'purchases', 'conversion_rate']

    user_metrics = []

    # BAD: Nested loops for aggregations - very slow!
    for user_id in users_df['user_id'].unique():
        user_sessions = []
        user_events = []

        # Inner loop 1: Find all sessions for this user
        for _, session in sessions_df.iterrows():
            if session['user_id'] == user_id:
                user_sessions.append(session)

        # Inner loop 2: Find all events for this user
        for _, event in events_df.iterrows():
            if event['user_id'] == user_id:
                user_events.append(event)

        # Calculate metrics manually instead of using built-in functions
        total_session_duration = 0
        for session in user_sessions:
            total_session_duration += session['session_duration']

        avg_session_duration = total_session_duration / len(user_sessions) if user_sessions else 0

        # Count events by type manually
        event_counts = {}
        for event in user_events:
            event_type = event['event_type']
            if event_type not in event_counts:
                event_counts[event_type] = 0
            event_counts[event_type] += 1

        # Calculate conversion rate manually
        purchases = event_counts.get('purchase', 0)
        total_events = len(user_events)
        conversion_rate = purchases / total_events if total_events > 0 else 0

        user_metrics.append({
            'user_id': user_id,
            'avg_session_duration': avg_session_duration,
            'total_events': total_events,
            'purchases': purchases,
            'conversion_rate': conversion_rate
        })

    elapsed_time = time.time() - start_time
    print(f"User metrics calculated in {elapsed_time:.2f} seconds")

    return pd.DataFrame(user_metrics, columns=metric_columns)

# Run the nested-loop aggregation and report headline numbers
user_metrics_df = calculate_user_metrics_slow(users_df, sessions_df, events_df)
print(f"Calculated metrics for {len(user_metrics_df)} users")
mean_conversion_rate = user_metrics_df['conversion_rate'].mean()
print(f"Average conversion rate: {mean_conversion_rate:.4f}")


In [None]:
def create_user_summary_slow(users_df, sessions_df, events_df):
    """
    Build one summary row per user from the user, session and event tables.

    INEFFICIENT (on purpose): creates multiple unnecessary intermediate
    DataFrames and performs step-by-step merges. Those patterns are kept as
    the demonstrated anti-pattern; only the column collision is fixed.

    Args:
        users_df: DataFrame with a ``user_id`` column plus any profile columns.
            It is copied, never modified.
        sessions_df: DataFrame with ``user_id``, ``session_duration``,
            ``pages_visited`` and ``revenue``.
        events_df: DataFrame with ``user_id``, ``event_id`` and ``value``.

    Returns:
        A new DataFrame holding the user columns, the per-user session and
        event aggregates (missing aggregates filled with 0) and ``user_tier``:
        'Premium' when total_revenue > 100, 'Standard' when > 50, else 'Basic'.
        When a user column has the same name as an aggregate (for example
        ``total_revenue``), the freshly aggregated value replaces it.
    """
    print("Creating user summary (memory-inefficient method)...")
    start_time = time.time()

    # BAD: Creating many intermediate DataFrames instead of chaining operations
    temp_df1 = users_df.copy()
    temp_df2 = temp_df1.copy()
    temp_df3 = temp_df2.copy()

    # BAD: Multiple separate operations instead of efficient chaining
    sessions_by_user = sessions_df.groupby('user_id').agg({
        'session_duration': ['mean', 'sum', 'count'],
        'pages_visited': ['mean', 'sum'],
        'revenue': ['sum', 'mean']
    }).reset_index()

    # BAD: Flattening column names inefficiently
    # (order must match the agg spec above: duration, pages, revenue)
    sessions_by_user.columns = ['user_id', 'avg_session_duration', 'total_session_duration', 
                               'session_count', 'avg_pages_visited', 'total_pages_visited',
                               'total_revenue', 'avg_revenue_per_session']

    # BAD: Creating another intermediate DataFrame
    events_by_user = events_df.groupby('user_id').agg({
        'event_id': 'count',
        'value': ['sum', 'mean']
    }).reset_index()

    events_by_user.columns = ['user_id', 'total_events', 'total_event_value', 'avg_event_value']

    # BAD: Inefficient merges - creating copies each time
    result_df = temp_df3.copy()

    # FIX: the user table can already carry a column that an aggregate
    # recomputes (e.g. total_revenue). Merging would then emit
    # total_revenue_x / total_revenue_y and every later
    # result_df['total_revenue'] lookup would raise KeyError. Drop the
    # user-table copy so the aggregated value is the single source of truth.
    recomputed_columns = (set(sessions_by_user.columns) | set(events_by_user.columns)) - {'user_id'}
    superseded_columns = [col for col in result_df.columns if col in recomputed_columns]
    result_df = result_df.drop(columns=superseded_columns)

    result_df = result_df.merge(sessions_by_user, on='user_id', how='left')
    result_df = result_df.merge(events_by_user, on='user_id', how='left')

    # BAD: Filling NaN values inefficiently
    # (users without sessions/events get NaN from the left merges)
    numeric_columns = ['avg_session_duration', 'total_session_duration', 'session_count',
                      'avg_pages_visited', 'total_pages_visited', 'total_revenue',
                      'avg_revenue_per_session', 'total_events', 'total_event_value',
                      'avg_event_value']

    for col in numeric_columns:
        result_df[col] = result_df[col].fillna(0)

    # BAD: More unnecessary intermediate DataFrames
    final_df = result_df.copy()

    # BAD: Inefficient categorical operations
    final_df['user_tier'] = final_df['total_revenue'].apply(
        lambda x: 'Premium' if x > 100 else 'Standard' if x > 50 else 'Basic'
    )

    elapsed_time = time.time() - start_time
    print(f"User summary created in {elapsed_time:.2f} seconds")

    return final_df

# Build the per-user summary with the copy-heavy implementation
user_summary_df = create_user_summary_slow(users_df, sessions_df, events_df)
print(f"Created summary for {len(user_summary_df)} users")

# Deep memory usage (includes string contents), converted from bytes to MB
summary_megabytes = user_summary_df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {summary_megabytes:.2f} MB")

# Show user tier distribution
print("\nUser tier distribution:")
print(user_summary_df['user_tier'].value_counts())


In [None]:
# Preview the first ten users, restricted to the headline columns
summary_columns = ['user_id', 'email_category', 'engagement_score', 'user_tier',
                   'total_revenue', 'session_count', 'total_events']
print("Sample of final user summary:")
print(user_summary_df[summary_columns].head(10))

# Banner around the recap of the anti-patterns demonstrated in this notebook
divider = "=" * 60
print("\n" + divider)
print("PERFORMANCE ISSUES IDENTIFIED:")
print(divider)

# 1. Row-by-row iteration
print("1. ❌ Using iterrows() instead of vectorized operations")
print("   - Slows down engagement score calculation significantly")
print("   - Should use pandas vectorized operations instead")

# 2. Element-wise string handling
print("\n2. ❌ Inefficient string operations with apply()")
print("   - Email categorization uses apply() with complex logic")
print("   - Should use pandas .str methods and .map() instead")

# 3. Hand-written aggregation
print("\n3. ❌ Nested loops for aggregations")
print("   - User metrics calculation uses nested loops")
print("   - Should use groupby() operations instead")

# 4. Redundant copies
print("\n4. ❌ Memory-inefficient operations")
print("   - Creating multiple unnecessary intermediate DataFrames")
print("   - Should chain operations and avoid copying data")

# 5. Element-wise tiering
print("\n5. ❌ Inefficient categorical operations")
print("   - Using apply() with lambda for simple conditions")
print("   - Should use pandas cut() or numpy.select() instead")

# Closing remarks (the figures below are estimates, not measured in this notebook)
print("\n💡 These patterns can be optimized using modern pandas techniques!")
print("💡 Expected performance improvement: 5-10x faster execution")
print("💡 Expected memory reduction: 50-70% less memory usage")
