# User Behavior Analysis Dashboard

This notebook analyzes user behavior data to understand engagement patterns and conversion rates.

**Note:** This is a messy notebook that needs refactoring!


In [None]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Load data. The CSV location can be overridden with the
# USER_BEHAVIOR_DATA_PATH environment variable so the notebook can run on
# machines other than the original author's; when the variable is unset the
# original absolute path is used, so existing setups keep working.
DEFAULT_DATA_PATH = '/Users/jgilhuly/Documents/dev/demos/demo-repo/notebook-to-package/user_behavior_data.csv'
DATA_PATH = os.environ.get('USER_BEHAVIOR_DATA_PATH', DEFAULT_DATA_PATH)
df = pd.read_csv(DATA_PATH)

# Quick data exploration: dimensions, column names, and a preview of the rows.
print("Data shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders the preview table.
df.head()


In [None]:
# Data cleaning and preparation.
# Parse the timestamp column and derive the hour of day and the day of week
# (pandas numbers weekdays with Monday=0) for the time-based analysis.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek

# Remove fully duplicated rows.
df = df.drop_duplicates()

# Overview plots: row counts per device type (bar) and per referrer type (pie).
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
df['device_type'].value_counts().plot(kind='bar')
plt.title('Device Type Distribution')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
df['referrer_type'].value_counts().plot(kind='pie')
plt.title('Referrer Type Distribution')
plt.tight_layout()
plt.show()

# The overall conversion rate is a single number, so keep it as a scalar.
# Writing it into a DataFrame column repeated the same value on every row and
# made the print below fail with an IndexError on an empty frame.
overall_conversion_rate = df['conversion'].mean()
print(f"Overall conversion rate: {overall_conversion_rate:.2%}")


In [None]:
# User engagement analysis - with hardcoded values everywhere!
def analyze_engagement(data, min_page_views=5, min_time_spent_minutes=15.0):
    """Return the rows of ``data`` that count as high engagement.

    A row qualifies when it reaches at least one of the two thresholds; the
    conditions are OR-ed, so meeting either one is enough.

    Args:
        data: DataFrame with ``page_views`` and ``time_spent_minutes`` columns.
        min_page_views: Smallest ``page_views`` value that qualifies a row.
        min_time_spent_minutes: Smallest ``time_spent_minutes`` value that
            qualifies a row.

    Returns:
        The qualifying rows of ``data``, keeping their original index labels.
    """
    enough_pages = data['page_views'] >= min_page_views
    enough_time = data['time_spent_minutes'] >= min_time_spent_minutes
    return data[enough_pages | enough_time]

# Count the rows that meet the default high-engagement thresholds.
high_eng_users = analyze_engagement(df)
print(f"High engagement users: {len(high_eng_users)}")

# Split the data into one subset per device type.
device_column = df['device_type']
desktop_users = df[device_column == 'desktop']
mobile_users = df[device_column == 'mobile']
tablet_users = df[device_column == 'tablet']

# Mean of the conversion column within each device subset.
print(f"Desktop conversion rate: {desktop_users['conversion'].mean():.2%}")
print(f"Mobile conversion rate: {mobile_users['conversion'].mean():.2%}")
print(f"Tablet conversion rate: {tablet_users['conversion'].mean():.2%}")

# Bar chart of the same metric for every device type present in the data.
device_conversion = df.groupby('device_type')['conversion'].mean()
plt.figure(figsize=(8, 6))
device_conversion.plot(kind='bar')
plt.title('Conversion Rate by Device Type')
plt.ylabel('Conversion Rate')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Time-based analysis: average of each metric per hour of day.
hourly_data = (
    df.groupby('hour')
    .agg(dict.fromkeys(['page_views', 'time_spent_minutes', 'conversion'], 'mean'))
    .reset_index()
)

# One line chart per metric on a 2x2 grid (the fourth slot stays empty).
# Each entry is (column, chart title, y-axis label).
hourly_panels = [
    ('page_views', 'Avg Page Views by Hour', 'Page Views'),
    ('time_spent_minutes', 'Avg Time Spent by Hour', 'Time (minutes)'),
    ('conversion', 'Conversion Rate by Hour', 'Conversion Rate'),
]

plt.figure(figsize=(12, 8))
for position, (metric, title, y_label) in enumerate(hourly_panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(hourly_data['hour'], hourly_data[metric])
    plt.title(title)
    plt.xlabel('Hour of Day')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

# Peak hours analysis.
# NOTE(review): these hours are hand-picked rather than derived from the
# data; confirm the intended definition of "peak" before computing them.
peak_hours = [10, 14, 16, 18, 20]
in_peak_hours = df['hour'].isin(peak_hours)
peak_data = df[in_peak_hours]
print(f"Peak hours data: {len(peak_data)} records")
print(f"Peak hours conversion: {peak_data['conversion'].mean():.2%}")


In [None]:
# Geographic analysis - more mixed concerns!
def calculate_country_stats(df):
    """Summarise page views, time spent and conversions per country.

    Args:
        df: DataFrame with ``country``, ``page_views``,
            ``time_spent_minutes``, ``conversion`` and ``user_id`` columns.

    Returns:
        DataFrame with one row per country and the columns ``country``,
        ``avg_page_views``, ``total_page_views``, ``avg_time_spent``,
        ``total_time_spent``, ``avg_conversion``, ``total_conversions`` and
        ``user_count``. ``user_count`` is the number of rows with a non-null
        ``user_id``, not the number of distinct users.
    """
    per_country = df.groupby('country')
    # Named aggregation gives every output column its final name directly.
    summary = per_country.agg(
        avg_page_views=('page_views', 'mean'),
        total_page_views=('page_views', 'sum'),
        avg_time_spent=('time_spent_minutes', 'mean'),
        total_time_spent=('time_spent_minutes', 'sum'),
        avg_conversion=('conversion', 'mean'),
        total_conversions=('conversion', 'sum'),
        user_count=('user_id', 'count'),
    )
    # Turn the country group key back into a regular column.
    return summary.reset_index()

# Per-country summary table, printed with the best-converting country first.
country_analysis = calculate_country_stats(df)
print("Country Analysis:")
print(country_analysis.sort_values('avg_conversion', ascending=False))

# Side-by-side bar charts, indexed by country.
# Each entry is (column, chart title, y-axis label).
stats_by_country = country_analysis.set_index('country')
country_panels = [
    ('avg_conversion', 'Average Conversion Rate by Country', 'Conversion Rate'),
    ('user_count', 'User Count by Country', 'Users'),
]

plt.figure(figsize=(10, 6))
for position, (column, title, y_label) in enumerate(country_panels, start=1):
    plt.subplot(1, 2, position)
    stats_by_country[column].plot(kind='bar')
    plt.title(title)
    plt.ylabel(y_label)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Per-device averages of page views, time spent and conversion.
device_stats = (
    df.groupby('device_type')
    .agg(dict.fromkeys(['page_views', 'time_spent_minutes', 'conversion'], 'mean'))
    .reset_index()
)

print("\nDevice Analysis (duplicate!):")
print(device_stats)


In [None]:
# Final analysis section - inconsistent naming and more duplicates!
def analyze_referrer_performance(dataframe):
    """Aggregate engagement and conversion metrics per referrer type.

    Args:
        dataframe: DataFrame with ``referrer_type``, ``page_views``,
            ``time_spent_minutes``, ``conversion``, ``bounce_rate`` and
            ``user_id`` columns.

    Returns:
        DataFrame with one row per referrer type and the columns
        ``referrer``, ``avg_pages``, ``avg_time``, ``conv_rate`` and
        ``bounce`` (group means) plus ``users`` (number of rows with a
        non-null ``user_id``).
    """
    # Named aggregation ties each output name to its source column, so the
    # result does not depend on the order of a separate column-name list.
    referrer_perf = dataframe.groupby('referrer_type').agg(
        avg_pages=('page_views', 'mean'),
        avg_time=('time_spent_minutes', 'mean'),
        conv_rate=('conversion', 'mean'),
        bounce=('bounce_rate', 'mean'),
        users=('user_id', 'count'),
    )
    return referrer_perf.reset_index().rename(columns={'referrer_type': 'referrer'})


# Backward-compatible alias: existing callers use the original camelCase name.
analyzeReferrerPerformance = analyze_referrer_performance

# Per-referrer metrics, printed with the best-converting referrer first.
referrer_data = analyzeReferrerPerformance(df)
print("Referrer Performance:")
print(referrer_data.sort_values('conv_rate', ascending=False))

# Engagement score: weighted sum of average pages (0.3), average time (0.4)
# and the non-bounce share, 1 - bounce (0.3).
weighted_pages = referrer_data['avg_pages'] * 0.3
weighted_time = referrer_data['avg_time'] * 0.4
weighted_retention = (1 - referrer_data['bounce']) * 0.3
referrer_data['engagement_score'] = weighted_pages + weighted_time + weighted_retention

print("\nEngagement Scores:")
engagement_ranking = referrer_data[['referrer', 'engagement_score']]
print(engagement_ranking.sort_values('engagement_score', ascending=False))

# One bar chart per metric on a 2x3 grid (the last two slots stay empty).
# Each entry is (column, chart title).
referrer_panels = [
    ('conv_rate', 'Conversion Rate by Referrer'),
    ('avg_pages', 'Avg Page Views by Referrer'),
    ('bounce', 'Bounce Rate by Referrer'),
    ('engagement_score', 'Engagement Score by Referrer'),
]
metrics_by_referrer = referrer_data.set_index('referrer')

plt.figure(figsize=(15, 10))
for position, (column, title) in enumerate(referrer_panels, start=1):
    plt.subplot(2, 3, position)
    metrics_by_referrer[column].plot(kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Final summary of the headline numbers.
print("\n=== FINAL SUMMARY ===")
print(f"Total users analyzed: {df['user_id'].nunique()}")
print(f"Total sessions: {len(df)}")
print(f"Overall conversion rate: {df['conversion'].mean():.2%}")
print(f"Average session duration: {df['time_spent_minutes'].mean():.1f} minutes")
print(f"Average pages per session: {df['page_views'].mean():.1f}")

# Best/worst performers by mean conversion; each grouping is computed once
# and reused for both extremes.
country_conversion = df.groupby('country')['conversion'].mean()
best_country = country_conversion.idxmax()
worst_country = country_conversion.idxmin()
print(f"Best performing country: {best_country}")
print(f"Worst performing country: {worst_country}")

device_type_conversion = df.groupby('device_type')['conversion'].mean()
best_device = device_type_conversion.idxmax()
worst_device = device_type_conversion.idxmin()
print(f"Best performing device: {best_device}")
print(f"Worst performing device: {worst_device}")
