# Data Analysis - Finding Insights

## Learning Objectives
By the end of this lesson, you will be able to:
- Explore and understand your data
- Find patterns and relationships
- Clean messy data properly
- Make data-driven conclusions
- Present findings clearly

## Core Concepts
- **EDA**: Exploratory Data Analysis - first look at data
- **Statistics**: Mean, median, correlation to understand data
- **Outliers**: Unusual values that may be data errors or genuinely extreme cases (e.g. very high spenders)
- **Missing Data**: Handling gaps in your dataset
- **Insights**: Meaningful conclusions from data

## 1. Exploring Your Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Build a synthetic 500-customer table. The seed makes every run reproducible.
# NOTE: the random draws below must stay in this exact order - each call
# advances NumPy's global random state, so reordering changes the data.
np.random.seed(42)
customers = pd.DataFrame(dict(
    customer_id=range(1, 501),
    age=np.random.normal(35, 12, 500).clip(18, 70).astype(int),
    income=np.random.lognormal(10.5, 0.4, 500).clip(25000, 150000),
    monthly_spend=np.random.gamma(2, 150, 500),
    satisfaction=np.random.normal(3.5, 0.8, 500).clip(1, 5),
    region=np.random.choice(['North', 'South', 'East', 'West'], 500),
    membership_months=np.random.randint(1, 60, 500),
))

# Blank out 25 distinct income values so there is missing data to handle later
missing_rows = np.random.choice(500, 25, replace=False)
customers.loc[missing_rows, 'income'] = np.nan

# --- Text overview: sample rows, size, dtypes, summary stats ---
print("First look at the data:")
print(customers.head())
print(f"\nDataset shape: {customers.shape}")
print(f"Data types:\n{customers.dtypes}")

print("\nBasic statistics:")
print(customers.describe())

# Only list the columns that actually contain gaps
print("\nMissing values:")
missing = customers.isna().sum()
print(missing.loc[missing > 0])

# --- Visual overview: one 2x2 grid of quick plots ---
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
age_ax, income_ax = axes[0]
region_ax, spend_ax = axes[1]

# Top-left: how ages are distributed
age_ax.hist(customers['age'], bins=20, alpha=0.7, color='skyblue')
age_ax.set(title='Age Distribution', xlabel='Age')

# Top-right: does income relate to spending?
income_ax.scatter(customers['income'], customers['monthly_spend'], alpha=0.6)
income_ax.set(
    title='Income vs Monthly Spending',
    xlabel='Income ($)',
    ylabel='Monthly Spend ($)',
)

# Bottom-left: satisfaction spread per region
sns.boxplot(data=customers, x='region', y='satisfaction', ax=region_ax)
region_ax.set_title('Satisfaction by Region')

# Bottom-right: how monthly spend is distributed
spend_ax.hist(customers['monthly_spend'], bins=25, alpha=0.7, color='lightgreen')
spend_ax.set(title='Monthly Spending Distribution', xlabel='Monthly Spend ($)')

plt.tight_layout()
plt.show()

print("Data exploration completed - we can see patterns forming!")

## 2. Finding Patterns and Relationships

In [None]:
# Correlation analysis - what's related to what?
# Expects the `customers` DataFrame built in section 1.
# Only numeric columns can be correlated, so non-numeric ones are dropped first.
correlation_matrix = customers.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('How Variables Are Related')
plt.show()

print("Strong correlations (>0.5 or <-0.5):")
# Walk the upper triangle only, so each pair is reported once and the
# diagonal (every column against itself) is skipped.
numeric_columns = correlation_matrix.columns
strong_pairs = [
    (first, second, correlation_matrix.loc[first, second])
    for position, first in enumerate(numeric_columns)
    for second in numeric_columns[position + 1:]
    if abs(correlation_matrix.loc[first, second]) > 0.5
]
for first, second, corr_value in strong_pairs:
    print(f"{first} & {second}: {corr_value:.2f}")
if not strong_pairs:
    # Say so explicitly; a bare heading would look like the code did not run.
    print("  (none found)")

# Group analysis - compare different segments
print("\nRegional differences:")
regional_stats = customers.groupby('region').agg({
    'age': 'mean',
    'income': 'mean',
    'monthly_spend': 'mean',
    'satisfaction': 'mean'
}).round(1)
print(regional_stats)

# Age group analysis: (0, 30] -> Young, (30, 50] -> Middle, (50, 100] -> Senior
customers['age_group'] = pd.cut(customers['age'],
                                bins=[0, 30, 50, 100],
                                labels=['Young', 'Middle', 'Senior'])

# observed=False is stated explicitly: grouping on a categorical column without
# it relies on a pandas default that is deprecated and scheduled to change.
# Keeping it False means every age group appears in the table, even if empty.
age_analysis = customers.groupby('age_group', observed=False).agg({
    'monthly_spend': ['mean', 'count'],
    'satisfaction': 'mean'
}).round(1)
print("\nAge group analysis:")
print(age_analysis)

# Find outliers in spending using the upper IQR fence (Q3 + 1.5 * IQR)
Q1 = customers['monthly_spend'].quantile(0.25)
Q3 = customers['monthly_spend'].quantile(0.75)
IQR = Q3 - Q1
outlier_threshold = Q3 + 1.5 * IQR

outliers = customers[customers['monthly_spend'] > outlier_threshold]
print(f"\nHigh spenders (outliers): {len(outliers)} customers")
print(f"They spend on average: ${outliers['monthly_spend'].mean():.0f}")
# The median of ALL customers is used as the "typical" figure because it is
# barely moved by the few extreme spenders.
print(f"Normal customers spend: ${customers['monthly_spend'].median():.0f}")

# Customer satisfaction insights
low_satisfaction = customers[customers['satisfaction'] < 3]
high_satisfaction = customers[customers['satisfaction'] > 4]

print("\nSatisfaction insights:")
print(f"Low satisfaction customers ({len(low_satisfaction)}): avg spend ${low_satisfaction['monthly_spend'].mean():.0f}")
print(f"High satisfaction customers ({len(high_satisfaction)}): avg spend ${high_satisfaction['monthly_spend'].mean():.0f}")

# Visualize key relationships
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Age vs satisfaction
sns.scatterplot(data=customers, x='age', y='satisfaction', ax=axes[0])
axes[0].set_title('Age vs Satisfaction')

# Income vs spending by region
sns.scatterplot(data=customers, x='income', y='monthly_spend', hue='region', ax=axes[1])
axes[1].set_title('Income vs Spending by Region')

# Spending by age group
sns.boxplot(data=customers, x='age_group', y='monthly_spend', ax=axes[2])
axes[2].set_title('Spending by Age Group')

plt.tight_layout()
plt.show()

# NOTE(review): this summary is a fixed string, not derived from the statistics
# computed above - check it against the printed correlations before relying on it.
print("Key insights: Age affects satisfaction, income drives spending, regional differences exist")

## 3. Data Cleaning and Business Insights

In [None]:
# Report how many income values are missing before any imputation happens
print("Data cleaning:")
missing_income_count = customers['income'].isna().sum()
print(f"Missing income values: {missing_income_count}")

# Fill missing incomes based on age and spending patterns
# Fill missing incomes based on age and spending patterns
def estimate_income(row):
    """Return the row's income, estimating it from similar customers if missing.

    Intended for ``customers.apply(estimate_income, axis=1)``; reads the
    module-level ``customers`` DataFrame to find comparable customers.

    Args:
        row: A mapping-like record with 'income', 'age' and 'monthly_spend'.

    Returns:
        The existing income when present. Otherwise the median income of
        customers within 5 years of age and within 20% of monthly spend
        who have a known income, or the overall median income when no such
        customers exist.
    """
    # Nothing to impute: pass the recorded income straight through
    if not pd.isna(row['income']):
        return row['income']

    # Build each similarity condition separately so the filter reads clearly
    age_match = customers['age'].between(row['age'] - 5, row['age'] + 5)
    spend_match = customers['monthly_spend'].between(
        row['monthly_spend'] * 0.8, row['monthly_spend'] * 1.2
    )
    has_income = customers['income'].notna()
    peer_incomes = customers.loc[age_match & spend_match & has_income, 'income']

    # No comparable customers: fall back to the overall median
    if peer_incomes.empty:
        return customers['income'].median()
    return peer_incomes.median()

# Impute missing incomes row by row; rows that already have an income pass
# through unchanged (see estimate_income).
customers['income_clean'] = customers.apply(estimate_income, axis=1)
print(f"After cleaning: {customers['income_clean'].isnull().sum()} missing values")

# Create customer value score: spend x tenure x satisfaction
customers['customer_value'] = (
    customers['monthly_spend'] * customers['membership_months'] * customers['satisfaction']
)

# Segment customers. With an integer `bins`, pd.cut splits the value RANGE
# into three equal-width intervals, so the segments can be very unequal in size.
customers['value_segment'] = pd.cut(customers['customer_value'],
                                    bins=3,
                                    labels=['Low Value', 'Medium Value', 'High Value'])

# observed=False is stated explicitly: grouping on a categorical column without
# it relies on a pandas default that is deprecated and scheduled to change.
# Keeping it False means all three segments appear in the summary, even if empty.
segment_summary = customers.groupby('value_segment', observed=False).agg({
    'customer_id': 'count',
    'monthly_spend': 'mean',
    'satisfaction': 'mean',
    'membership_months': 'mean'
}).round(1)

print("\nCustomer segments:")
print(segment_summary)

# Business recommendations based on analysis
high_value = customers[customers['value_segment'] == 'High Value']
low_satisfaction = customers[customers['satisfaction'] < 3]
young_customers = customers[customers['age'] < 30]

print("\nBusiness insights:")
print(f"1. High value customers: {len(high_value)} customers generate most revenue")
print(f"   - Average spend: ${high_value['monthly_spend'].mean():.0f}")
print(f"   - Average satisfaction: {high_value['satisfaction'].mean():.1f}")

print(f"\n2. At-risk customers: {len(low_satisfaction)} have low satisfaction")
print(f"   - Risk losing ${low_satisfaction['monthly_spend'].sum():,.0f}/month")
print("   - Focus on improving their experience")

print(f"\n3. Young customers: {len(young_customers)} under 30")
print(f"   - Spend less but growth potential: ${young_customers['monthly_spend'].mean():.0f}")
print("   - Target for loyalty programs")

# Visualize business insights
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Customer value segments. Zero-count segments are dropped so the pie does not
# draw empty wedges with overlapping "0.0%" labels.
segment_counts = customers['value_segment'].value_counts()
segment_counts = segment_counts[segment_counts > 0]
axes[0,0].pie(segment_counts.values, labels=segment_counts.index, autopct='%1.1f%%')
axes[0,0].set_title('Customer Value Distribution')

# Satisfaction vs spending, one colour per segment. Iterating the colour map
# (not .unique()) gives a fixed Low -> Medium -> High legend order that does
# not depend on which segment happens to appear first in the data.
colors = {'Low Value': 'red', 'Medium Value': 'orange', 'High Value': 'green'}
for segment, color in colors.items():
    data = customers[customers['value_segment'] == segment]
    if data.empty:
        continue  # nothing to plot; avoid an empty legend entry
    axes[0,1].scatter(data['satisfaction'], data['monthly_spend'],
                      label=segment, color=color, alpha=0.6)
axes[0,1].set_xlabel('Satisfaction')
axes[0,1].set_ylabel('Monthly Spend')
axes[0,1].set_title('Satisfaction vs Spending by Segment')
axes[0,1].legend()

# Age distribution by satisfaction level: (0, 3] Low, (3, 4] Medium, (4, 5] High
customers['satisfaction_level'] = pd.cut(customers['satisfaction'],
                                         bins=[0, 3, 4, 5],
                                         labels=['Low', 'Medium', 'High'])
sns.boxplot(data=customers, x='satisfaction_level', y='age', ax=axes[1,0])
axes[1,0].set_title('Age Distribution by Satisfaction')

# Monthly revenue by region
revenue_by_region = customers.groupby('region')['monthly_spend'].sum()
axes[1,1].bar(revenue_by_region.index, revenue_by_region.values, color='lightblue')
axes[1,1].set_title('Total Monthly Revenue by Region')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nAction plan:")
print("• Retain high-value customers with VIP program")
print("• Address low satisfaction issues immediately")
print("• Develop young customer loyalty initiatives")
print("• Focus marketing on high-performing regions")

## Practice Exercises

Apply data analysis concepts to real-world scenarios:

In [None]:
# Exercise 1: Sales team analysis
# One record per salesperson; the column names are given once below.
sales_team = pd.DataFrame(
    [
        ('Alice', 3, 45000, 120, 'North'),
        ('Bob', 7, 78000, 200, 'South'),
        ('Charlie', 2, 32000, 90, 'North'),
        ('Diana', 5, 65000, 150, 'East'),
        ('Eve', 8, 89000, 180, 'South'),
    ],
    columns=['salesperson', 'experience_years', 'monthly_sales', 'customer_calls', 'region'],
)

print("Sales team analysis:")
# idxmax gives the row label of the highest monthly sales figure
top_row = sales_team['monthly_sales'].idxmax()
print(f"Top performer: {sales_team.at[top_row, 'salesperson']}")
print(f"Average sales: ${sales_team['monthly_sales'].mean():,.0f}")

# Do more customer calls go together with more sales?
calls = sales_team['customer_calls']
sales = sales_team['monthly_sales']
correlation = calls.corr(sales)
print(f"Calls vs Sales correlation: {correlation:.2f}")

# Average monthly sales per region, then pick the best one
region_performance = sales_team.groupby('region')['monthly_sales'].mean()
print(f"\nBest performing region: {region_performance.idxmax()}")

# Exercise 2: Website analytics
# NOTE: no seed is set here, so these draws continue from the current state of
# NumPy's global RNG. Keep the column order - each call advances that state.
website_data = pd.DataFrame(dict(
    day=range(1, 31),
    visitors=np.random.randint(800, 1200, 30),
    page_views=np.random.randint(2000, 4000, 30),
    bounce_rate=np.random.uniform(0.2, 0.6, 30),
    conversions=np.random.randint(10, 50, 30),
))

print("\nWebsite analytics:")
print(f"Average daily visitors: {website_data['visitors'].mean():.0f}")
best_row = website_data['conversions'].idxmax()
print(f"Best day (most conversions): Day {website_data.loc[best_row, 'day']}")

# Conversion rate = share of that day's visitors who converted
website_data['conversion_rate'] = website_data['conversions'].div(website_data['visitors'])
print(f"Average conversion rate: {website_data['conversion_rate'].mean():.1%}")

# "High-performing" = strictly above the median daily conversion rate
median_rate = website_data['conversion_rate'].median()
good_days = website_data.loc[website_data['conversion_rate'] > median_rate]
print(f"High-performing days: {len(good_days)} out of 30")

# Exercise 3: Product feedback analysis
# Ten reviews cycling through products A/B/C; customer type alternates New/Returning.
feedback = pd.DataFrame({
    'product': ['A', 'B', 'C'] * 3 + ['A'],
    'rating': [4.5, 3.2, 4.8, 4.1, 3.5, 4.6, 4.3, 3.8, 4.9, 4.0],
    'price': [100, 150, 200] * 3 + [100],
    'customer_type': ['New', 'Returning'] * 5,
})

print("\nProduct analysis:")
product_ratings = feedback.groupby('product')['rating'].mean()
print("Product ratings:")
for product in product_ratings.index:
    rating = product_ratings[product]
    print(f"  Product {product}: {rating:.1f}")

# Product with the highest average rating
best_product = product_ratings.idxmax()
print(f"Highest rated product: {best_product}")

# Do new and returning customers rate differently?
customer_analysis = feedback.groupby('customer_type')['rating'].mean()
print("\nRatings by customer type:")
print(customer_analysis.round(1))

print("Data analysis practice completed!")

In [None]:
# Exercise 1: E-commerce sales analysis
# SciPy provides the hypothesis tests used below. It is imported here because
# no scipy import is visible earlier in this notebook; without it the
# statistical section fails with NameError.
from scipy.stats import pearsonr, ttest_ind

np.random.seed(123)
ecommerce_data = pd.DataFrame({
    'order_id': range(1, 2001),
    'customer_id': np.random.randint(1, 501, 2000),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], 2000),
    'order_value': np.random.gamma(2, 50, 2000),
    'discount_percent': np.random.choice([0, 5, 10, 15, 20], 2000, p=[0.4, 0.25, 0.2, 0.1, 0.05]),
    'shipping_cost': np.random.uniform(5, 25, 2000),
    # Lowercase 'h' is the current hourly alias; uppercase 'H' is deprecated in recent pandas.
    'order_date': pd.date_range('2023-01-01', periods=2000, freq='h'),
    # Stored as float from the start: some ratings are set to NaN below, and an
    # integer column cannot hold NaN without an implicit dtype change.
    'customer_rating': np.random.choice([1, 2, 3, 4, 5], 2000, p=[0.05, 0.1, 0.2, 0.35, 0.3]).astype(float),
    'delivery_days': np.random.poisson(3, 2000) + 1
})

# Add some missing values and outliers
missing_idx = np.random.choice(2000, 100, replace=False)
ecommerce_data.loc[missing_idx, 'customer_rating'] = np.nan

# Inflate 20 random orders tenfold to simulate extreme order values
outlier_idx = np.random.choice(2000, 20, replace=False)
ecommerce_data.loc[outlier_idx, 'order_value'] *= 10

print("E-commerce Sales Analysis")
print("=" * 30)
print(f"Dataset shape: {ecommerce_data.shape}")
print(f"Date range: {ecommerce_data['order_date'].min()} to {ecommerce_data['order_date'].max()}")

# Basic EDA
print("\nBasic Statistics:")
print(ecommerce_data.describe())

# Missing value analysis
print("\nMissing Values:")
print(ecommerce_data.isnull().sum())

# Category analysis
print("\nSales by Category:")
category_sales = ecommerce_data.groupby('product_category').agg({
    'order_value': ['sum', 'mean', 'count'],
    'customer_rating': 'mean'
}).round(2)
# Flatten the two-level column header produced by the multi-function agg
category_sales.columns = ['Total_Sales', 'Avg_Order_Value', 'Order_Count', 'Avg_Rating']
print(category_sales)

# Time-based analysis
ecommerce_data['month'] = ecommerce_data['order_date'].dt.month
ecommerce_data['day_of_week'] = ecommerce_data['order_date'].dt.day_name()
ecommerce_data['hour'] = ecommerce_data['order_date'].dt.hour

print("\nMonthly Sales Trend:")
monthly_sales = ecommerce_data.groupby('month')['order_value'].sum()
print(monthly_sales)

# Customer analysis: one row per customer with spend, order count and dates
customer_metrics = ecommerce_data.groupby('customer_id').agg({
    'order_value': ['sum', 'count', 'mean'],
    'customer_rating': 'mean',
    'order_date': ['min', 'max']
}).round(2)

customer_metrics.columns = ['Total_Spent', 'Order_Count', 'Avg_Order_Value', 'Avg_Rating', 'First_Order', 'Last_Order']

# Identify VIP customers (top 10% by spending)
vip_threshold = customer_metrics['Total_Spent'].quantile(0.9)
vip_customers = customer_metrics[customer_metrics['Total_Spent'] >= vip_threshold]

print("\nCustomer Insights:")
print(f"Total unique customers: {len(customer_metrics)}")
print(f"VIP customers (top 10%): {len(vip_customers)}")
print(f"VIP threshold: ${vip_threshold:.2f}")
print(f"VIP customers contribute {(vip_customers['Total_Spent'].sum() / customer_metrics['Total_Spent'].sum() * 100):.1f}% of total revenue")

# Statistical tests
print("\nStatistical Analysis:")

# Test if Electronics orders have higher value than Books
electronics_orders = ecommerce_data[ecommerce_data['product_category'] == 'Electronics']['order_value']
books_orders = ecommerce_data[ecommerce_data['product_category'] == 'Books']['order_value']

t_stat, p_value = ttest_ind(electronics_orders, books_orders)
print("Electronics vs Books order value:")
print(f"  Electronics mean: ${electronics_orders.mean():.2f}")
print(f"  Books mean: ${books_orders.mean():.2f}")
print(f"  T-test p-value: {p_value:.4f}")
print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

# Correlation between order value and rating.
# Only orders that actually have a rating are used. Filling the gaps with the
# mean rating would add 100 artificial points with no variation, pulling the
# correlation toward zero and overstating the sample size behind the p-value.
rated_orders = ecommerce_data.dropna(subset=['customer_rating'])
order_rating_corr, corr_p_value = pearsonr(
    rated_orders['order_value'],
    rated_orders['customer_rating']
)
print(f"\nOrder value vs Customer rating correlation: {order_rating_corr:.4f} (p-value: {corr_p_value:.4f})")

# Exercise 2: Marketing campaign effectiveness
# SciPy's chi-square test is imported here because no scipy import is visible
# earlier in this notebook; without it the test below fails with NameError.
from scipy.stats import chi2_contingency

# NOTE: 'campaign_response' and 'conversion_value' are drawn independently, so
# a simulated customer can have a conversion value without having responded.
campaign_data = pd.DataFrame({
    'customer_id': range(1, 1001),
    'campaign_type': np.random.choice(['Email', 'Social Media', 'Direct Mail', 'Control'], 1000),
    'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+'], 1000),
    'previous_purchases': np.random.poisson(3, 1000),
    'campaign_response': np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
    # About 30% of customers get a gamma-distributed purchase value; the rest get 0
    'conversion_value': np.where(
        np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
        np.random.gamma(2, 100, 1000),
        0
    ),
    'days_since_last_purchase': np.random.exponential(30, 1000).astype(int)
})

print("\n\nMarketing Campaign Analysis")
print("=" * 30)

# Campaign effectiveness by type
campaign_effectiveness = campaign_data.groupby('campaign_type').agg({
    'campaign_response': ['count', 'sum', 'mean'],
    'conversion_value': ['sum', 'mean']
}).round(3)

# Flatten the two-level column header produced by the multi-function agg
campaign_effectiveness.columns = ['Total_Reached', 'Responses', 'Response_Rate', 'Total_Revenue', 'Revenue_Per_Customer']
print("Campaign Effectiveness:")
print(campaign_effectiveness)

# Calculate ROI (assuming campaign costs)
campaign_costs = {'Email': 5, 'Social Media': 15, 'Direct Mail': 25, 'Control': 0}
campaign_effectiveness['Campaign_Cost_Per_Customer'] = [campaign_costs[idx] for idx in campaign_effectiveness.index]

# ROI = (revenue - cost) / cost. The Control group costs nothing, so its ROI is
# undefined. Its cost is masked to NaN in the denominator so the result is
# NaN ("not applicable") rather than an infinite value from dividing by zero.
revenue_per_customer = campaign_effectiveness['Revenue_Per_Customer']
cost_per_customer = campaign_effectiveness['Campaign_Cost_Per_Customer']
campaign_effectiveness['ROI'] = (
    (revenue_per_customer - cost_per_customer)
    / cost_per_customer.where(cost_per_customer > 0)
)

print("\nROI Analysis:")
print(campaign_effectiveness[['Revenue_Per_Customer', 'Campaign_Cost_Per_Customer', 'ROI']])

# Age group analysis
age_response = campaign_data.groupby('age_group')['campaign_response'].mean().sort_values(ascending=False)
print("\nResponse Rate by Age Group:")
print(age_response)

# Chi-square test for campaign type vs response:
# is the response rate independent of which campaign a customer received?
contingency_table = pd.crosstab(campaign_data['campaign_type'], campaign_data['campaign_response'])
chi2, p_val, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square test (Campaign Type vs Response):")
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p_val:.4f}")
print(f"Significant association: {'Yes' if p_val < 0.05 else 'No'}")

# Exercise 3: Financial data analysis
financial_data = pd.DataFrame({
    'transaction_id': range(1, 5001),
    'account_type': np.random.choice(['Checking', 'Savings', 'Credit', 'Investment'], 5000),
    'transaction_amount': np.random.normal(0, 500, 5000),
    'transaction_type': np.random.choice(['Deposit', 'Withdrawal', 'Transfer', 'Payment'], 5000),
    'customer_age': np.random.normal(45, 15, 5000).clip(18, 80).astype(int),
    'account_balance': np.random.lognormal(8, 1, 5000),
    # One transaction every 2 hours. Lowercase 'h' is the current alias;
    # uppercase 'H' is deprecated in recent pandas.
    'transaction_date': pd.date_range('2023-01-01', periods=5000, freq='2h'),
    # Roughly 2% of transactions are flagged as fraud
    'fraud_indicator': np.random.choice([0, 1], 5000, p=[0.98, 0.02])
})

print("\n\nFinancial Transaction Analysis")
print("=" * 35)

# Fraud detection analysis: compare flagged vs unflagged transactions
fraud_analysis = financial_data.groupby('fraud_indicator').agg({
    'transaction_amount': ['mean', 'std', 'min', 'max'],
    'account_balance': 'mean',
    'customer_age': 'mean'
}).round(2)

print("Fraud vs Normal Transactions:")
print(fraud_analysis)

# Suspicious pattern detection
print("\nSuspicious Pattern Analysis:")

# Large transactions: more than 3 standard deviations away from the MEAN amount.
# The mean is subtracted first so the check matches its printed description
# even when the average amount is not exactly zero.
transaction_amounts = financial_data['transaction_amount']
amount_deviation = (transaction_amounts - transaction_amounts.mean()).abs()
large_transactions = financial_data[amount_deviation > transaction_amounts.std() * 3]
print(f"Transactions >3 std dev from mean: {len(large_transactions)}")

# Off-hours transactions (late night/early morning).
# The window is 10pm-6am, i.e. hours 22, 23, 0..5, so the upper test must be
# >= 22. With `> 22` the 10pm hour is excluded, and because timestamps here
# fall on even hours only, no late-night transaction would ever match.
financial_data['hour'] = financial_data['transaction_date'].dt.hour
off_hours = financial_data[(financial_data['hour'] < 6) | (financial_data['hour'] >= 22)]
print(f"Off-hours transactions (10pm-6am): {len(off_hours)}")
# Risk scoring
def calculate_risk_score(row):
    """Return a rule-based risk score for one transaction.

    Points are added for each rule the transaction triggers:
      +3  absolute amount exceeds half of the account balance
      +2  transaction hour is in the off-hours window 10pm-6am
          (hour >= 22 or hour < 6)
      +2  absolute amount exceeds 2000
      +1  customer is younger than 25 or older than 70

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with the keys
            'transaction_amount', 'account_balance', 'hour' and
            'customer_age'.

    Returns:
        int: The total score, between 0 and 8.
    """
    score = 0
    amount = abs(row['transaction_amount'])

    # Large amount relative to account balance
    if amount > row['account_balance'] * 0.5:
        score += 3

    # Off-hours transaction. The window starts AT 10pm, so hour 22 must count;
    # the previous `> 22` test skipped the whole 10pm hour.
    if row['hour'] < 6 or row['hour'] >= 22:
        score += 2

    # Large absolute amount
    if amount > 2000:
        score += 2

    # Young or very old customers
    if row['customer_age'] < 25 or row['customer_age'] > 70:
        score += 1

    return score

# Score every transaction, one row at a time, with the rule-based helper
financial_data['risk_score'] = financial_data.apply(calculate_risk_score, axis=1)

# Risk distribution: number of transactions per score, lowest score first
risk_distribution = financial_data['risk_score'].value_counts().sort_index()
print("\nRisk Score Distribution:")
print(risk_distribution)

# Transactions scoring 5 or more are treated as high risk
is_high_risk = financial_data['risk_score'] >= 5
high_risk_transactions = financial_data.loc[is_high_risk]
print(f"High-risk transactions (score ≥5): {len(high_risk_transactions)}")

# Account type analysis: amounts, balances, fraud rate and risk per account type
account_analysis = (
    financial_data
    .groupby('account_type')
    .agg({
        'transaction_amount': ['mean', 'std'],
        'account_balance': 'mean',
        'fraud_indicator': 'mean',
        'risk_score': 'mean',
    })
    .round(3)
)

print("\nAccount Type Analysis:")
print(account_analysis)

# Closing recap of what the three exercises covered
print("\nPractice exercises completed:")
for summary_line in (
    "1. E-commerce sales analysis: Category performance, customer segmentation, statistical testing",
    "2. Marketing campaign effectiveness: ROI analysis, response rates, demographic insights",
    "3. Financial transaction analysis: Fraud detection, risk scoring, pattern recognition",
):
    print(summary_line)