# Section 1: Battery Lifespan Analysis

In this notebook, we'll implement Maximum Likelihood Estimation for battery lifespan data. We'll explore how different modeling assumptions (random failures vs. aging) lead to different distributions.

## Setup: Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import minimize

# Seed NumPy's global random number generator so reruns draw the same values
np.random.seed(42)

# Notebook-wide matplotlib defaults: figure size (inches) and base font size
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

## Part 1: Load and Explore the Data

We have battery failure time data from two brands:

In [None]:
# Observed lifespans in months; the Brand A values come from the exercise handout
brand_a_data = np.array([18.2, 24.5, 15.3, 22.1, 28.9, 19.7, 26.3, 21.4])

# Brand B has a smaller sample
brand_b_data = np.array([31.2, 28.7, 35.4, 29.8, 33.1, 30.9])

# Print the same three summary statistics for each brand. The leading "\n" on
# the second heading puts a blank line between the two reports.
# np.std is called with its defaults here (no ddof argument is passed).
for heading, lifespans in (("Brand A:", brand_a_data), ("\nBrand B:", brand_b_data)):
    print(heading)
    print(f"  Sample size: {len(lifespans)}")
    print(f"  Mean lifespan: {np.mean(lifespans):.2f} months")
    print(f"  Std deviation: {np.std(lifespans):.2f} months")

### Visualize the Data

In [None]:
# Create histograms for both brands, side by side in a single 1x2 figure.
# NOTE: every `_____` below is a fill-in-the-blank placeholder for TODO 1 and
# must be replaced before this cell can run.
plt.figure(figsize=(12, 5))

# Left plot: Brand A
plt.subplot(1, 2, 1)
# density=True normalises the bars to a density rather than raw counts,
# which matches the 'Density' y-axis label below.
plt.hist(brand_a_data, bins=6, density=True, alpha=0.7, color='skyblue', edgecolor='black')
plt.xlabel('Lifespan (months)')
plt.ylabel('Density')
plt.title('Brand A Battery Lifespans')
plt.grid(True, alpha=0.3)

# Right plot: Brand B
plt.subplot(1, 2, 2)
# TODO 1: Create a histogram for Brand B with appropriate parameters
# Hint: Use bins=6, density=True, alpha=0.7, and choose a different color
plt.hist(_____, bins=_____, density=_____, alpha=_____, 
         color='_____', edgecolor='_____')
plt.xlabel('Lifespan (months)')
plt.ylabel('Density')
plt.title('Brand B Battery Lifespans')
plt.grid(True, alpha=0.3)

# Adjust subplot spacing, then render the figure
plt.tight_layout()
plt.show()

## Part 2: Choose Your Model

Different assumptions about failure mechanisms lead to different models:

In [None]:
# TODO 2: Change this to "gamma" to model aging/wear failures instead of random failures
model_type = "exponential"  # Options: "exponential" or "gamma"

# Assumptions printed for each supported model. Any other value of
# model_type prints only the "Selected model" line and the blank line.
model_assumptions = {
    "exponential": (
        "Exponential distribution assumes:",
        "- Constant failure rate (memoryless)",
        "- Random failures that can happen anytime",
        "- Age doesn't affect failure probability",
    ),
    "gamma": (
        "Gamma distribution (shape=2) assumes:",
        "- Increasing failure rate with age",
        "- Wear-and-tear failures",
        "- Older batteries more likely to fail",
    ),
}

print(f"Selected model: {model_type}")
print()
for assumption_line in model_assumptions.get(model_type, ()):
    print(assumption_line)

## Part 3: Calculate the MLE Analytically

In [None]:
def calculate_mle(data, model="exponential"):
    """
    Calculate the closed-form MLE of the rate parameter for the chosen distribution.
    
    Parameters:
    data: array of observed lifetimes (must contain at least one value)
    model: "exponential" or "gamma" (gamma uses a fixed shape k=2)
    
    Returns:
    param: MLE estimate of rate parameter
    
    Raises:
    ValueError: if data is empty, or if model is not "exponential" or "gamma"
    
    Note: the `_____` placeholders are exercise blanks (TODO 3 and TODO 4).
    A branch cannot run until its blank has been filled in.
    """
    n = len(data)
    if n == 0:
        # Both formulas divide by sum(t_i); with no observations there is
        # nothing to estimate, so fail with a clear message instead.
        raise ValueError("data must contain at least one observation")
    sum_data = np.sum(data)
    
    if model == "exponential":
        # TODO 3: Calculate lambda_hat for exponential distribution
        # For exponential: f(t|lambda) = lambda*e^(-lambda*t)
        # MLE formula: lambda_hat = n / sum(t_i)
        param = _____  # Replace with: n divided by sum_data
        
    elif model == "gamma":
        # For gamma with fixed shape k=2: f(t|lambda) = lambda^2*t*e^(-lambda*t)
        # TODO 4: Calculate lambda_hat for gamma(k=2, lambda)
        # MLE formula: lambda_hat = 2n / sum(t_i)
        param = _____  # Replace with: 2*n divided by sum_data
    
    else:
        # Without this branch an unrecognised model (e.g. a typo) would reach
        # `return param` with `param` never assigned.
        raise ValueError(
            f"Unknown model {model!r}; expected 'exponential' or 'gamma'"
        )
    
    return param

# Fit the selected model to each brand's data
param_a = calculate_mle(brand_a_data, model_type)
param_b = calculate_mle(brand_b_data, model_type)

# The mean lifespan is k/lambda: k=1 for the exponential model and k=2 for the
# gamma model. Any model_type other than "exponential" is reported as gamma.
if model_type == "exponential":
    rate_suffix, shape_k = "failures/month", 1
else:  # gamma
    rate_suffix, shape_k = "(rate parameter)", 2

print(f"Brand A: lambda = {param_a:.4f} {rate_suffix}")
print(f"Brand A: mean lifespan = {shape_k/param_a:.2f} months")
print()
print(f"Brand B: lambda = {param_b:.4f} {rate_suffix}")
print(f"Brand B: mean lifespan = {shape_k/param_b:.2f} months")

## Part 3a: Visualize the Likelihood Function

In [None]:
def log_likelihood(param, data, model="exponential"):
    """
    Calculate the log-likelihood of a rate parameter for the chosen distribution.
    
    Parameters:
    param: candidate rate parameter (lambda)
    data: array of observed lifetimes
    model: "exponential" or "gamma" (gamma uses a fixed shape k=2)
    
    Returns:
    The log-likelihood of `param` given `data`, or -inf when param <= 0.
    
    Raises:
    ValueError: if model is not "exponential" or "gamma"
    """
    # Reject unknown names up front so a misspelled model is not silently
    # evaluated as the gamma model.
    if model not in ("exponential", "gamma"):
        raise ValueError(
            f"Unknown model {model!r}; expected 'exponential' or 'gamma'"
        )
    # Non-positive rates are outside the parameter space
    if param <= 0:
        return -np.inf
    n = len(data)
    sum_data = np.sum(data)
    
    if model == "exponential":
        # Log-likelihood: n*log(lambda) - lambda*sum(data)
        return n * np.log(param) - param * sum_data
    
    # Gamma with shape k=2, f(t|lambda) = lambda^2*t*e^(-lambda*t):
    # log L = 2n*log(lambda) + sum(log(t_i)) - lambda*sum(t_i)
    return 2 * n * np.log(param) + np.sum(np.log(data)) - param * sum_data

# Create range of parameter values to test: 200 evenly spaced candidate rates
# between 0.01 and 0.15
param_range = np.linspace(0.01, 0.15, 200)

# Calculate log-likelihood for each parameter value (Brand A data, selected model)
log_likes_a = [log_likelihood(p, brand_a_data, model_type) for p in param_range]

plt.figure(figsize=(10, 6))
plt.plot(param_range, log_likes_a, 'b-', linewidth=2, label='Log-Likelihood')

# TODO 5: Add a vertical line at the MLE value
# Use plt.axvline() with the MLE parameter value
# NOTE: both `_____` placeholders below must be replaced before this cell can run.
plt.axvline(_____, color='red', linestyle='--', 
           linewidth=2, label=f'MLE = {_____:.4f}')

plt.xlabel('Parameter (rate)')
plt.ylabel('Log-Likelihood')
plt.title(f'Log-Likelihood Function for Brand A ({model_type} model)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Part 3b: Find MLE using Numerical Optimization

In [None]:
def neg_log_likelihood(param, data, model="exponential"):
    """
    Negative log-likelihood for minimization.

    Parameters:
    param: candidate rate parameter
    data: array of observed lifetimes
    model: model name, intended to be forwarded to log_likelihood

    Returns:
    np.inf when param <= 0; otherwise the value of the TODO 6 expression,
    which is meant to be -log_likelihood(param, data, model).
    """
    # Non-positive rates get an infinite cost
    if param <= 0:
        return np.inf
    # TODO 6: Return the negative of the log_likelihood
    # Why negative? Because scipy.optimize.minimize finds minima, not maxima
    # NOTE: `_____` is an exercise placeholder and must be replaced before use.
    return _____  # Replace with: -log_likelihood(param, data, model)

# Use scipy.optimize.minimize to find MLE.
# x0 is the starting guess; the bounds keep the search on positive rates.
result = minimize(neg_log_likelihood, x0=0.05, args=(brand_a_data, model_type),
                  bounds=[(0.001, 1)])

# Report a failed optimisation instead of silently trusting result.x
if not result.success:
    print(f"Warning: optimizer did not converge: {result.message}")
param_mle_numerical = result.x[0]

print(f"Analytical MLE: {param_a:.4f}")
print(f"Numerical MLE (scipy.optimize): {param_mle_numerical:.4f}")
# Only claim agreement when the two estimates really agree (relative
# tolerance 1e-3), rather than printing the claim unconditionally.
if np.isclose(param_mle_numerical, param_a, rtol=1e-3):
    print("They match! Both methods give the same answer.")
else:
    print("Warning: the analytical and numerical estimates differ.")
    print("Check calculate_mle and neg_log_likelihood for mistakes.")
print()
print("Note: We use negative log-likelihood because scipy.optimize.minimize")
print("finds minima, not maxima. Minimizing -log(L) maximizes log(L).")

## Part 4: Compare Models to Data

In [None]:
# Create time points for plotting the models: 200 values from 0.01 to 40 months
t_range = np.linspace(0.01, 40, 200)  # Starts just above 0; the PDFs below involve no division by t

# Calculate PDFs based on chosen model, using the fitted rates param_a / param_b
if model_type == "exponential":
    # Exponential PDF: f(t|lambda) = lambda*e^(-lambda*t)
    pdf_a = param_a * np.exp(-param_a * t_range)
    pdf_b = param_b * np.exp(-param_b * t_range)
else:  # gamma with shape=2 (any model_type other than "exponential" lands here)
    # Gamma(k=2) PDF: f(t|lambda) = lambda^2*t*e^(-lambda*t)
    pdf_a = (param_a**2) * t_range * np.exp(-param_a * t_range)
    pdf_b = (param_b**2) * t_range * np.exp(-param_b * t_range)

plt.figure(figsize=(12, 5))

# Brand A: density histogram of the data with the fitted PDF drawn on top
plt.subplot(1, 2, 1)
plt.hist(brand_a_data, bins=6, density=True, alpha=0.5, color='skyblue', 
         edgecolor='black', label='Data')
plt.plot(t_range, pdf_a, 'b-', linewidth=2, 
         label=f'{model_type.capitalize()} model')
plt.xlabel('Lifespan (months)')
plt.ylabel('Density')
plt.title('Brand A: Data vs Model')
plt.legend()
plt.grid(True, alpha=0.3)

# Brand B: same layout as Brand A
plt.subplot(1, 2, 2)
plt.hist(brand_b_data, bins=6, density=True, alpha=0.5, color='lightcoral', 
         edgecolor='black', label='Data')
# TODO 7: Plot the model PDF for Brand B
# NOTE: `_____` is an exercise placeholder and must be replaced before this cell can run.
plt.plot(t_range, _____, 'r-', linewidth=2, 
         label=f'{model_type.capitalize()} model')
plt.xlabel('Lifespan (months)')
plt.ylabel('Density')
plt.title('Brand B: Data vs Model')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Part 5: Make the Decision

In [None]:
# Prices from the problem
price_a = 45  # dollars
price_b = 65  # dollars

# Calculate mean lifespans based on model: mean = k/lambda, with k=1 for the
# exponential model and k=2 for the gamma model
if model_type == "exponential":
    mean_a = 1 / param_a
    mean_b = 1 / param_b
else:  # gamma with shape=2 (any model_type other than "exponential" lands here)
    mean_a = 2 / param_a
    mean_b = 2 / param_b

# Calculate cost per month for each brand: purchase price / mean lifespan in months
cost_per_month_a = price_a / mean_a
# TODO 8: Calculate the cost per month for Brand B
# NOTE: `_____` is an exercise placeholder and must be replaced before this cell can run.
cost_per_month_b = _____  # Replace with: price_b divided by mean_b

print(f"Using {model_type} model:")
print()
print("Cost Analysis:")
print(f"Brand A: ${price_a} for {mean_a:.1f} months = ${cost_per_month_a:.2f}/month")
print(f"Brand B: ${price_b} for {mean_b:.1f} months = ${cost_per_month_b:.2f}/month")
print()

# Make a recommendation: the brand with the lower cost per month wins.
# The comparison is strict, so an exact tie falls to the else branch (Brand B).
if cost_per_month_a < cost_per_month_b:
    savings = cost_per_month_b - cost_per_month_a
    print(f"Recommendation: Choose Brand A (saves ${savings:.2f}/month)")
else:
    savings = cost_per_month_a - cost_per_month_b
    print(f"Recommendation: Choose Brand B (saves ${savings:.2f}/month)")

## Part 6: Model Comparison

Let's see whether our decision changes under different modeling assumptions:

In [None]:
# Fit each candidate model to both brands and record the cost comparison
models = ["exponential", "gamma"]
results = {}

for model in models:
    param_a_model = calculate_mle(brand_a_data, model)
    param_b_model = calculate_mle(brand_b_data, model)

    # Mean lifespan is k/lambda: k=1 for exponential, k=2 for gamma
    shape_k = 1 if model == "exponential" else 2
    mean_a_model = shape_k / param_a_model
    mean_b_model = shape_k / param_b_model

    # Cost per month of use = purchase price / mean lifespan
    cost_a_model = price_a / mean_a_model
    cost_b_model = price_b / mean_b_model

    results[model] = dict(
        mean_a=mean_a_model,
        mean_b=mean_b_model,
        cost_a=cost_a_model,
        cost_b=cost_b_model,
        # Strict comparison: an exact tie goes to Brand B
        winner='A' if cost_a_model < cost_b_model else 'B',
    )

divider = "="*60

print(divider)
print("MODEL COMPARISON")
print(divider)
for model in models:
    r = results[model]
    assumption = 'random failures' if model == 'exponential' else 'aging/wear'
    print(f"\n{model.upper()} MODEL:")
    print(f"  Assumes: {assumption}")
    print(f"  Brand A: {r['mean_a']:.1f} months, ${r['cost_a']:.2f}/month")
    print(f"  Brand B: {r['mean_b']:.1f} months, ${r['cost_b']:.2f}/month")
    print(f"  Better value: Brand {r['winner']}")

print("\n" + divider)
print("KEY INSIGHT:")
print("The decision is the same regardless of model!")
print("But the interpretation of WHY batteries fail differs.")
print(divider)

## Model Validation (Demonstration)

In [None]:
# Re-seed so the synthetic samples below are the same on every run
np.random.seed(123)

# Pick the "true" rates for the selected model; any model_type other than
# "exponential" uses the gamma(2, lambda) settings.
use_exponential = model_type == "exponential"
true_param_a, true_param_b = (0.045, 0.032) if use_exponential else (0.09, 0.064)

# NumPy's samplers take a scale argument, so pass 1/rate.
# Brand A is drawn before Brand B, 50 lifetimes each.
scale_a = 1/true_param_a
scale_b = 1/true_param_b
if use_exponential:
    test_data_a = np.random.exponential(scale_a, 50)
    test_data_b = np.random.exponential(scale_b, 50)
else:  # gamma
    test_data_a = np.random.gamma(2, scale_a, 50)
    test_data_b = np.random.gamma(2, scale_b, 50)

# Estimate the rates from the synthetic samples with the same MLE routine
test_param_a = calculate_mle(test_data_a, model_type)
test_param_b = calculate_mle(test_data_b, model_type)

print(f"Validation on synthetic {model_type} test data:")
for brand, true_rate, estimate in (
    ("A", true_param_a, test_param_a),
    ("B", true_param_b, test_param_b),
):
    print(f"Brand {brand} - True parameter: {true_rate:.4f}, Our estimate: {estimate:.4f}")
print()
print("Our MLE method successfully recovers the true parameters!")

## Summary and Learning Objectives Review

In [None]:
# Horizontal rule reused by all three report sections
banner = "="*60

# --- Section 1: decision summary, built from the values computed in Part 5 ---
assumption_text = 'Random failures' if model_type == 'exponential' else 'Aging/wear failures'
# Strict comparison: an exact tie reports Brand B
if cost_per_month_a < cost_per_month_b:
    verdict = "Brand A is the better value!"
else:
    verdict = "Brand B is the better value!"

print(banner)
print("BATTERY DECISION SUMMARY")
print(banner)
print(f"Model used: {model_type}")
print(f"Assumption: {assumption_text}")
print()
print(f"Brand A: mean life = {mean_a:.1f} months")
print(f"Brand B: mean life = {mean_b:.1f} months")
print()
print("Cost per month of use:")
print(f"  Brand A: ${cost_per_month_a:.2f}")
print(f"  Brand B: ${cost_per_month_b:.2f}")
print()
print(verdict)
print(banner)

# --- Section 2: learning-objectives checklist ---
objectives = [
    "✓ Identified appropriate distribution for failure times",
    "✓ Wrote down and maximized the likelihood function",
    "✓ Understood parameters and their interpretation",
    "✓ Used calculus to derive MLE analytically",
    "✓ Used scipy.optimize.minimize for numerical MLE",
    "✓ Visualized likelihood function across parameter range",
    "✓ Understood PDF vs likelihood distinction",
    "✓ Explained why we minimize negative log-likelihood",
    "✓ Made data-driven decision using MLE results",
    "✓ Explored how modeling assumptions affect analysis",
]

print("\n" + banner)
print("LEARNING OBJECTIVES ACHIEVED")
print(banner)
print(*objectives, sep="\n")
print(banner)

# --- Section 3: open questions; empty strings print as blank lines ---
food_for_thought = [
    "What distributions might you use for these phenomena?",
    "",
    "• Number of emails received per hour",
    "• Heights of students in a classroom",
    "• Time between earthquakes",
    "• Number of typos in a document",
    "• Stock price changes",
    "• Rainfall amounts on rainy days",
    "• Customer satisfaction scores (1-5 stars)",
    "• Proportion of defective items in a batch",
    "",
    "Each phenomenon has characteristics that make certain",
    "distributions more appropriate than others!",
]

print("\n" + banner)
print("FOOD FOR THOUGHT: Other Distributions for Modeling")
print(banner)
print(*food_for_thought, sep="\n")
print(banner)