In [None]:
# Week 5: Supervised Learning - Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including ones that may point at real problems; consider narrowing it to
# specific warning categories.
warnings.filterwarnings('ignore')

# Set up visualization style.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# earlier releases only know 'seaborn'. Use whichever name is installed and
# keep matplotlib's default style if neither is, so that a style name alone
# cannot abort the setup cell.
available_styles = set(plt.style.available)
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in available_styles),
    None,
)
if plot_style is not None:
    plt.style.use(plot_style)
sns.set_palette("husl")

print("‚úÖ Week 5 Regression Environment Ready!")

# Load the cleaned dataset (path is relative to the notebook's working directory)
df = pd.read_csv('titanic_cleaned.csv')

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nNumerical columns available:")
print(df.select_dtypes(include=[np.number]).columns.tolist())

In [None]:
print("=== DATASET OVERVIEW FOR REGRESSION ANALYSIS ===")

# Take the numeric-only view once and reuse it for both summaries below.
numeric_view = df.select_dtypes(include=[np.number])

# Preview of the full table (all columns, first five rows)
print("First 5 rows:")
display(df.head())

# Summary statistics for the numeric columns only
print("\nNumerical features statistics:")
display(numeric_view.describe())

# Per-column count of missing values among the numeric columns
print("\nMissing values in numerical columns:")
print(numeric_view.isnull().sum())

In [None]:
print("=== DEFINING REGRESSION PROBLEM ===")

# Regression options considered for the Titanic dataset:
#   1. Predict Fare (continuous value)
#   2. Predict Age  (continuous value)
# 'Survived' is binary (0 or 1), so it is left for classification (Week 6).
# This notebook predicts FARE from other passenger characteristics.

# List the numeric columns that look continuous enough to be a regression target
print("Potential target variables for regression:")
numerical_features = df.select_dtypes(include=[np.number]).columns

for feature in numerical_features:
    column = df[feature]
    # Columns with 10 or fewer distinct values are treated as discrete and skipped
    if column.nunique() <= 10:
        continue
    print(f"- {feature}: {column.min():.1f} to {column.max():.1f} (mean: {column.mean():.2f})")

# Target used by every later cell
target_variable = 'Fare'
print(f"\nüéØ SELECTED TARGET VARIABLE: {target_variable}")
print(f"Description: Predicting passenger fare based on their characteristics")

In [None]:
print("=== PREPARING FEATURES AND TARGET ===")

# Predictors for the fare model (the original author notes these came from the
# Week 4 correlation work). 'Survived' is flagged as a questionable predictor
# but is kept as an experiment.
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Survived']

# Feature matrix (X) and target vector (y)
X = df[features]
y = df[target_variable]

print(f"Features (X): {list(X.columns)}")
print(f"Target (y): {target_variable}")
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Count missing cells once; the totals drive both the report and the imputation decision
missing_in_features = X.isnull().sum().sum()
missing_in_target = y.isnull().sum()
print(f"\nMissing values in features: {missing_in_features}")
print(f"Missing values in target: {missing_in_target}")

# Median-impute feature gaps if there are any. Missing target values are only reported.
# NOTE(review): these medians are taken over every row of X, i.e. before the
# train/test split performed in a later cell.
if missing_in_features > 0:
    X = X.fillna(X.median())
    print("Filled missing values with median")

print("\nFeature statistics:")
display(X.describe())

In [None]:
print("=== TRAIN-TEST SPLIT ===")

# Hold out 20% of the rows for testing (80% train); the fixed seed keeps the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train} samples")
print(f"Testing set size: {n_test} samples")
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

# Short rationale printed for the reader
print("\nWhy train-test split is important:")
for reason in (
    "‚úÖ Training set: Used to train the model",
    "‚úÖ Testing set: Used to evaluate model performance on unseen data",
    "‚úÖ Prevents overfitting: Ensures model generalizes to new data",
):
    print(reason)

In [None]:
print("=== IMPLEMENTING LINEAR REGRESSION ===")

# Fit a linear regression on the training split (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

print("‚úÖ Linear Regression model trained successfully!")
print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_:.2f}")

# One line per predictor: its fitted coefficient, read as dollars per unit of the feature
print("\nüìä Coefficient Interpretation:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef:.2f} (each unit increase changes fare by ${coef:.2f})")

print(f"\nBase fare (intercept): ${model.intercept_:.2f}")

# Predictions on the held-out rows; reused by the evaluation and plotting cells
y_pred = model.predict(X_test)
print(f"\nPredictions made on test set: {len(y_pred)} predictions")

In [None]:
print("=== LINEAR REGRESSION EQUATION ===")

# Build the fitted equation as text: the intercept followed by one
# "(coefficient x feature)" term per predictor.
equation = f"Fare = {model.intercept_:.2f}"
for feature, coef in zip(features, model.coef_):
    equation += f" + ({coef:.2f} √ó {feature})"

print("Regression Equation:")
print(equation)

print("\nüîç Example Prediction:")
# Select the first test row as a one-row DataFrame (note the double brackets)
# so predict() receives the same column names the model was fitted with.
# Wrapping the row Series in a list drops those names, which scikit-learn
# reports with a feature-name warning.
example_row = X_test.iloc[[0]]
example_passenger = example_row.iloc[0]   # same row as a Series, for printing by feature name
actual_fare = y_test.iloc[0]
predicted_fare = model.predict(example_row)[0]

print(f"Example passenger features:")
for feature in features:
    print(f"  {feature}: {example_passenger[feature]}")
print(f"Actual fare: ${actual_fare:.2f}")
print(f"Predicted fare: ${predicted_fare:.2f}")
print(f"Prediction error: ${abs(actual_fare - predicted_fare):.2f}")

In [None]:
print("=== MODEL EVALUATION METRICS ===")

# Test-set error of the fitted model: MAE, and RMSE as the square root of MSE
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("üìà Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")

# Plain-language reading of the two metrics
print(f"\nüîç Metric Interpretation:")
print(f"MAE (${mae:.2f}): On average, predictions are ${mae:.2f} away from actual fares")
print(f"RMSE (${rmse:.2f}): Standard deviation of prediction errors (penalizes large errors more)")

# Baseline: predict the training-set mean fare for every test passenger
mean_fare = y_train.mean()
baseline_pred = [mean_fare] * len(y_test)
baseline_mae = mean_absolute_error(y_test, baseline_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))

print(f"\nüìä Baseline Comparison (predicting mean fare ${mean_fare:.2f} for everyone):")
print(f"Baseline MAE: ${baseline_mae:.2f}")
print(f"Baseline RMSE: ${baseline_rmse:.2f}")

# The verdict is based on MAE only; a tie counts as "worse than baseline"
beats_baseline = mae < baseline_mae
if beats_baseline:
    improvement = 100 * (baseline_mae - mae) / baseline_mae
    print(f"‚úÖ Our model improves over baseline by {improvement:.1f}%")
else:
    print("‚ùå Model performs worse than baseline - needs improvement")

In [None]:
print("=== MODEL PERFORMANCE VISUALIZATION ===")

# 2x2 grid of diagnostic panels for the fitted model
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Actual vs Predicted (points on the dashed diagonal are perfect predictions)
axes[0, 0].scatter(y_test, y_pred, alpha=0.6, color='blue')
axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 0].set_xlabel('Actual Fare ($)')
axes[0, 0].set_ylabel('Predicted Fare ($)')
axes[0, 0].set_title('Actual vs Predicted Fares\n(Perfect prediction = red line)')
axes[0, 0].grid(alpha=0.3)

# Plot 2: Residuals (actual minus predicted) against the predicted value
residuals = y_test - y_pred
axes[0, 1].scatter(y_pred, residuals, alpha=0.6, color='green')
axes[0, 1].axhline(y=0, color='red', linestyle='--')
axes[0, 1].set_xlabel('Predicted Fare ($)')
axes[0, 1].set_ylabel('Residuals (Actual - Predicted)')
axes[0, 1].set_title('Residuals Plot\n(Random scatter around zero = good model)')
axes[0, 1].grid(alpha=0.3)

# Plot 3: Distribution of the residuals
axes[1, 0].hist(residuals, bins=30, color='orange', alpha=0.7, edgecolor='black')
axes[1, 0].axvline(x=0, color='red', linestyle='--', linewidth=2)
axes[1, 0].set_xlabel('Prediction Error ($)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Distribution of Prediction Errors\n(Centered at zero = good model)')
axes[1, 0].grid(alpha=0.3)

# Plot 4: Feature importance
# A raw coefficient is expressed per unit of its own feature, so its magnitude
# depends on that feature's scale and raw coefficients cannot be ranked against
# each other. Rank by |coefficient x training-set standard deviation| instead:
# the fare change associated with a one-standard-deviation move in the feature.
# The raw value stays available in the 'coefficient' column.
feature_std = X_train[features].std().to_numpy()
feature_importance = pd.DataFrame({
    'feature': features,
    'coefficient': model.coef_,
    'abs_importance': np.abs(model.coef_ * feature_std)
}).sort_values('abs_importance', ascending=False)

axes[1, 1].barh(feature_importance['feature'], feature_importance['abs_importance'], color='purple')
# barh draws the first row at the bottom; flip the axis so the top-ranked feature is on top
axes[1, 1].invert_yaxis()
axes[1, 1].set_xlabel('|Coefficient x Std. Dev.| ($ per 1 SD change)')
axes[1, 1].set_title('Feature Importance in Predicting Fare')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("‚úÖ Model performance visualized!")

In [None]:
print("=== MODEL INTERPRETATION AND INSIGHTS ===")
print("üîç Key Insights from Linear Regression Model:")

# 1. Direction and size of each fitted coefficient
print("\n1. FEATURE IMPACT ON FARE:")
for feature, coef in zip(features, model.coef_):
    # Only a strictly positive coefficient is reported as an increase
    if coef > 0:
        impact = "increases"
    else:
        impact = "decreases"
    print(f"   ‚Ä¢ {feature}: {impact} fare by ${abs(coef):.2f} per unit")

# 2. Intercept
print(f"\n2. BASE FARE: ${model.intercept_:.2f} (fare for reference passenger)")

# 3. Error metrics computed in the evaluation cell
print(f"\n3. MODEL PERFORMANCE:")
print(f"   ‚Ä¢ Average prediction error: ${mae:.2f}")
print(f"   ‚Ä¢ Typical prediction error: ${rmse:.2f}")

# 4. Fixed narrative text (not derived from the fitted values)
print(f"\n4. PRACTICAL INTERPRETATION:")
print("   Example: A 1st class passenger (Pclass=1) pays significantly more")
print("   than a 3rd class passenger (Pclass=3), holding other factors constant")

# 5. R-squared on the held-out test split
r_squared = model.score(X_test, y_test)
explained_pct = r_squared * 100
print(f"\n5. MODEL EXPLANATORY POWER:")
print(f"   ‚Ä¢ R-squared: {r_squared:.3f}")
print(f"   ‚Ä¢ This means the model explains {explained_pct:.1f}% of fare variance")

In [None]:
print("=== COMPARING DIFFERENT FEATURE SETS ===")

# Candidate predictor subsets, from smallest to largest
feature_sets = {
    'Basic Features': ['Pclass', 'Age'],
    'With Family': ['Pclass', 'Age', 'SibSp', 'Parch'],
    'All Features': ['Pclass', 'Age', 'SibSp', 'Parch', 'Survived']
}

results = []

for set_name, feature_list in feature_sets.items():
    # Split first, using the same test_size and seed for every subset so all
    # candidates are scored on the same rows.
    X_new = df[feature_list]
    X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
        X_new, y, test_size=0.2, random_state=42
    )

    # Learn the imputation medians from the training rows only and apply them
    # to both splits. Computing them over the whole frame would let test rows
    # influence the values the model is trained on.
    train_medians = X_train_new.median()
    X_train_new = X_train_new.fillna(train_medians)
    X_test_new = X_test_new.fillna(train_medians)

    model_new = LinearRegression()
    model_new.fit(X_train_new, y_train_new)
    y_pred_new = model_new.predict(X_test_new)

    # Same three metrics as the main model, all measured on the test split
    mae_new = mean_absolute_error(y_test_new, y_pred_new)
    rmse_new = np.sqrt(mean_squared_error(y_test_new, y_pred_new))
    r2_new = model_new.score(X_test_new, y_test_new)

    results.append({
        'Feature Set': set_name,
        'Features': ', '.join(feature_list),
        'MAE': mae_new,
        'RMSE': rmse_new,
        'R-squared': r2_new
    })

# Display comparison (one row per feature set)
results_df = pd.DataFrame(results)
print("\nüìä Feature Set Comparison:")
display(results_df)

# Pick the feature set with the lowest MAE.
# NOTE(review): the winner is chosen using test-split scores, so the numbers
# reported for it are optimistic; a validation split or cross-validation
# would give an unbiased comparison.
best_model_idx = results_df['MAE'].idxmin()
print(f"\nüéØ BEST PERFORMING FEATURE SET: {results_df.loc[best_model_idx, 'Feature Set']}")
print(f"   MAE: ${results_df.loc[best_model_idx, 'MAE']:.2f}")
print(f"   RMSE: ${results_df.loc[best_model_idx, 'RMSE']:.2f}")
print(f"   R-squared: {results_df.loc[best_model_idx, 'R-squared']:.3f}")

In [None]:
# Values reused in several lines of the report
banner = "=" * 70
test_r2 = model.score(X_test, y_test)               # R-squared on the test split
top_feature = feature_importance.iloc[0]['feature']  # first row of the sorted importance table

print(banner)
print("üìä WEEK 5 ASSIGNMENT REPORT: REGRESSION ANALYSIS")
print(banner)

print(f"\nüéØ REGRESSION PROBLEM:")
print(f"Target Variable: {target_variable}")
print(f"Features Used: {', '.join(features)}")
print(f"Dataset: Titanic ({df.shape[0]} passengers)")

print(f"\nüìà MODEL PERFORMANCE SUMMARY:")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"R-squared: {test_r2:.3f}")

print(f"\nüîç KEY FINDINGS:")
print(f"1. Most important feature: {top_feature}")
print(f"2. Average prediction error: ${mae:.2f}")
print(f"3. Model explains {test_r2*100:.1f}% of fare variation")

# The remaining sections are fixed text; they are not computed from the model
print(f"\nüí° BUSINESS INSIGHTS:")
for insight in (
    "‚Ä¢ Passenger class (Pclass) is the strongest predictor of fare",
    "‚Ä¢ Family size (SibSp + Parch) has moderate impact on fare",
    "‚Ä¢ Age has relatively small impact on fare pricing",
):
    print(insight)

print(f"\nüöÄ RECOMMENDATIONS FOR IMPROVEMENT:")
for recommendation in (
    "1. Collect more relevant features (e.g., cabin location, ticket type)",
    "2. Try polynomial features or feature engineering",
    "3. Experiment with other regression algorithms",
    "4. Remove potentially problematic features (like Survived)",
):
    print(recommendation)

print(f"\nüìö LEARNING OUTCOMES:")
for outcome in (
    "‚úÖ Implemented first supervised learning model",
    "‚úÖ Understood train-test split methodology",
    "‚úÖ Evaluated model using MAE and RMSE metrics",
    "‚úÖ Interpreted linear regression coefficients",
):
    print(outcome)

In [None]:
# Persist the fitted model and its test-set predictions to the working directory
import joblib

# Serialize the fitted estimator
joblib.dump(model, 'linear_regression_fare_model.pkl')

# Actual vs predicted fares for the test rows, plus the absolute error per row.
# The row index is not written to the CSV (index=False).
results_comparison = pd.DataFrame({
    'Actual_Fare': y_test,
    'Predicted_Fare': y_pred,
}).assign(
    Absolute_Error=lambda frame: (frame['Actual_Fare'] - frame['Predicted_Fare']).abs()
)

results_comparison.to_csv('regression_predictions.csv', index=False)

print("üíæ MODEL AND RESULTS SAVED:")
for saved_line in (
    " - 'linear_regression_fare_model.pkl' (trained model)",
    " - 'regression_predictions.csv' (prediction results)",
):
    print(saved_line)
print(f"\nüìÅ Save this notebook as 'week5_regression_analysis.ipynb'")
print("üöÄ Upload to GitHub to complete Assignment 5!")