In [None]:
# Week 6: Supervised Learning - Classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Suppress every warning for the rest of the session. This keeps the notebook
# output tidy, but it also hides library warnings that may be informative.
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("‚úÖ Week 6 Classification Environment Ready!")

# Read the cleaned dataset from the working directory
df = pd.read_csv('titanic_cleaned.csv')

# Report what was loaded: shape first, then the column names
load_summary = (
    "Dataset loaded successfully!",
    f"Dataset shape: {df.shape}",
    "\nColumns available:",
)
print(*load_summary, sep="\n")
print(df.columns.tolist())

In [None]:
print("=== DATASET OVERVIEW FOR CLASSIFICATION ANALYSIS ===")

# Preview of the first rows, rendered with the notebook's rich display
print("First 5 rows:")
display(df.head())

# Class balance of the target: absolute counts plus the share of survivors
survived_column = df['Survived']
target_distribution = survived_column.value_counts()
survival_rate_pct = survived_column.mean() * 100

print("\nTarget variable distribution:")
print(target_distribution)
print(f"Survival rate: {survival_rate_pct:.1f}%")

# Column dtypes and the total number of missing cells in the whole frame
total_missing = df.isnull().sum().sum()

print("\nData types:")
print(df.dtypes)
print(f"\nMissing values: {total_missing}")

In [None]:
print("=== PREPARING FOR CLASSIFICATION ===")

# Classification task for the Titanic dataset:
#   predict Survived (0 = did not survive, 1 = survived)
# using only features that were known BEFORE the event.

# Select features that would be known before the Titanic sank
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_variable = 'Survived'

print(f"üéØ CLASSIFICATION PROBLEM: Predict {target_variable}")
print(f"Features: {', '.join(features)}")

# Prepare feature matrix X (a copy, so df itself is never modified)
X = df[features].copy()

# Handle categorical variables (encode them)
print("\nüîß Preprocessing categorical variables...")

# Encode 'Sex' column (male=0, female=1).
# A plain .map() silently turns every unexpected label into NaN, which the
# median fill further down would then overwrite or fail to repair. Validate
# the mapping instead so a bad column is reported here, not at model fitting.
sex_codes = {'male': 0, 'female': 1}
if pd.api.types.is_numeric_dtype(X['Sex']):
    # Column was already encoded by an earlier cleaning step; keep it as is.
    # NOTE(review): assumes the existing encoding is male=0 / female=1 - confirm.
    print("'Sex' is already numeric - skipping text encoding")
else:
    # Normalise case and surrounding whitespace before looking labels up
    sex_text = X['Sex'].map(lambda v: v.strip().lower() if isinstance(v, str) else v)
    sex_encoded = sex_text.map(sex_codes)
    # Rows that had a value but did not match any known label
    unmapped_mask = sex_encoded.isna() & X['Sex'].notna()
    if unmapped_mask.any():
        unexpected = sorted(map(str, X.loc[unmapped_mask, 'Sex'].unique()))
        raise ValueError(f"Unexpected values in 'Sex' column: {unexpected}")
    X['Sex'] = sex_encoded

# Encode 'Embarked' column (one-hot encoding)
embarked_encoded = pd.get_dummies(X['Embarked'], prefix='Embarked')
X = pd.concat([X, embarked_encoded], axis=1)
X = X.drop('Embarked', axis=1)

print("Updated features after encoding:")
print(X.columns.tolist())

# Target variable
y = df[target_variable]

print(f"\nüìä Final dataset shape:")
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")

# Handle any missing values.
# NOTE(review): the medians are computed on the full feature matrix, i.e.
# before any train/test split, so held-out rows influence the fill values.
if X.isnull().sum().sum() > 0:
    X = X.fillna(X.median())
    print("Filled missing values with median")

print(f"Missing values after cleaning: {X.isnull().sum().sum()}")

In [None]:
print("=== TRAIN-TEST SPLIT ===")

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the
# partition reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train_rows, n_feature_columns = X_train.shape
print(f"Training set size: {n_train_rows} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"Number of features: {n_feature_columns}")

# Show the class proportions of each partition to confirm the stratification
for split_label, split_target in (("training", y_train), ("testing", y_test)):
    print(f"\nClass distribution in {split_label} set:")
    print(split_target.value_counts(normalize=True))

In [None]:
print("=== DECISION TREE CLASSIFIER ===")

# A shallow tree (depth 3) limits overfitting; the seed keeps runs reproducible
dt_settings = {'max_depth': 3, 'random_state': 42}
dt_model = DecisionTreeClassifier(**dt_settings)
dt_model.fit(X_train, y_train)

print("‚úÖ Decision Tree model trained successfully!")

# Score the fitted tree on the held-out rows
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print(f"Decision Tree Accuracy: {dt_accuracy:.3f} ({dt_accuracy*100:.1f}%)")

# Pair every feature with its importance, most important first
dt_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': dt_model.feature_importances_}
)
dt_importance = dt_importance.sort_values('importance', ascending=False)

print("\nüìä Decision Tree Feature Importance:")
display(dt_importance)

In [None]:
print("=== DECISION TREE VISUALIZATION ===")

# Draw the fitted decision tree on a wide canvas so node labels stay legible.
plt.figure(figsize=(20, 10))
plot_tree(dt_model,
          # Pass a plain list of names rather than a pandas Index; a list is
          # accepted by every scikit-learn release.
          feature_names=list(X.columns),
          class_names=['Not Survived', 'Survived'],
          filled=True,
          rounded=True,
          fontsize=12)

plt.title('Decision Tree for Titanic Survival Prediction', fontsize=16)
plt.show()

print("üîç Decision Tree Interpretation:")
print("‚Ä¢ Each node shows the decision rule")
print("‚Ä¢ Color intensity shows class probability")
print("‚Ä¢ Leaf nodes show final predictions")

In [None]:
print("=== RANDOM FOREST CLASSIFIER ===")

# Create and train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,   # Number of trees in the forest
    max_depth=5,        # Limit depth of each tree
    random_state=42     # For reproducible results
)

rf_model.fit(X_train, y_train)

print("‚úÖ Random Forest model trained successfully!")

# Make predictions on the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {rf_accuracy:.3f} ({rf_accuracy*100:.1f}%)")

# Feature importance, most important first
rf_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüìä Random Forest Feature Importance:")
display(rf_importance)

# Compare with Decision Tree
print(f"\nüìà ACCURACY COMPARISON (Class Task):")
print(f"Decision Tree: {dt_accuracy:.3f} ({dt_accuracy*100:.1f}%)")
print(f"Random Forest: {rf_accuracy:.3f} ({rf_accuracy*100:.1f}%)")

if rf_accuracy > dt_accuracy:
    # Relative gain, expressed as a percentage of the Decision Tree accuracy
    improvement = ((rf_accuracy - dt_accuracy) / dt_accuracy) * 100
    print(f"‚úÖ Random Forest improves accuracy by {improvement:.1f}%")
elif rf_accuracy == dt_accuracy:
    # Both accuracies are fractions over the same test set, so an exact
    # comparison is safe; a tie must not be reported as a Decision Tree win.
    print("Random Forest and Decision Tree achieve the same accuracy in this case")
else:
    print("‚ùå Decision Tree performs better in this case")

In [None]:
print("=== LOGISTIC REGRESSION ===")

# max_iter is raised to 1000 to give the solver more iterations to converge
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

print("‚úÖ Logistic Regression model trained successfully!")

# Hard class predictions and their accuracy on the held-out rows
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print(f"Logistic Regression Accuracy: {lr_accuracy:.3f} ({lr_accuracy*100:.1f}%)")

# Second column of predict_proba: probability of survival
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]

print("\nüìä Logistic Regression Coefficients:")

# One row per feature, ordered by absolute coefficient size.
# NOTE(review): no feature scaling is applied in the visible cells, so the
# magnitudes also reflect each feature's units - confirm before ranking.
coefficient_table = pd.DataFrame(
    {'feature': X.columns, 'coefficient': lr_model.coef_[0]}
)
lr_coefficients = coefficient_table.sort_values(
    'coefficient', key=abs, ascending=False
)

display(lr_coefficients)

print("\nüîç Coefficient Interpretation:")
print("Positive coefficients increase probability of survival")
print("Negative coefficients decrease probability of survival")

In [None]:
print("=== DETAILED MODEL COMPARISON ===")

# Per model: (hard predictions, predicted probability of survival)
models = {
    'Logistic Regression': (y_pred_lr, y_prob_lr),
    'Random Forest': (y_pred_rf, rf_model.predict_proba(X_test)[:, 1]),
    'Decision Tree': (y_pred_dt, dt_model.predict_proba(X_test)[:, 1])
}


def _summarize_model(model_name, predictions):
    """Build one comparison-table row for a model's test-set predictions.

    Returns a dict holding the accuracy, the per-class precision and recall
    (class '0' = not survived, class '1' = survived) and the macro-averaged
    F1 score taken from scikit-learn's classification report.
    """
    report = classification_report(y_test, predictions, output_dict=True)
    not_survived, survived = report['0'], report['1']
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision_0': not_survived['precision'],
        'Recall_0': not_survived['recall'],
        'Precision_1': survived['precision'],
        'Recall_1': survived['recall'],
        'F1_Score': report['macro avg']['f1-score'],
    }


# The probabilities are not needed for these metrics, only the predictions
comparison_results = [
    _summarize_model(model_name, predictions)
    for model_name, (predictions, _probabilities) in models.items()
]

# Tabulate and rank by accuracy, best model first
comparison_df = pd.DataFrame(comparison_results)
comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

print("üìä COMPREHENSIVE MODEL COMPARISON:")
display(comparison_df.round(3))

# The top row after sorting is the most accurate model
best_model = comparison_df.iloc[0]
best_accuracy = best_model['Accuracy']
print(f"\nüéØ BEST PERFORMING MODEL: {best_model['Model']}")
print(f"   Accuracy: {best_accuracy:.3f} ({best_accuracy*100:.1f}%)")
print(f"   F1 Score: {best_model['F1_Score']:.3f}")

In [None]:
print("=== MODEL PERFORMANCE VISUALIZATION ===")

# 2x2 dashboard: accuracy bars, feature importances, confusion matrix of the
# best model, and a precision/recall scatter for the 'Survived' class.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_accuracy, ax_importance = axes[0, 0], axes[0, 1]
ax_confusion, ax_pr = axes[1, 0], axes[1, 1]
series_colors = ('skyblue', 'lightcoral', 'lightgreen')
class_labels = ['Not Survived', 'Survived']

# --- Panel 1: accuracy per model (in comparison_df order) ---
models_names = comparison_df['Model']
accuracies = comparison_df['Accuracy']

bars = ax_accuracy.bar(models_names, accuracies, color=list(series_colors))
ax_accuracy.set_title('Model Accuracy Comparison', fontweight='bold')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.set_ylim(0, 1)
ax_accuracy.grid(axis='y', alpha=0.3)

# Write each accuracy value just above its bar, centred horizontally
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    ax_accuracy.text(label_x, label_y, f'{acc:.3f}',
                     ha='center', va='bottom', fontweight='bold')

# --- Panel 2: feature importance per model ---
# Logistic regression contributes absolute coefficients; the tree models
# contribute their feature_importances_.
feature_importance_comparison = pd.DataFrame(
    {
        'Logistic Regression': abs(lr_model.coef_[0]),
        'Random Forest': rf_model.feature_importances_,
        'Decision Tree': dt_model.feature_importances_,
    },
    index=X.columns,
)

# Rescale every model's column to sum to 1 so the bars share one scale
column_totals = feature_importance_comparison.sum(axis=0)
feature_importance_comparison = feature_importance_comparison.div(column_totals, axis=1)

feature_importance_comparison.plot(kind='bar', ax=ax_importance, color=list(series_colors))
ax_importance.set_title('Feature Importance Comparison', fontweight='bold')
ax_importance.set_ylabel('Normalized Importance')
ax_importance.tick_params(axis='x', rotation=45)
ax_importance.grid(axis='y', alpha=0.3)
ax_importance.legend()

# --- Panel 3: confusion matrix of the top-ranked model ---
best_model_name = comparison_df.iloc[0]['Model']
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
}
# Any other name falls back to the Decision Tree predictions
best_predictions = predictions_by_model.get(best_model_name, y_pred_dt)

cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_confusion,
            xticklabels=class_labels,
            yticklabels=class_labels)
ax_confusion.set_title(f'Confusion Matrix - {best_model_name}', fontweight='bold')
ax_confusion.set_xlabel('Predicted')
ax_confusion.set_ylabel('Actual')

# --- Panel 4: precision vs recall for the 'Survived' class ---
precision_survived = comparison_df['Precision_1'].values
recall_survived = comparison_df['Recall_1'].values

for model, recall, precision in zip(models_names, recall_survived, precision_survived):
    ax_pr.scatter(recall, precision, s=100, label=model)
    # Offset the text slightly so it does not sit on top of the marker
    ax_pr.text(recall + 0.01, precision + 0.01, model, fontsize=9)

ax_pr.set_xlabel('Recall (Survived)')
ax_pr.set_ylabel('Precision (Survived)')
ax_pr.set_title('Precision-Recall Comparison', fontweight='bold')
ax_pr.set_xlim(0, 1)
ax_pr.set_ylim(0, 1)
ax_pr.grid(alpha=0.3)
ax_pr.legend()

plt.tight_layout()
plt.show()

In [None]:
print("=== MODEL INTERPRETATION AND BUSINESS INSIGHTS ===")

print("üîç KEY INSIGHTS FROM CLASSIFICATION MODELS:")

print(f"\n1. BEST PERFORMING MODEL: {best_model['Model']}")
print(f"   ‚Ä¢ Accuracy: {best_model['Accuracy']:.1%}")
print(f"   ‚Ä¢ Can correctly predict survival for {best_model['Accuracy']:.1%} of passengers")

print(f"\n2. MOST IMPORTANT FEATURES:")
print("   Across all models, these features consistently matter:")
# The listed features are the top three of the Random Forest ranking
top_features = rf_importance.head(3)
for _, row in top_features.iterrows():
    print(f"   ‚Ä¢ {row['feature']}: {row['importance']:.3f} importance")

# The statements below are fixed narrative text; they are not derived from
# the fitted models in this cell.
print(f"\n3. PRACTICAL IMPLICATIONS:")
print("   ‚Ä¢ Gender (Sex) is the strongest predictor of survival")
print("   ‚Ä¢ Passenger class (Pclass) significantly impacts survival chances")
print("   ‚Ä¢ Fare paid correlates with survival probability")
print("   ‚Ä¢ Age has moderate influence on survival")

# Baseline = accuracy of always predicting the majority class. It must be
# computed BEFORE it is reported: the share of survivors (y_test.mean()) is
# not itself the baseline accuracy.
positive_rate = y_test.mean()
baseline_accuracy = max(positive_rate, 1 - positive_rate)
# Relative gain of the best model, as a percentage of the baseline accuracy
improvement = ((best_model['Accuracy'] - baseline_accuracy) / baseline_accuracy) * 100

print(f"\n4. MODEL RELIABILITY:")
# ">=" because the weakest model achieves exactly the minimum accuracy
print(f"   ‚Ä¢ All models achieve >= {comparison_df['Accuracy'].min()*100:.1f}% accuracy")
print(f"   ‚Ä¢ Majority-class guessing baseline: {baseline_accuracy*100:.1f}% accuracy")

if improvement > 0:
    print(f"   ‚Ä¢ Models improve over baseline by {improvement:.1f}%")
else:
    # Do not report a zero or negative number as an improvement
    print("   ‚Ä¢ Best model does not improve over the baseline")

In [None]:
print("=== PROBABILITY ANALYSIS ===")

# Analyze probability distributions: left panel for actual survivors,
# right panel for actual non-survivors, all models overlaid.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Shared bin edges (20 equal-width bins over [0, 1]). With bins=20 every
# hist call would derive its edges from its own data range, so the overlaid
# histograms of different models would not be comparable.
probability_bins = np.linspace(0, 1, 21)

# Boolean masks over the test rows, as plain arrays for positional indexing
is_survivor = (y_test == 1).to_numpy()
is_non_survivor = (y_test == 0).to_numpy()

# Plot probability distributions for survived vs not survived
for model_name, (_predictions, probabilities) in models.items():
    survived_probs = probabilities[is_survivor]
    not_survived_probs = probabilities[is_non_survivor]

    axes[0].hist(survived_probs, bins=probability_bins, alpha=0.5,
                 label=f'{model_name} - Survived')
    axes[1].hist(not_survived_probs, bins=probability_bins, alpha=0.5,
                 label=f'{model_name} - Not Survived')

axes[0].set_xlabel('Predicted Probability of Survival')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Probability Distribution - Actual Survivors')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].set_xlabel('Predicted Probability of Survival')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Probability Distribution - Actual Non-Survivors')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("üîç Probability Analysis Insights:")
print("‚Ä¢ Good models show high probabilities for correct classes")
print("‚Ä¢ Overlapping distributions indicate classification uncertainty")
print("‚Ä¢ Well-calibrated models separate the classes clearly")

In [None]:
print("=" * 70)
print("üìä WEEK 6 ASSIGNMENT REPORT: CLASSIFICATION ANALYSIS")
print("=" * 70)

print(f"\nüéØ CLASSIFICATION PROBLEM:")
print(f"Target Variable: {target_variable} (Binary Classification)")
print(f"Features: {', '.join(features)}")
print(f"Dataset: Titanic ({df.shape[0]} passengers)")

print(f"\nüìà MODEL PERFORMANCE SUMMARY:")
# Size the name column from the longest model name so that every row lines
# up with the header (a fixed width of 18 is too narrow for the 19-character
# 'Logistic Regression').
name_width = max([len('Algorithm'), *map(len, comparison_df['Model'])]) + 2
metric_width = 12
header = (
    f"{'Algorithm':<{name_width}}"
    f"{'Accuracy':<{metric_width}}"
    f"{'Precision':<{metric_width}}"
    f"{'Recall':<{metric_width}}"
    f"{'F1-Score'}"
)
print(header)
print("-" * len(header))
for _, row in comparison_df.iterrows():
    print(
        f"{row['Model']:<{name_width}}"
        f"{row['Accuracy']:<{metric_width}.3f}"
        f"{row['Precision_1']:<{metric_width}.3f}"
        f"{row['Recall_1']:<{metric_width}.3f}"
        f"{row['F1_Score']:.3f}"
    )
# The columns mix two scopes; say so, so the table is not misread
print("(Precision/Recall: 'Survived' class; F1-Score: macro average)")

print(f"\nüîç KEY FINDINGS:")
print(f"1. Best Model: {best_model['Model']} with {best_model['Accuracy']:.1%} accuracy")
print(f"2. Most Important Feature: {rf_importance.iloc[0]['feature']}")
# Only claim that every model beats the baseline when that is actually true
if comparison_df['Accuracy'].min() > baseline_accuracy:
    print(f"3. All models beat baseline guessing ({baseline_accuracy:.1%})")
else:
    print(f"3. Not all models beat baseline guessing ({baseline_accuracy:.1%})")

print(f"\nüí° BUSINESS INSIGHTS:")
print("‚Ä¢ Gender is the strongest survival predictor ('women and children first')")
print("‚Ä¢ Higher socioeconomic status (Pclass, Fare) improves survival chances")
print("‚Ä¢ Traveling with family has complex effects on survival probability")

print(f"\nüöÄ RECOMMENDATIONS:")
print("1. Random Forest provides good balance of accuracy and interpretability")
print("2. Consider feature engineering for better performance")
print("3. Collect additional relevant features if possible")

print(f"\nüìö LEARNING OUTCOMES:")
print("‚úÖ Implemented multiple classification algorithms")
print("‚úÖ Understood difference between regression and classification")
print("‚úÖ Compared model performance using multiple metrics")
print("‚úÖ Interpreted feature importance and model decisions")

In [None]:
# Persist the fitted models and the evaluation tables to the working directory
import joblib

# One pickle file per fitted model, written in this order
model_files = (
    (lr_model, 'logistic_regression_model.pkl'),
    (rf_model, 'random_forest_model.pkl'),
    (dt_model, 'decision_tree_model.pkl'),
)
for fitted_model, file_name in model_files:
    joblib.dump(fitted_model, file_name)

# Test-set labels next to each model's prediction, one row per test passenger
prediction_columns = {
    'Actual': y_test,
    'Logistic_Regression': y_pred_lr,
    'Random_Forest': y_pred_rf,
    'Decision_Tree': y_pred_dt,
}
predictions_comparison = pd.DataFrame(prediction_columns)

# Both CSV files are written without the index column
predictions_comparison.to_csv('classification_predictions.csv', index=False)
comparison_df.to_csv('model_comparison_results.csv', index=False)

print("üíæ MODELS AND RESULTS SAVED:")
print(" - 'logistic_regression_model.pkl'")
print(" - 'random_forest_model.pkl'")
print(" - 'decision_tree_model.pkl'")
print(" - 'classification_predictions.csv'")
print(" - 'model_comparison_results.csv'")
print(f"\nüìÅ Save this notebook as 'week6_classification_analysis.ipynb'")
print("üöÄ Upload to GitHub to complete Assignment 6!")