## 1. Import Libraries

Import all necessary libraries for data analysis, visualization, and modeling.

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

from google.colab import files
import os 

# Upload dataset
print("Please upload the dataset CSV for this deliverable (dataset2.csv).")
uploaded = files.upload()  

# Create directory to store figures inside Colab
dataset_id = "dataset2"
FIGURES_DIR = f"figures/{dataset_id}" 
os.makedirs(FIGURES_DIR, exist_ok=True)


# Set random seed for reproducibility
np.random.seed(42)

# Set plotting style
sns.set_style('whitegrid')
%matplotlib inline

## 2. Load Data

Load the credit risk dataset from the `dataset2.csv` file uploaded in the previous step.

In [None]:
# Read the dataset CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset2.csv')

# Report the (rows, columns) shape, then preview the first five rows;
# head() stays as the last expression so the notebook renders it
print(f"Dataset shape: {df.shape}")
df.head(n=5)

## 3. Exploratory Data Analysis (EDA)

### 3.1 Basic Information

Examine the structure and basic statistics of the dataset.

In [None]:
# Display data info
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would add a stray
# "None" line after the report.
print("Dataset Info:")
df.info()

# describe() returns a DataFrame of summary statistics, so it does need print()
print("\nBasic Statistics:")
print(df.describe())

### 3.2 Missing Values

Check for missing values in the dataset.

In [None]:
# TODO: Check for missing values and visualize them
# Hint: Use df.isnull().sum() and consider creating a visualization

### 3.3 Target Variable Distribution

Analyze the distribution of the target variable (loan default).

In [None]:
# TODO: Analyze the loan_default distribution
# - Check value counts and percentages
# - Create a visualization (bar plot or pie chart)
# - Consider the class imbalance

### 3.4 Feature Exploration

Create visualizations to explore relationships between features and loan default.

In [None]:
# TODO: Create exploratory visualizations
# Ideas:
# - Distribution of credit scores for defaulters vs non-defaulters
# - Default rate by employment type, education, loan purpose
# - Correlation heatmap for financial metrics
# - Scatter plots for debt-to-income ratio vs default rate
# - Box plots for numerical features grouped by default status

# Example: Credit score distribution by default status
# plt.figure(figsize=(12, 6))
# sns.histplot(data=df, x='credit_score', hue='loan_default', bins=30, kde=True)
# plt.title('Credit Score Distribution by Default Status')
# plt.show()

## 4. Data Cleaning and Preprocessing

### 4.1 Handle Missing Values

Decide how to handle missing values appropriately for this financial dataset.

In [None]:
# TODO: Handle missing values
# Consider:
# - Missing employment_length_years: could impute with median or 0
# - Missing savings_account_balance: could impute with 0 or median
# - Think about what makes sense for financial data

### 4.2 Handle Duplicates

Check for and remove duplicate applications.

In [None]:
# TODO: Check for duplicates and remove if necessary

### 4.3 Handle Categorical Variables

Standardize and encode categorical variables. Note inconsistencies in the data!

In [None]:
# TODO: Standardize categorical values (e.g., 'Bachelor' vs 'bachelor')
# TODO: Encode categorical variables appropriately
# Consider:
# - Binary encoding for binary features
# - One-hot encoding for nominal features
# - Label encoding for ordinal features (if any)

### 4.4 Handle Outliers

Identify and handle outliers in financial metrics.

In [None]:
# TODO: Identify outliers in columns like annual_income, loan_amount, total_existing_debt
# Decide whether to cap, remove, or keep them
# For financial data, extreme values might be legitimate

### 4.5 Feature Engineering (Optional)

Consider creating new features that might be predictive.

In [None]:
# TODO: Create new features if you think they would help
# Ideas:
# - Total assets (savings + checking)
# - Debt service ratio
# - Credit utilization metrics

### 4.6 Feature Scaling

Scale numerical features for better model performance.

In [None]:
# TODO: Scale numerical features
# Consider which features need scaling

## 5. Train/Validation Split

Split the data into training and validation sets. Use stratification due to class imbalance!

In [None]:
# TODO: Prepare features (X) and target (y)
# Remove non-predictive columns (e.g., application_id)

# TODO: Split with stratification
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 6. Model Training

Train classification model(s). Consider:
- Logistic Regression (baseline)
- Random Forest Classifier
- Gradient Boosting (XGBoost, LightGBM)
- Consider using class weights or SMOTE for imbalanced data

In [None]:
# TODO: Import and train your chosen model(s)
# For imbalanced data, consider:
# - class_weight='balanced' parameter
# - Sampling techniques (SMOTE)

# Example:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
# model.fit(X_train, y_train)

## 7. Model Evaluation

Evaluate your model. For imbalanced classification, focus on:
- Precision, Recall, F1-Score (especially for the positive class)
- ROC AUC Score
- Confusion Matrix
- Consider the business cost of false positives vs false negatives

In [None]:
# TODO: Make predictions
# y_pred = model.predict(X_test)
# y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
# TODO: Calculate and display evaluation metrics
# print(classification_report(y_test, y_pred))
# print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

## 8. Visualizations

Create visualizations to communicate your findings and model performance.

### 8.1 EDA Visualization

Create a visualization that highlights important patterns in the data.

In [None]:
# TODO: Create an EDA visualization
# Save it to content/figures/dataset2/eda_plot.png (inside Colab)

# Ideas:
# - Key financial metrics by default status
# - Default rate across different risk factors
# - Correlation heatmap

# Example template:
# fig, ax = plt.subplots(figsize=(14, 6))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig(f'{FIGURES_DIR}/eda_plot.png', dpi=300, bbox_inches='tight')
# plt.show()

### 8.2 Model Performance Visualization

Create a visualization showing model performance.

In [None]:
# TODO: Create a model performance visualization
# Save it to content/figures/dataset2/model_performance.png (inside Colab)

# Ideas:
# - Confusion Matrix with percentages
# - ROC Curve
# - Precision-Recall Curve (important for imbalanced data)
# - Feature Importance

# Example template:
# fig, ax = plt.subplots(figsize=(10, 8))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig(f'{FIGURES_DIR}/model_performance.png', dpi=300, bbox_inches='tight')
# plt.show()

## 9. Summary and Next Steps

Write a brief summary of:
- Key risk factors identified
- Model performance and business implications
- Recommendations for loan approval strategy

**Your Summary Here:**

- Key risk factors: ...
- Model performance: ...
- Business recommendations: ...
- Next steps: ...

In [None]:
# Download zip file of your figures 
zip_name = f"figures_{dataset_id}.zip"
!zip -r "$zip_name" "$FIGURES_DIR"
print(f"‚úÖ Created {zip_name} ‚Äî download it from the Files panel on the left.")

## 📤 Submission!

> ⚠️ Important: If your Colab session idles too long, it may restart. \
>  You will need to re-upload your dataset and re-run your notebook before saving figures.

Once you have completed your notebook:

1. **Download your notebook and figures** from Google Colab  

      - `File → Download → Download .ipynb`
      - Download `figures_dataset2.zip` from the left side pane

2. **Upload your completed notebook here:**  

      [Submission Form](https://airtable.com/appGZ1Cp7fr2YNekA/pagsuGiP3ZtSPovtP/form)

## 📣 Certification & Participation

To receive a **DalMLSociety Supervised Learning Certificate**, submit one of the three deliverable notebooks by:

> ⏰ November 23, 2025 @ 11:59 PM Atlantic Time