# Phase 2: Data Understanding

## Fannie Mae 2008Q1 Stress Testing - Credit Default Risk Modeling

---

### CRISP-DM Phase 2: Load, Explore, and Describe the Data

In [None]:
# Import Libraries
# pandas handles loading/inspection and matplotlib draws the charts below.
# numpy and seaborn are imported but not referenced in the cells visible here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/matplotlib deprecation and parsing
# warnings; consider narrowing the filter so data-quality warnings stay visible.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-' prefixed style name presumably requires
# matplotlib >= 3.6 — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')

print("Libraries imported successfully!")

## 2.1 Configuration

In [None]:
# Configuration
# Everything a reader might tune lives in this cell: the input file, how many
# rows to read, and the positional column names handed to the CSV reader.
DATA_PATH = "2008Q1.csv"
SAMPLE_SIZE = 1_000_000  # Increased sample for better model performance
TOTAL_COLUMNS = 110      # Total number of names supplied to the reader

# Fannie Mae Performance File Column Names
# Names are applied purely by position (the file is read with header=None),
# so the order below determines which data each name ends up labelling.
# NOTE(review): confirm this ordering against the official layout document
# for the 2008Q1 file before relying on any column, especially the target.
COLUMN_NAMES = [
    'loan_sequence_number',         # 0
    'monthly_reporting_period',     # 1
    'current_actual_upb',           # 2
    'current_loan_delinquency',     # 3 - TARGET
    'loan_age',                     # 4
    'remaining_months_maturity',    # 5
    'repurchase_flag',              # 6
    'modification_flag',            # 7
    'zero_balance_code',            # 8
    'zero_balance_date',            # 9
    'current_interest_rate',        # 10
    'current_deferred_upb',         # 11
    'due_date_last_paid',           # 12
    'mi_recoveries',                # 13
    'net_sales_proceeds',           # 14
    'non_mi_recoveries',            # 15
    'expenses',                     # 16
    'legal_costs',                  # 17
    'maintenance_costs',            # 18
    'taxes_insurance_due',          # 19
    'miscellaneous_expenses',       # 20
    'actual_loss_calculation',      # 21
    'modification_cost',            # 22
    'step_modification_flag',       # 23
    'deferred_payment_mod',         # 24
    'estimated_ltv',                # 25
    'zero_balance_removal_upb',     # 26
    'delinquent_accrued_interest',  # 27
    'delinquency_due_disaster',     # 28
    'borrower_assistance_status',   # 29
]

# Add remaining columns as positional placeholders (col_30 ... col_109).
# The start index is derived from the list itself so that adding or removing
# a descriptive name above cannot leave a gap or push the total past
# TOTAL_COLUMNS.
COLUMN_NAMES.extend(
    f'col_{i}' for i in range(len(COLUMN_NAMES), TOTAL_COLUMNS)
)

# Fail fast on a malformed schema instead of mislabelling data at load time.
assert len(COLUMN_NAMES) == TOTAL_COLUMNS, (
    f"expected {TOTAL_COLUMNS} column names, got {len(COLUMN_NAMES)}"
)
assert len(set(COLUMN_NAMES)) == len(COLUMN_NAMES), "duplicate column names"

print("Configuration:")
print(f"  Data Path: {DATA_PATH}")
print(f"  Sample Size: {SAMPLE_SIZE:,}")
print(f"  Column Names: {len(COLUMN_NAMES)} defined")

## 2.2 Load Data

In [None]:
# Load Data
# Reads the first SAMPLE_SIZE rows of the pipe-delimited, header-less file
# and labels the fields positionally with COLUMN_NAMES.
print(f"Loading {SAMPLE_SIZE:,} rows from {DATA_PATH}...")

# Reader options gathered in one place so they are easy to audit.
read_options = {
    'sep': '|',
    'header': None,            # the file carries no header row
    'names': COLUMN_NAMES,
    'nrows': SAMPLE_SIZE,      # head-of-file slice, not a random sample
    'low_memory': False,       # parse whole columns at once when inferring dtypes
    'on_bad_lines': 'skip',    # malformed rows are dropped without being reported
}
df = pd.read_csv(DATA_PATH, **read_options)

print(f"\n✓ Loaded {len(df):,} records with {df.shape[1]} columns")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 2.3 Data Structure

In [None]:
# Data Structure
# Dimensions first, then how many columns landed on each inferred dtype.
frame_shape = df.shape
print("Data Shape:", frame_shape)

dtype_tally = df.dtypes.value_counts()
print("\nColumn Types:")
print(dtype_tally)

print("\nFirst 5 rows:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head(5)

## 2.4 Column Data Availability

In [None]:
# Analyze columns with data
# Non-null count for every column in a single vectorized pass, then keep only
# the columns that hold at least one value.
non_null_counts = df.count()
populated_counts = non_null_counts[non_null_counts > 0]

# One record per populated column. len(df) cannot be zero here because a
# column only reaches this point with a positive non-null count.
col_data = [
    {
        'Column': col,
        'Non-Null Count': non_null,
        'Percent': f"{non_null/len(df)*100:.1f}%"
    }
    for col, non_null in populated_counts.items()
]

# Passing the column labels explicitly keeps the frame well-formed (and
# sortable) even when no column has any data; a stable sort makes the order
# of columns with equal counts deterministic.
col_df = pd.DataFrame(
    col_data, columns=['Column', 'Non-Null Count', 'Percent']
).sort_values('Non-Null Count', ascending=False, kind='stable')
print(f"Columns with data: {len(col_df)} out of {len(df.columns)}")
print("\nTop 20 columns:")
# Bare last expression so the notebook renders the table.
col_df.head(20)

## 2.5 Target Variable Analysis

In [None]:
# Target Variable Distribution
print("Target Variable: current_loan_delinquency")
print("\nDistribution:")
# Frequency of each status value; missing values are kept as their own bucket.
target_dist = df['current_loan_delinquency'].value_counts(dropna=False)
print(target_dist.head(15))

# Visualize the ten most frequent status values as a bar chart.
fig, ax = plt.subplots(figsize=(10, 5))
most_common_statuses = target_dist.head(10)
most_common_statuses.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Distribution of Loan Delinquency Status', fontsize=14, fontweight='bold')
ax.set(xlabel='Delinquency Status', ylabel='Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 2.6 Key Features Analysis

In [None]:
# Analyze key numeric features
# Each listed column is coerced to numeric (unparseable entries become NaN)
# and summarized with its valid-value share and basic statistics.
key_features = [
    'current_actual_upb',
    'loan_age',
    'remaining_months_maturity',
    'current_interest_rate',
    'current_deferred_upb',
    'estimated_ltv',
]

for col in key_features:
    # Columns missing from the frame are skipped silently.
    if col not in df.columns:
        continue

    numeric_vals = pd.to_numeric(df[col], errors='coerce')
    valid_count = numeric_vals.notna().sum()
    print(f"\n{col}:")
    print(f"  Valid values: {valid_count:,} ({valid_count/len(df)*100:.1f}%)")

    # No statistics to report when nothing parsed as a number.
    if valid_count == 0:
        continue

    print(f"  Mean: {numeric_vals.mean():.2f}")
    print(f"  Std: {numeric_vals.std():.2f}")
    print(f"  Min: {numeric_vals.min():.2f}")
    print(f"  Max: {numeric_vals.max():.2f}")

In [None]:
# Save dataframe for next phase
# The loaded frame is written unchanged, as a pickle in the working directory.
phase2_output_file = 'phase2_data.pkl'
df.to_pickle(phase2_output_file)
print("\n✓ Data saved to phase2_data.pkl for Phase 3")

---
## ✅ Phase 2 Complete

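**Key Findings**:
- Dataset: up to the first 1,000,000 rows of `2008Q1.csv` (a head-of-file slice, not a random sample), read with 110 column names — 30 descriptive names plus 80 `col_N` placeholders
- Key numeric features profiled for modeling: `current_actual_upb`, `loan_age`, `remaining_months_maturity`, `current_interest_rate`, `current_deferred_upb`, `estimated_ltv`
- Target variable: `current_loan_delinquency`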

**Next**: Phase 3 - Data Preparation