In [None]:
# ======================
# 1. SETUP & IMPORTS
# ======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Plot styling: matplotlib's default style with seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully")

# ======================
# 2. DATA LOADING & CLEANING
# ======================
# Load the loan data; the path is relative, so the CSV is looked up in the
# current working directory.
df = pd.read_csv('loan_borrower_data.csv')

print(f"Data: {df.shape[0]} rows, {df.shape[1]} columns")
# The mean of 'not.fully.paid' is reported as the default rate
# (assumes the column is a 0/1 flag with 1 = default).
print(f"Default rate: {df['not.fully.paid'].mean():.1%}")

# NOTE: the former cleaning step replaced 'debt_consolidation' in 'purpose'
# with the identical string, which changed nothing; it has been removed.
# 'purpose' is therefore used exactly as it appears in the CSV.
print("Data cleaning completed")

# ======================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# ======================
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS")
print("="*50)

# Create EDA visualizations: one figure with a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Loan Data - Exploratory Analysis', fontsize=16, fontweight='bold')

# Plot 1: Target variable distribution (Donut Chart)
# value_counts() orders its result by frequency, not by class value, so the
# order is pinned to [0, 1] here. That guarantees each wedge is paired with
# the right label and colour regardless of which class is more common.
# fill_value=0 keeps both entries present even if one class never occurs.
loan_counts = df['not.fully.paid'].value_counts().reindex([0, 1], fill_value=0)

# Create the pie (donut) chart
axes[0,0].pie(
    loan_counts,
    labels=['Paid', 'Default'],        # assuming 0 = Paid, 1 = Default (order pinned above)
    autopct='%1.1f%%',
    colors=['lightblue', 'salmon'],
    startangle=90,
    wedgeprops={'width': 0.4, 'edgecolor': 'white'}  # width < 1 makes it a donut
)

# Set chart title
axes[0,0].set_title('Loan Default Distribution')

# Add legend
axes[0,0].legend(title='Loan Status', loc='upper right')

# Equal axis scaling so the donut is drawn as a circle rather than an ellipse
axes[0,0].axis('equal')


# Plot 2: number of loans per purpose (bar chart, most frequent first)
purpose_counts = df['purpose'].value_counts()
purpose_ax = axes[0,1]
purpose_counts.plot.bar(ax=purpose_ax, color='lightgreen')
purpose_ax.set(
    title='Loan Purposes Distribution',
    xlabel='Loan Purpose',
    ylabel='Number of Loans',
)
purpose_ax.tick_params(axis='x', rotation=45)

# Plots 3 and 4: box plots of one numeric column split by loan status.
# Each entry is (column, target panel, panel title, y-axis label).
boxplot_specs = [
    ('fico', axes[0,2], 'FICO Scores by Loan Status', 'FICO Score'),
    ('int.rate', axes[1,0], 'Interest Rates by Loan Status', 'Interest Rate'),
]
for column, panel, panel_title, y_label in boxplot_specs:
    sns.boxplot(x='not.fully.paid', y=column, data=df, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel('Loan Status (0=Paid, 1=Default)')
    panel.set_ylabel(y_label)

# Plot 5: mean of 'not.fully.paid' per purpose, highest first
mean_default_by_purpose = df.groupby('purpose')['not.fully.paid'].mean()
purpose_defaults = mean_default_by_purpose.sort_values(ascending=False)
rate_ax = axes[1,1]
purpose_defaults.plot.bar(ax=rate_ax, color='orange')
rate_ax.set(
    title='Default Rates by Loan Purpose',
    xlabel='Loan Purpose',
    ylabel='Default Rate',
)
rate_ax.tick_params(axis='x', rotation=45)
# Write each bar's rate as a percentage just above the bar
for bar_position, rate in enumerate(purpose_defaults):
    rate_ax.text(bar_position, rate, f'{rate:.1%}', ha='center', va='bottom')

# Plot 6: pairwise correlations between the numeric columns and the target
numeric_cols = [
    'int.rate', 'installment', 'dti', 'fico', 'days.with.cr.line',
    'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec',
]
heatmap_columns = [*numeric_cols, 'not.fully.paid']
correlation_matrix = df[heatmap_columns].corr()
heatmap_ax = axes[1,2]
sns.heatmap(correlation_matrix, ax=heatmap_ax, annot=True, center=0, cmap='coolwarm')
heatmap_ax.set_title('Feature Correlation Heatmap')

# Tidy the layout, write the figure to disk, then display it
fig.tight_layout()
fig.savefig('loan_eda_visualizations.png', dpi=300, bbox_inches='tight')
plt.show()

# Summarise the headline numbers from the EDA as a numbered list
print("\nKEY EDA INSIGHTS:")

overall_default_rate = df['not.fully.paid'].mean()
print(f"1. Default rate: {overall_default_rate:.1%}")

# value_counts() is sorted by frequency, so the first index entry is the mode
most_common_purpose = df['purpose'].value_counts().index[0]
print(f"2. Most common loan purpose: {most_common_purpose}")

mean_fico = df['fico'].mean()
print(f"3. Average FICO score: {mean_fico:.0f}")

# Mean FICO of rows flagged 1 minus mean FICO of rows flagged 0
defaulted_mask = df['not.fully.paid'] == 1
paid_mask = df['not.fully.paid'] == 0
fico_gap = df.loc[defaulted_mask, 'fico'].mean() - df.loc[paid_mask, 'fico'].mean()
print(f"4. FICO score difference (default vs paid): {fico_gap:.0f} points")

# purpose_defaults is sorted descending, so position 0 is the riskiest purpose
riskiest_purpose = purpose_defaults.index[0]
riskiest_rate = purpose_defaults.iloc[0]
print(f"5. Highest default rate by purpose: {riskiest_purpose} ({riskiest_rate:.1%})")

mean_interest_rate = df['int.rate'].mean()
print(f"6. Average interest rate: {mean_interest_rate:.2%}")

print("EDA completed and visualizations saved")

# ======================
# 4. FEATURE ENGINEERING
# ======================
# Undo the log transform: exponentiate 'log.annual.inc' to get income itself
log_income = df['log.annual.inc']
df['annual_inc'] = np.exp(log_income)

# Bucket FICO scores into four labelled bands; the edges are right-inclusive
# by pd.cut's default, i.e. (300, 580], (580, 670], (670, 740], (740, 850]
fico_bin_edges = [300, 580, 670, 740, 850]
fico_band_labels = ['Poor', 'Fair', 'Good', 'Excellent']
df['fico_category'] = pd.cut(df['fico'], fico_bin_edges, labels=fico_band_labels)

# One-hot encode 'purpose' into 'purpose_<value>' columns appended to df;
# the original 'purpose' column is kept
purpose_dummies = pd.get_dummies(df['purpose'], prefix='purpose')
df = pd.concat((df, purpose_dummies), axis='columns')

print("Feature engineering completed")

# ======================
# 5. PREPARE FOR MODELING
# ======================
# Base predictors taken directly from the raw columns
features = ['credit.policy', 'int.rate', 'installment', 'dti', 'fico', 
           'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs']
# Add the one-hot purpose columns. startswith() is used instead of a substring
# test so that only columns carrying the 'purpose_' prefix are selected, not
# any column that merely contains that text somewhere in its name.
features += [col for col in df.columns if col.startswith('purpose_')]

X = df[features]
y = df['not.fully.paid']

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of
# each target class the same in both splits, so the evaluation is not skewed
# by an unlucky draw of the minority class. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Data prepared: {X_train.shape} train, {X_test.shape} test")

# ======================
# 6. SCALE THE DATA
# ======================
scaler = StandardScaler()
# Columns to standardise; 'credit.policy' and the purpose dummies are left as-is
numerical_features = ['int.rate', 'installment', 'dti', 'fico', 
                     'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs']

# X_train / X_test are row subsets derived from X. Take explicit copies before
# assigning columns so the writes are unambiguous and cannot trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the test rows so no test-set statistics leak into training.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

print("Data scaling completed")

# ======================
# 7. TRAIN MODEL
# ======================
# Logistic regression with a raised iteration cap, fitted on the training split
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print("Model training completed")

# ======================
# 8. EVALUATE MODEL
# ======================
# Predicted class labels for the held-out rows
y_pred = model.predict(X_test)

print("\nModel Performance:")
performance_report = classification_report(y_test, y_pred)
print(performance_report)

# Rank features by the absolute value of their fitted coefficient.
# NOTE(review): only the columns in numerical_features were standardised, so
# magnitudes for the remaining columns are on a different scale — confirm this
# ranking is meant to compare them directly.
coefficient_magnitudes = np.abs(model.coef_[0])
importance = pd.DataFrame({'feature': features, 'importance': coefficient_magnitudes})
print("Top 5 Predictors:")
top_five = importance.nlargest(5, 'importance')
print(top_five)

# ======================
# 9. EXPORT RESULTS
# ======================
# The model was fitted on standardised inputs, so the full feature matrix must
# go through the same fitted scaler before scoring. Scoring the raw X would
# feed unscaled values to the model and give meaningless probabilities.
# A copy is scaled so that X itself is left untouched.
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.transform(X_scaled[numerical_features])

# Column 1 of predict_proba is the probability of the class labelled 1
df['default_probability'] = model.predict_proba(X_scaled)[:, 1]

# Tiers: Low = [0, 0.1], Medium = (0.1, 0.3], High = (0.3, 1].
# include_lowest=True makes the first bin closed on the left, so a probability
# of exactly 0 is assigned to 'Low' instead of becoming NaN.
df['risk_tier'] = pd.cut(df['default_probability'], [0, 0.1, 0.3, 1], 
                        labels=['Low', 'Medium', 'High'], include_lowest=True)

# Write the enriched table (original columns plus engineered and scored ones)
df.to_csv('loan_analysis_results.csv', index=False)
print("Results saved to 'loan_analysis_results.csv'")

# ======================
# 10. KEY INSIGHTS
# ======================
banner = "="*50
print("\n" + banner)
print("FINAL BUSINESS INSIGHTS")
print(banner)

overall_rate = df['not.fully.paid'].mean()
print(f"Overall default rate: {overall_rate:.1%}")

# Purpose whose rows have the highest mean of 'not.fully.paid'
highest_risk_purpose = df.groupby('purpose')['not.fully.paid'].mean().idxmax()
print(f"Highest risk purpose: {highest_risk_purpose}")

# Mean FICO of rows flagged 1 minus mean FICO of rows flagged 0
flagged_default = df['not.fully.paid'] == 1
flagged_paid = df['not.fully.paid'] == 0
fico_difference = df.loc[flagged_default, 'fico'].mean() - df.loc[flagged_paid, 'fico'].mean()
print(f"FICO difference (default vs paid): {fico_difference:.0f} points")

# Share of rows falling in each risk tier
print("Risk tier distribution:")
risk_dist = df['risk_tier'].value_counts(normalize=True)
for tier_name, tier_share in risk_dist.items():
    print(f"   - {tier_name} risk: {tier_share:.1%}")

# Feature with the largest absolute coefficient
top_feature = importance.nlargest(1, 'importance').iloc[0]['feature']
print(f"Top predictor: {top_feature}")

# Fraction of test rows whose predicted label matches the true label
correct_predictions = (y_pred == y_test).sum()
test_accuracy = correct_predictions / len(y_test)
print(f"Model accuracy: {test_accuracy:.1%}")

print("\nAnalysis complete! Check 'loan_eda_visualizations.png' for EDA charts")