In [None]:
import os
from pathlib import Path

import pandas as pd

# Location of the cleaned loan dataset. The default is the path this notebook
# was originally written against; set the LOAN_DATA_PATH environment variable
# to run the notebook on another machine without editing this cell.
DATA_PATH = Path(os.environ.get(
    'LOAN_DATA_PATH',
    '/Users/souadmouajel/Desktop/Ironhack/lab-sessions/week-7/loan-approval-prediction/data/clean/cleaned_loan_data.csv',
))

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Collect the numeric columns that will be screened for outliers,
# leaving out the 'loan_id' identifier when it is present.
num_cols = [
    name
    for name in df.select_dtypes(include='number').columns
    if name != 'loan_id'
]

print(num_cols)

In [None]:
# Outlier detection
import matplotlib.pyplot as plt
import seaborn as sns

# One boxplot per numeric column, laid out on a grid three plots wide.
n_plot_rows = (len(num_cols) + 2) // 3  # ceiling of len(num_cols) / 3
fig = plt.figure(figsize=(15, 10))
for i, col in enumerate(num_cols):
    ax = fig.add_subplot(n_plot_rows, 3, i + 1)
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)

# Lay the grid out once, after every subplot exists, instead of recomputing
# the layout on each loop iteration.
if fig.axes:
    fig.tight_layout()
plt.show()


In [None]:
# Summarise column dtypes and non-null counts of the loaded data.
df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns to plot, excluding the 'loan_id' identifier if it exists.
num_cols = df.select_dtypes(include='number').columns.tolist()
if 'loan_id' in num_cols:
    num_cols.remove('loan_id')

# One histogram (with a KDE overlay) per numeric column, arranged in rows of 3.
n_plot_rows = (len(num_cols) + 2) // 3  # ceiling of len(num_cols) / 3
fig = plt.figure(figsize=(15, 10))

for i, col in enumerate(num_cols):
    ax = fig.add_subplot(n_plot_rows, 3, i + 1)
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')

# Lay the grid out once, after every subplot exists, instead of recomputing
# the layout on each loop iteration.
if fig.axes:
    fig.tight_layout()

plt.show()

In [None]:
# Column dtypes and non-null counts again (same summary as the earlier df.info() cell).
df.info()

In [None]:
# Inspect the distinct labels in the target column before they are encoded.
df['loan_status'].unique()

In [None]:
# Transform the target column
# Step 1.1: Map target labels to binary (1 for approved, 0 for rejected)
status_to_binary = {'approved': 1, 'rejected': 0}

# Only map while the column still holds the raw string labels. Re-running this
# cell on the already-encoded 0/1 values would otherwise turn every row into
# NaN, because 0 and 1 are not keys of the mapping.
if not df['loan_status'].isin(list(status_to_binary.values())).all():
    encoded_status = df['loan_status'].map(status_to_binary)

    # Fail loudly on labels the mapping does not know (different casing,
    # stray whitespace, missing values) instead of silently producing NaN
    # targets that would only surface later, at split or training time.
    unmapped_labels = df.loc[encoded_status.isna(), 'loan_status'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            f"Unexpected loan_status labels (expected 'approved'/'rejected'): "
            f"{unmapped_labels.tolist()}"
        )

    df['loan_status'] = encoded_status.astype(int)


In [None]:
# Re-inspect the distinct values of the target column after the mapping step.
df['loan_status'].unique()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Split features and target, then make a stratified 80/20 train/test split
X = df.drop(columns='loan_status')
y = df['loan_status']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Drop the 'loan_id' identifier if present. The index is deliberately NOT
#    reset yet, so the outlier mask built below can select matching rows from
#    both the features and the target.
X_train_raw = X_train_raw.drop(columns=['loan_id'], errors='ignore')
X_test_raw = X_test_raw.drop(columns=['loan_id'], errors='ignore')

# 3. Categorical columns (one-hot encoded below)
categorical_cols = ['education', 'self_employed']

# 4. Numeric columns: every remaining feature
numeric_cols = [col for col in X_train_raw.columns if col not in categorical_cols]

# 5. IQR-based outlier removal, with thresholds computed on training data only
numeric_data = X_train_raw[numeric_cols]

Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
train_lower_bound = Q1 - 1.5 * IQR
train_upper_bound = Q3 + 1.5 * IQR

# True for rows where no numeric column falls outside its IQR fence.
# Named `train_inlier_mask` rather than `filter` so the builtin `filter()` is
# not shadowed for the rest of the kernel session.
train_inlier_mask = ~(
    (numeric_data < train_lower_bound) |
    (numeric_data > train_upper_bound)
).any(axis=1)

# The mask carries the original index, so it selects the same rows from X and y.
X_train_filtered = X_train_raw.loc[train_inlier_mask].reset_index(drop=True)
y_train_filtered = y_train.loc[train_inlier_mask].reset_index(drop=True)

# 6. One-hot encode the categorical columns; numeric columns pass through
encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False  # plain column names, no 'cat__' / 'remainder__' prefixes
)

X_train_encoded = encoder.fit_transform(X_train_filtered)

# 7. Take the output column names from the fitted encoder itself, so they
#    cannot drift from the actual column order of `X_train_encoded`.
cat_encoded_cols = encoder.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_columns = list(encoder.get_feature_names_out())

X_train_df = pd.DataFrame(X_train_encoded, columns=all_columns)

# 8. Make sure the numeric columns have a numeric dtype
for col in numeric_cols:
    X_train_df[col] = pd.to_numeric(X_train_df[col])

# 9. Boxplot of all training features after outlier removal and encoding
plt.figure(figsize=(15,8))
sns.boxplot(data=X_train_df)
plt.xticks(rotation=45)
plt.title('Boxplot of Features After Outlier Removal and Encoding')
plt.tight_layout()
plt.show()


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Keep only the categorical columns that are actually present in the data
categorical_cols = ['education', 'self_employed']
categorical_cols = [col for col in categorical_cols if col in X_train_filtered.columns]

# 2. One-hot encode the categorical columns; every other column passes through.
#    Failures are deliberately NOT caught: falling back to the un-encoded frame
#    would leave `encoder` / `all_columns` out of sync with `X_train_df` and
#    only move the error into a later cell.
encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False  # Cleaner feature names
)

X_train_encoded = encoder.fit_transform(X_train_filtered)

# 3. Column names come from the fitted encoder, so they always match the
#    column order of `X_train_encoded` (also when no categorical column exists).
numeric_cols = [col for col in X_train_filtered.columns if col not in categorical_cols]
all_columns = list(encoder.get_feature_names_out())
cat_encoded_cols = [name for name in all_columns if name not in numeric_cols]

# 4. Rebuild a DataFrame from the encoded array
X_train_df = pd.DataFrame(X_train_encoded, columns=all_columns)

# Enforce numeric dtypes. No errors='coerce': a value that cannot be parsed
# should stop the notebook rather than silently become NaN.
for col in numeric_cols:
    if col in X_train_df.columns:
        X_train_df[col] = pd.to_numeric(X_train_df[col])

# 5. Boxplot of all features after preprocessing
plt.figure(figsize=(15, 8))
sns.boxplot(data=X_train_df)
plt.xticks(rotation=45, ha='right')  # Better label alignment
plt.title('Boxplot of Features After Preprocessing')
plt.grid(axis='y', alpha=0.3)  # Add subtle grid
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# 1. Numeric columns we expect to standardize
expected_numeric_cols = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
    'cibil_score', 'residential_assets_value', 'commercial_assets_value',
    'luxury_assets_value', 'bank_asset_value'
]

# 2. Keep only the expected columns that actually exist in the training frame
numeric_cols = [col for col in expected_numeric_cols if col in X_train_df.columns]

# 3. Nothing to scale means the column names are wrong; stop here
if not numeric_cols:
    raise ValueError("No valid numeric columns found for scaling. Check your column names.")

# 4. Fit the scaler on the UNSCALED training values.
#    `X_train_df` is overwritten with scaled values below, so fitting on it
#    would make a second run of this cell fit the scaler on already
#    standardized data (mean 0, std 1) and silently break test-set scaling.
#    `X_train_filtered` holds the same rows in the same order and is never
#    modified, so the cell is safe to re-run.
numeric_data = X_train_filtered[numeric_cols].copy()

scaler = StandardScaler()
scaled_values = scaler.fit_transform(numeric_data)

# 5. Write the standardized values into the encoded training frame.
#    Errors are not caught: an unfitted `scaler` would only fail later, when
#    the test set is transformed.
X_train_df[numeric_cols] = scaled_values

print(f"Successfully scaled columns: {numeric_cols}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Global seaborn look: white grid background, slightly enlarged notebook fonts
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)

# Wide figure so that every feature gets its own readable box
fig = plt.figure(figsize=(16, 8))

# Box appearance: diverging blue-red palette, standard 1.5*IQR whiskers,
# thicker box outlines and small outlier markers
box_style = dict(
    palette="vlag",
    whis=1.5,
    linewidth=1.5,
    fliersize=4,
)
boxplot = sns.boxplot(data=X_train_df, **box_style)  # the Axes that was drawn on

# Tick labels: rotate the feature names and right-align them under their box
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
boxplot.set_ylabel("Scaled Values", fontsize=13, labelpad=10)

# Title on the axes, grey subtitle on the figure
boxplot.set_title(
    "Distribution of Features After Standard Scaling",
    fontsize=16,
    pad=20,
)
fig.suptitle(
    "Showing median, quartiles, and outliers for all processed features",
    y=0.95,
    fontsize=12,
    color='gray',
)

# Dashed horizontal guide lines
boxplot.grid(axis='y', alpha=0.4, linestyle='--')

# Fit everything into the figure before the footnote is placed below the axes
fig.tight_layout()

# Footnote stating how many numeric features were standardized
if len(numeric_cols) > 0:
    boxplot.annotate(
        f"Note: {len(numeric_cols)} numeric features were standardized",
        xy=(0.5, -0.15),
        xycoords='axes fraction',
        ha='center',
        fontsize=11,
        color='dimgray',
    )

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 9. Training features plus the (index-aligned) target, for a correlation check
df_corr = X_train_df.assign(loan_status=y_train_filtered)

# 10. Pairwise correlations, drawn as an annotated heatmap centred on zero
corr_matrix = df_corr.corr()

heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'coolwarm',
    'center': 0,
}

plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation matrix including Target (loan_status)')
plt.show()

In [None]:
 # Summarise dtypes and non-null counts of the held-out features before preprocessing.
 # NOTE(review): this cell is indented by one stray space; IPython accepts that,
 # but a plain-Python export of the notebook would likely not.
 X_test_raw.info()

In [None]:
# 1. Remove 'loan_id' if present (same as training)
X_test_raw = X_test_raw.drop(columns=['loan_id'], errors='ignore')

# 2. Apply the training outlier filter to the test rows.
#    The columns are taken from the thresholds themselves (`Q1.index`) rather
#    than from the global `numeric_cols`, which several cells reassign. The
#    comparison below requires identical labels on both sides (pandas 2 raises
#    "Operands are not aligned" otherwise).
# NOTE(review): dropping test rows changes the population the model is
# evaluated on; confirm this is intended rather than scoring the full test set.
outlier_cols = list(Q1.index)
test_numeric_data = X_test_raw[outlier_cols]
test_lower_bound = Q1 - 1.5 * IQR
test_upper_bound = Q3 + 1.5 * IQR

test_filter = ~(
    (test_numeric_data < test_lower_bound) |
    (test_numeric_data > test_upper_bound)
).any(axis=1)

# The mask carries the original index, so it selects the same rows from X and y.
X_test_filtered = X_test_raw.loc[test_filter].reset_index(drop=True)
y_test_filtered = y_test.loc[test_filter].reset_index(drop=True)

# 3. Apply the trained encoder to test data (transform only, no fit!)
X_test_encoded = encoder.transform(X_test_filtered)

# 4. Create DataFrame with the same columns as training
X_test_df = pd.DataFrame(X_test_encoded, columns=all_columns)

# 5. Columns to scale: exactly those the scaler was fitted on, in fit order.
#    Falls back to `numeric_cols` if the scaler did not record feature names.
scaled_cols = list(getattr(scaler, 'feature_names_in_', numeric_cols))

# Enforce numeric dtypes. No errors='coerce': a value that cannot be parsed
# should stop the notebook rather than silently become NaN.
for col in scaled_cols:
    X_test_df[col] = pd.to_numeric(X_test_df[col])

# 6. Apply the trained scaler to test data (transform only, no fit!)
X_test_df[scaled_cols] = scaler.transform(X_test_df[scaled_cols])

# 7. (Optional) Visualize test data distributions
plt.figure(figsize=(15,8))
sns.boxplot(data=X_test_df)
plt.xticks(rotation=45)
plt.title('Boxplot of Test Data After Preprocessing')
plt.tight_layout()
plt.show()

In [None]:
# 1. Test features plus the target. `.values` assigns by position, so the
#    target does not depend on index alignment.
df_corr = X_test_df.copy()
df_corr['loan_status'] = y_test_filtered.values

# 2. Correlation matrix over the numeric columns only.
#    Selecting by dtype works on every pandas version, so no try/except
#    fallback is needed. The previous fallback also overwrote the global
#    `numeric_cols`, which other cells rely on.
corr_matrix = df_corr.select_dtypes(include=['number']).corr()

# Absolute correlation of each feature with the target, strongest first
has_target = 'loan_status' in corr_matrix.columns
if has_target:
    target_corrs = corr_matrix['loan_status'].drop('loan_status')
    ranked_target_corrs = target_corrs.abs().sort_values(ascending=False)
    top_features = ranked_target_corrs.head(3).index

# 3. Annotated heatmap on a fixed [-1, 1] colour scale centred on zero
plt.figure(figsize=(14, 12))
heatmap = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    linecolor='lightgray',
    cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'}
)

# 4. Title and tick labels
plt.title(
    'Test Set Feature Correlations with Target (loan_status)\n',
    fontsize=16,
    pad=20
)
plt.xticks(
    rotation=45,
    ha='right',
    fontsize=10
)
plt.yticks(
    rotation=0,
    fontsize=10
)

# 5. Outline, in the target column, the three features most strongly
#    correlated with the target (by absolute value)
if has_target:
    target_col_pos = corr_matrix.columns.get_loc('loan_status')
    for feature in top_features:
        row_pos = corr_matrix.index.get_loc(feature)
        heatmap.add_patch(plt.Rectangle(
            (target_col_pos, row_pos),
            1, 1, fill=False, edgecolor='gold', lw=2
        ))

# 6. Adjust layout and display
plt.tight_layout()
plt.show()

# Optional: print the five strongest absolute correlations with the target
if has_target:
    print("\nTop correlations with loan_status:")
    print(ranked_target_corrs.head(5))

In [None]:
# Signed correlation of every numeric column with the target, strongest positive first
print(df_corr.corr(numeric_only=True)['loan_status'].sort_values(ascending=False))

# Distribution of the CIBIL score for each loan outcome.
# `df_corr` holds the standardized features and the 0/1-encoded target, so the
# axes are labelled accordingly; the values are z-scores, not raw CIBIL points.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(data=df_corr, x='loan_status', y='cibil_score')
ax.set_xlabel('loan_status (0 = rejected, 1 = approved)')
ax.set_ylabel('cibil_score (standardized)')
plt.title('Loan Status vs. CIBIL Score')
plt.show()