## 1. Binning (Discretization)

Convert continuous variables into categories.

Example:

Age → Group into "young", "middle-aged", "senior".

Income → Low, medium, high income bins.

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Map the binned (string-labelled) groups to integer codes; one encoder per
# column so each mapping can be inverted independently later.
le_age = LabelEncoder().fit(df['AgeGroup'].astype(str))
df['AgeGroup_enc'] = le_age.transform(df['AgeGroup'].astype(str))

le_income = LabelEncoder().fit(df['IncomeGroup'].astype(str))
df['IncomeGroup_enc'] = le_income.transform(df['IncomeGroup'].astype(str))

# Feature matrix (encoded bins + two raw numeric columns) and target
X = df[['AgeGroup_enc', 'IncomeGroup_enc', 'LoanAmount', 'CreditScore']]
y = df['Default']   # <-- replace with your actual target column

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a random forest on the training rows
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy (with binned features): {accuracy:.4f}")


Model Accuracy (with binned features): 0.8664


## 2. Addressing Multicollinearity

Check if numerical features are highly correlated with each other.

Use Variance Inflation Factor (VIF) or correlation matrix.

Drop or combine highly correlated features to avoid redundancy.

In [25]:
# Feature matrix: every int64/float64 column of df except the target.
# Built from the data itself (not from the VIF table!).
# NOTE(review): other cells add numeric columns to df (e.g. the *_enc columns),
# so the exact column set here depends on which cells have already run.
X_all = df.select_dtypes(include=['int64', 'float64']).drop('Default', axis=1)
y = df['Default']   # target


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 80/20 hold-out split over the full numeric feature set (fixed seed)
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_all, y, test_size=0.2, random_state=42
)

# Random forest trained on all numeric features, evaluated on the held-out rows
model_all = RandomForestClassifier(random_state=42).fit(X_train_a, y_train_a)
y_pred_a = model_all.predict(X_test_a)
acc_all = accuracy_score(y_test_a, y_pred_a)

print("Accuracy with all numeric features:", round(acc_all, 4))


Accuracy with all numeric features: 0.8861


## 3. Target Encoding with Caution

Encode categorical variables based on the target distribution.

Example: Encode LoanPurpose by calculating the default rate for each purpose.

Important: Apply using proper cross-validation to avoid data leakage.

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features (numeric/derived columns + the target-encoded LoanPurpose column)
X = df[['LoanAmount', 'Income', 'LoanToIncome',
        'MonthsEmployed', 'CreditScore', 'EmpScore', 'LoanPurpose_encoded']]
y = df['Default']   # replace with your actual target column

# Train-test split. The raw LoanPurpose labels are split together with X and y so
# the same rows land in each partition and the encoding can be re-learned from
# the training rows only.
X_train, X_test, y_train, y_test, purpose_train, purpose_test = train_test_split(
    X, y, df['LoanPurpose'], test_size=0.2, random_state=42
)

# Leakage guard: df['LoanPurpose_encoded'] is computed outside this cell. If it was
# derived from the Default values of the *whole* dataset, the test rows' targets
# have leaked into the feature. Re-derive the encoding here from the training
# partition alone and overwrite the column in both partitions.
# (Training rows are still encoded with a rate that includes their own target;
# use out-of-fold encoding if that matters for your use case.)
purpose_default_rate = y_train.groupby(purpose_train.to_numpy()).mean()
train_default_rate = y_train.mean()  # fallback for purposes unseen in training

X_train = X_train.assign(
    LoanPurpose_encoded=purpose_train.map(purpose_default_rate)
    .astype(float)
    .fillna(train_default_rate)
    .to_numpy()
)
X_test = X_test.assign(
    LoanPurpose_encoded=purpose_test.map(purpose_default_rate)
    .astype(float)
    .fillna(train_default_rate)
    .to_numpy()
)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy (with target encoding): {accuracy:.4f}")


Model Accuracy (with target encoding): 0.8837


## 4. Handling Rare Categories

Combine rare categories into an "Other" group in categorical variables to avoid sparse representations.

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Integer-encode the LoanPurpose column in which rare categories were merged
le = LabelEncoder().fit(df['LoanPurpose_mod'])
df['LoanPurpose_enc'] = le.transform(df['LoanPurpose_mod'])

# Feature matrix and target
X = df[[
    'LoanAmount', 'Income', 'LoanToIncome', 'MonthsEmployed',
    'CreditScore', 'EmpScore', 'LoanPurpose_enc',
]]
y = df['Default']   # replace with your actual target

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the forest on the training rows
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy (with rare categories combined): {accuracy:.4f}")


Model Accuracy (with rare categories combined): 0.8837


## 5. Robust Scaling

For datasets with many outliers, use RobustScaler, which uses median and IQR instead of mean and standard deviation.

In [11]:
from sklearn.preprocessing import RobustScaler

# Columns to rescale with RobustScaler
features_to_scale = ['Income', 'LoanAmount', 'CreditScore', 'InterestRate', 'DTIRatio']

# Fit on the selected columns, then write the scaled values back into df.
# NOTE: this overwrites the original values of these columns in df.
scaler = RobustScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

# Preview the scaled columns
df[features_to_scale].head()


Unnamed: 0,Income,LoanAmount,CreditScore,InterestRate,DTIRatio
0,0.052349,-0.626635,-0.196364,0.154181,-0.15
1,-0.475328,-0.025369,-0.421818,-0.753484,0.45
2,0.025848,0.013287,-0.447273,0.671603,-0.475
3,-0.753084,-0.673758,0.614545,-0.55662,-0.675
4,-0.9204,-0.96408,0.214545,-0.605401,0.575


## 6. Synthetic Feature Creation

Derive interaction terms like Income / LoanAmount or MonthsEmployed * CreditScore.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Model without synthetic features (baseline for the comparison in the next cell)

# Define the target here instead of relying on a `y` left over from another cell,
# so this cell also works on a fresh kernel / out-of-order run.
y = df['Default']

X_base = df[['LoanAmount', 'Income', 'MonthsEmployed', 'CreditScore']]
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_base, y, test_size=0.2, random_state=42
)

# Fit on the training rows and score the held-out 20%
model_base = RandomForestClassifier(random_state=42)
model_base.fit(X_train_b, y_train_b)
y_pred_b = model_base.predict(X_test_b)
accuracy_base = accuracy_score(y_test_b, y_pred_b)



In [21]:

# Same setup as the baseline, plus the derived LoanToIncome and EmpScore columns
X_synth = df[[
    'LoanAmount', 'Income', 'LoanToIncome',
    'MonthsEmployed', 'CreditScore', 'EmpScore',
]]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_synth, y, test_size=0.2, random_state=42
)

# Fit on the training rows and score the held-out 20%
model_synth = RandomForestClassifier(random_state=42).fit(X_train_s, y_train_s)
y_pred_s = model_synth.predict(X_test_s)
accuracy_synth = accuracy_score(y_test_s, y_pred_s)

# Side-by-side comparison with the baseline from the previous cell
print(f"Accuracy without synthetic features: {accuracy_base:.4f}")
print(f"Accuracy with synthetic features:    {accuracy_synth:.4f}")

Accuracy without synthetic features: 0.8827
Accuracy with synthetic features:    0.8827


## 7. Pipeline Automation with ColumnTransformer

Automate preprocessing by applying different transformations to numerical and categorical columns without manual steps.

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Column groups: each group gets its own preprocessing step
numeric_features = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'InterestRate', 'LoanTerm', 'DTIRatio']
categorical_features = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Standardise the numeric columns; one-hot encode the categorical ones
# (handle_unknown='ignore' is set on the encoder for categories unseen at fit time)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Preprocessing and classifier chained into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Example usage: fit on the full dataset (no hold-out split in this cell)
X = df[numeric_features + categorical_features]
y = df['Default']

pipeline.fit(X, y)


In [17]:
# Accuracy of the fitted pipeline on the same X, y it was trained on
# (a training score, not a held-out estimate)
training_accuracy = pipeline.score(X, y)
print("Training Accuracy:", training_accuracy)


Training Accuracy: 0.8850740365071843
