In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('loan_data.csv')

# Imported here so this section is self-contained: the target encoder below
# builds on scikit-learn's estimator base classes.
from sklearn.base import BaseEstimator, TransformerMixin


class MeanTargetEncoder(TransformerMixin, BaseEstimator):
    """Encode each category as the mean target value seen for it during fit.

    The statistics are learned in ``fit`` only, so when this transformer sits
    inside a pipeline it is re-fitted on every training split (hold-out and
    each cross-validation fold) and never sees the labels of the rows it is
    later asked to transform. Categories that were not present during ``fit``
    (including missing values) are encoded with the overall target mean.

    Assumes the target is numeric (e.g. 0/1), which a mean-based encoding
    requires.
    """

    def fit(self, X, y):
        """Learn one category -> mean(y) lookup per column of ``X``."""
        # Align features and target by position: after a random split the
        # incoming index is no longer contiguous.
        frame = pd.DataFrame(X).reset_index(drop=True)
        target = pd.Series(np.asarray(y, dtype=float))
        self.global_mean_ = float(target.mean())
        self.category_means_ = [
            target.groupby(frame.iloc[:, position]).mean()
            for position in range(frame.shape[1])
        ]
        return self

    def transform(self, X):
        """Return a float array with every category replaced by its mean."""
        frame = pd.DataFrame(X)
        encoded = [
            frame.iloc[:, position]
            .map(means)
            .fillna(self.global_mean_)
            .to_numpy(dtype=float)
            for position, means in enumerate(self.category_means_)
        ]
        return np.column_stack(encoded)


# Define feature lists
numerical_features = ['loan_amnt', 'term', 'int_rate', 'emp_length', 'annual_inc', 'dti', 'T']
categorical_features = ['grade', 'home_ownership', 'purpose']

# One-Hot Encoding for 'grade' and 'home_ownership'
one_hot_features = ['grade', 'home_ownership']

# Target Encoding for 'purpose'. The encoding is fitted inside the pipeline
# rather than on the full dataset: computing per-category target means before
# the split leaks test labels into the features.
target_encoded_features = ['purpose']

# Separate features and target; 'purpose' stays in X so the pipeline can
# encode it.
X = data.drop(['loan_status'], axis=1)
y = data['loan_status']

# Preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# ColumnTransformer drops every column that is not listed here, so the
# target-encoded feature has to be registered explicitly to reach the model.
# It is passed through unscaled; tree ensembles do not depend on feature scale.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, one_hot_features),
        ('target', MeanTargetEncoder(), target_encoded_features),
    ])

# Define the model pipeline (seeded so repeated runs give the same forest)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Hyperparameter tuning (example for RandomForest)
from sklearn.model_selection import GridSearchCV

# Candidate settings for the pipeline's 'classifier' step; the
# '<step>__<param>' keys route each value to the random forest.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[10, 20, 30],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('loan_data.csv')

# Imported here so this section runs on its own: OneHotEncoder is used below
# but is not part of this cell's import list, and the target encoder builds
# on scikit-learn's estimator base classes.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder


class MeanTargetEncoder(TransformerMixin, BaseEstimator):
    """Encode each category as the mean target value seen for it during fit.

    The statistics are learned in ``fit`` only, so when this transformer sits
    inside a pipeline it is re-fitted on every training split (hold-out and
    each cross-validation fold) and never sees the labels of the rows it is
    later asked to transform. Categories that were not present during ``fit``
    (including missing values) are encoded with the overall target mean.

    Assumes the target is numeric (e.g. 0/1), which a mean-based encoding
    requires.
    """

    def fit(self, X, y):
        """Learn one category -> mean(y) lookup per column of ``X``."""
        # Align features and target by position: after a random split the
        # incoming index is no longer contiguous.
        frame = pd.DataFrame(X).reset_index(drop=True)
        target = pd.Series(np.asarray(y, dtype=float))
        self.global_mean_ = float(target.mean())
        self.category_means_ = [
            target.groupby(frame.iloc[:, position]).mean()
            for position in range(frame.shape[1])
        ]
        return self

    def transform(self, X):
        """Return a float array with every category replaced by its mean."""
        frame = pd.DataFrame(X)
        encoded = [
            frame.iloc[:, position]
            .map(means)
            .fillna(self.global_mean_)
            .to_numpy(dtype=float)
            for position, means in enumerate(self.category_means_)
        ]
        return np.column_stack(encoded)


# Log transformation for annual_inc to reduce skewness
data['annual_inc_log'] = np.log1p(data['annual_inc'])

# Clamp negative dti values to zero (vectorised; missing values stay missing)
data['dti'] = data['dti'].clip(lower=0)

# Drop the original annual_inc column
data = data.drop(['annual_inc'], axis=1)

# Define feature lists
numerical_features = ['loan_amnt', 'term', 'int_rate', 'emp_length', 'annual_inc_log', 'dti', 'T']
categorical_features = ['grade', 'home_ownership']

# One-Hot Encoding for categorical variables
one_hot_features = ['grade', 'home_ownership']

# Target Encoding for purpose. The encoding is fitted inside the pipeline
# rather than on the full dataset: computing per-category target means before
# the split leaks test labels into the features.
target_encoded_features = ['purpose']

# Separate features and target; 'purpose' stays in X so the pipeline can
# encode it.
X = data.drop(['loan_status'], axis=1)
y = data['loan_status']

# Preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# ColumnTransformer drops every column that is not listed here, so the
# target-encoded feature has to be registered explicitly to reach the model.
# It is passed through unscaled; tree ensembles do not depend on feature scale.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, one_hot_features),
        ('target', MeanTargetEncoder(), target_encoded_features),
    ])

# Define the model pipeline (seeded so repeated runs give the same forest)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Hyperparameter tuning (example for RandomForest)
from sklearn.model_selection import GridSearchCV

# Candidate settings for the pipeline's 'classifier' step; the
# '<step>__<param>' keys route each value to the random forest.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[10, 20, 30],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)
