In [None]:
#Steps:

# 1. -> Data Collection: You'll need a dataset containing customer details (like income, age, loan amount, etc.) and 
#a label indicating whether the customer is a good or bad credit risk (often binary: 1 for good credit, 0 for bad credit).

# 2. ->Data Preprocessing: Prepare the dataset by handling missing values, encoding categorical features, and splitting it into training and test sets.

# 3. ->Model Building: Train machine learning models such as Logistic Regression, Decision Trees, and Random Forests.

# 4. ->Evaluation: Evaluate the model’s performance using accuracy, precision, recall, and AUC score.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

# Step 1: Load dataset
# Example: 'credit_data.csv' contains features like age, income, loan_amount, credit_history, etc.
df = pd.read_csv(r"C:\Users\kc\Downloads\loan_data_set.csv")

# Step 2: Preprocessing
# Drop missing values (if any) or you can use imputation techniques
df.dropna(inplace=True)

# Separate features and target variable
X = df.drop(columns=['Credit_History'])  # features (e.g., income, age, loan_amount, credit_history, etc.)
y = df['Credit_History']  # target (1 for good credit, 0 for bad credit)

# Convert categorical data to numeric using one-hot encoding (if needed)
X = pd.get_dummies(X, drop_first=True)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the feature data (scaling)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Model Building

# Logistic Regression Model
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

# Decision Tree Model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Step 4: Model Evaluation

# Function to evaluate models
def evaluate_model(model_name, y_test, y_pred):
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"AUC Score: {roc_auc_score(y_test, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Evaluate Logistic Regression
evaluate_model("Logistic Regression", y_test, log_pred)

# Evaluate Decision Tree
evaluate_model("Decision Tree", y_test, tree_pred)

# Evaluate Random Forest
evaluate_model("Random Forest", y_test, rf_pred)

# Step 5: Feature Importance (for Random Forest)
feature_importance = rf_model.feature_importances_
feature_names = X.columns
important_features = pd.Series(feature_importance, index=feature_names).sort_values(ascending=False)
print("Feature Importance in Random Forest:")
print(important_features)

# Step 6: Save the model for future use (optional)
import joblib
joblib.dump(rf_model, 'credit_scoring_rf_model.pkl')

Model: Logistic Regression
Accuracy: 0.8958
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        15
         1.0       0.90      1.00      0.95       129

    accuracy                           0.90       144
   macro avg       0.45      0.50      0.47       144
weighted avg       0.80      0.90      0.85       144

AUC Score: 0.5000
Confusion Matrix:
[[  0  15]
 [  0 129]]
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8819
Classification Report:
              precision    recall  f1-score   support

         0.0       0.44      0.53      0.48        15
         1.0       0.94      0.92      0.93       129

    accuracy                           0.88       144
   macro avg       0.69      0.73      0.71       144
weighted avg       0.89      0.88      0.89       144

AUC Score: 0.7279
Confusion Matrix:
[[  8   7]
 [ 10 119]]
-------------------------------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['credit_scoring_rf_model.pkl']

In [None]:
### *Explanation of Code:*

#1. *Data Preprocessing:*
 #  - The dataset is loaded and cleaned (removing missing values).
   #- The target variable (credit_score) is binary (1 for good credit and 0 for bad credit).
   #- Categorical variables are one-hot encoded, and numeric variables are standardized using StandardScaler to normalize the data.
   #- The data is split into training and test sets using train_test_split().

#2. *Model Building:*
   #- Three models are built: Logistic Regression, Decision Tree, and Random Forest.
  # - Each model is trained on the training data and predictions are made on the test data.

#3. *Model Evaluation:*
  # - The evaluate_model() function computes key metrics like accuracy, classification report (precision, recall, F1-score), AUC score,
  #     and a confusion matrix for each model.
  # - This allows you to compare the models and choose the best one based on performance.

#4. *Feature Importance (Random Forest):*
  # - For Random Forest, feature importance is extracted to understand which features contribute most to predicting creditworthiness.

#5. *Model Saving:*
   #- The Random Forest model is saved using joblib.dump() so it can be reused without retraining.


In [None]:
#Here's a detailed explanation of the output:

### **Logistic Regression Model**:
#1. **Accuracy**: The model achieved 89.58% accuracy, meaning it correctly predicted the credit history (good/bad) in about 90% of the cases.
   
#2. **Classification Report**:
#   - **Precision, Recall, F1-Score** for each class (0: bad credit, 1: good credit):
#     - For class `0` (bad credit), precision, recall, and f1-score are all `0`. This means the model failed to predict any 
 #      `bad credit` cases (all `15` actual bad credit cases were incorrectly predicted as `good credit`).
 #    - For class `1` (good credit), precision is `0.90`, recall is `1.00`, and the f1-score is `0.95`.
 #        This means most `good credit` cases were correctly predicted.
 #  - **Support** refers to the number of instances of each class (15 for bad credit and 129 for good credit).

#3. **AUC Score**: The AUC score is `0.50`, which suggests that the model is no better than random guessing at
#       distinguishing between the two classes (since `0.5` is the baseline for a random classifier).
   
#4. **Confusion Matrix**: 
#   - Out of 15 bad credit cases, the model predicted 0 correctly.
#   - For the 129 good credit cases, the model correctly predicted all 129.
   
### **Decision Tree Model**:
#1. **Accuracy**: The model achieved 88.19% accuracy.
   
#2. **Classification Report**:
 #  - **Class 0** (bad credit): Precision is `0.44`, recall is `0.53`, and f1-score is `0.48`, 
 #   indicating that the model detected some bad credit cases, but it's still far from ideal.
  # - **Class 1** (good credit): Precision is `0.94`, recall is `0.92`, and f1-score is `0.93`,
   #    meaning the model performed well for good credit cases.
   
#3. **AUC Score**: The AUC score is `0.7279`, indicating that the decision tree is better at distinguishing between
  #         the two classes than the logistic regression model.

#4. **Confusion Matrix**:
 #  - Out of 15 bad credit cases, the model predicted 8 correctly.
 #  - For the 129 good credit cases, it predicted 119 correctly.

### **Random Forest Model**:
#1. **Accuracy**: The model achieved 88.89% accuracy.
   
#2. **Classification Report**:
#   - **Class 0** (bad credit): Precision, recall, and f1-score are all `0`, meaning that, like the logistic regression model,
 #          the random forest model failed to predict any bad credit cases.
  # - **Class 1** (good credit): Precision is `0.90`, recall is `0.99`, and f1-score is `0.94`, showing that it predicted good credit cases very well.
   
#3. **AUC Score**: The AUC score is `0.4961`, which, like the logistic regression model, 
#     indicates the model is not able to differentiate between the two classes (worse than the decision tree).

#4. **Confusion Matrix**:
 #  - Out of 15 bad credit cases, the model predicted none correctly.
  # - For the 129 good credit cases, the model predicted 128 correctly.

### **Feature Importance (Random Forest)**:
#The feature importance ranking shows which features were most significant in the random forest model's decision-making process. The top 3 features are:
#   - `Loan_Status_Y` (probably indicating whether the loan was approved),
 #  - `ApplicantIncome` (the applicant's income),
  # - `LoanAmount` (the loan amount requested).
   
#Many other features, like individual loan IDs (e.g., `Loan_ID_LP002205`), were deemed unimportant, with an importance value of `0.000000`.

### **Conclusion**:
#- **Logistic Regression** and **Random Forest** models have a high accuracy but struggle to predict the minority class (`bad credit` cases) effectively.
#- **Decision Tree** is better at identifying both good and bad credit cases, as reflected in its higher AUC score and more balanced confusion matrix.
#- **Feature Importance** shows that certain variables like loan status and income have a large influence on the model's predictions.
