Decision Trees


Classifying Loan Status Using Decision Trees
    Dataset: 
Lending Club Loan Data
    Preprocessing Steps:
Handle missing values if any.
Encode categorical variables (e.g., one-hot encoding for loan grade, sub-grade, etc.).
Standardize numerical features.
    Task: 
Implement a decision tree classifier to classify loan status and evaluate the model using accuracy and ROC-AUC


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

# Load the dataset
data = pd.read_csv("loans_full_schema.csv")

# Display basic information
print(data.info())

# Drop rows with missing values for simplicity (could also consider imputing)
data = data.dropna()

# Define 'Good' and 'Bad' loan statuses
good_statuses = ['Current', 'Fully Paid']
bad_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'In Grace Period']

# Create binary target variable
data['loan_status_binary'] = data['loan_status'].apply(lambda x: 0 if x in good_statuses else 1)

# Define features and target
features = data.drop(columns=['loan_status', 'loan_status_binary'])
target = data['loan_status_binary']

# Apply one-hot encoding to categorical features
features = pd.get_dummies(features, columns=['sub_grade'])

# Standardize numerical features
scaler = StandardScaler()
scaledFeatures = scaler.fit_transform(features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Train the Decision Tree classifier
dtClassifier = DecisionTreeClassifier(random_state=42)
dtClassifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = dtClassifier.predict(X_test)
y_pred_proba = dtClassifier.predict_proba(X_test)[:, 1]

# Calculate and print accuracy and ROC-AUC scores
accuracyScore = accuracy_score(y_test, y_pred)
rocAuc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy Score: {accuracyScore:.2f}")
print(f"ROC-AUC Score: {rocAuc:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 56 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        10000 non-null  int64  
 1   emp_title                         9167 non-null   object 
 2   emp_length                        9183 non-null   float64
 3   state                             10000 non-null  object 
 4   homeownership                     10000 non-null  object 
 5   annual_income                     10000 non-null  float64
 6   verified_income                   10000 non-null  object 
 7   debt_to_income                    9976 non-null   float64
 8   annual_income_joint               1495 non-null   float64
 9   verification_income_joint         1455 non-null   object 
 10  debt_to_income_joint              1495 non-null   float64
 11  delinq_2y                         10000 non-null  int64  
 12  month

ValueError: could not convert string to float: 'supplies clerk'