In [23]:
import kagglehub
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [3]:
path = kagglehub.dataset_download("zhijinzhai/loandata")
print("Dataset path:", path)

Using Colab cache for faster access to the 'loandata' dataset.
Dataset path: /kaggle/input/loandata


In [9]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/loandata/Loan payments data.csv


In [10]:
df = pd.read_csv("/kaggle/input/loandata/Loan payments data.csv")

print(df.head())
print(df.columns)
print(df.info())

       Loan_ID loan_status  Principal  terms effective_date   due_date  \
0  xqd20166231     PAIDOFF       1000     30       9/8/2016  10/7/2016   
1  xqd20168902     PAIDOFF       1000     30       9/8/2016  10/7/2016   
2  xqd20160003     PAIDOFF       1000     30       9/8/2016  10/7/2016   
3  xqd20160004     PAIDOFF       1000     15       9/8/2016  9/22/2016   
4  xqd20160005     PAIDOFF       1000     30       9/9/2016  10/8/2016   

     paid_off_time  past_due_days  age             education  Gender  
0  9/14/2016 19:31            NaN   45  High School or Below    male  
1   10/7/2016 9:00            NaN   50              Bechalor  female  
2  9/25/2016 16:58            NaN   33              Bechalor  female  
3  9/22/2016 20:00            NaN   27               college    male  
4  9/23/2016 21:36            NaN   28               college  female  
Index(['Loan_ID', 'loan_status', 'Principal', 'terms', 'effective_date',
       'due_date', 'paid_off_time', 'past_due_days', 'ag

In [11]:
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

In [14]:
# Preprocessing
numeric_transformer = SimpleImputer(strategy="mean")

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [15]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])
numeric_transformer = SimpleImputer(strategy="mean")

In [19]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_val)

In [24]:
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.96
                    precision    recall  f1-score   support

        COLLECTION       1.00      1.00      1.00        24
COLLECTION_PAIDOFF       1.00      0.83      0.91        24
           PAIDOFF       0.93      1.00      0.96        52

          accuracy                           0.96       100
         macro avg       0.98      0.94      0.96       100
      weighted avg       0.96      0.96      0.96       100

