In [1]:
import pandas as pd

In [2]:
# Load the raw loan-approval records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="loan_approval_dataset.csv")

In [3]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# x (features)
# no_of_dependents - number of dependents
# education - education level
# self_employed - whether the applicant is self-employed
# income_annum - annual income
# loan_amount - loan amount
# loan_term - loan term (duration)
# cibil_score - credit score

# y (target)
# loan_status - whether the loan was approved or not

In [5]:
# Summarize column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [6]:
# List the column labels exactly as loaded (the output shows leading spaces on most names).
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [7]:
# Trim surrounding whitespace from every header so columns can be addressed by clean names.
stripped_names = df.columns.str.strip()
df.columns = stripped_names
print(df.columns)

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [8]:
# Text columns to turn into numeric indicator columns.
X_enc = ['education', 'self_employed', 'loan_status']

# One-hot encode as floats; drop_first removes the first category of each column.
categorical_part = df[X_enc]
df_enc = pd.get_dummies(categorical_part, drop_first=True, dtype=float)

# Normalize the generated labels step by step: trim, lowercase, remove spaces.
normalized_names = df_enc.columns.str.strip()
normalized_names = normalized_names.str.lower()
normalized_names = normalized_names.str.replace(' ', '')
df_enc.columns = normalized_names

In [9]:
# Swap the original text columns for their encoded counterparts.
# Note: this cell is not re-runnable as-is, because the dropped columns no longer exist afterwards.
numeric_part = df.drop(columns=['education', 'self_employed', 'loan_status'])
df = pd.concat([numeric_part, df_enc], axis=1)

In [10]:
# Check the column set after the text columns were replaced by their indicator columns.
df.columns

Index(['loan_id', 'no_of_dependents', 'income_annum', 'loan_amount',
       'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'education_notgraduate', 'self_employed_yes', 'loan_status_rejected'],
      dtype='object')

In [11]:
# Predictors used by the model; loan_id and the asset-value columns are left out.
feature_columns = [
    'no_of_dependents',
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'education_notgraduate',
    'self_employed_yes',
]
X = df[feature_columns]

# Target indicator column produced by the one-hot encoding of loan_status.
y = df['loan_status_rejected']

In [12]:
# Inspect the target series.
y

Unnamed: 0,loan_status_rejected
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
4264,1.0
4265,0.0
4266,1.0
4267,0.0


In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Fit the scaler on the training rows only, then reuse those statistics for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Cross-validate a scaler + classifier pipeline on the raw features so each fold fits
# its own scaler on that fold's training part only. Scaling all of X up front would
# leak validation-row statistics into training, and calling scaler.fit_transform(X)
# would also re-fit `scaler`, leaving it out of sync with the already-trained `model`.
cv_scores = cross_val_score(
    make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    X,
    y,
    cv=5,
    scoring='accuracy'
)

print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Test Accuracy: 0.9028103044496487

Confusion Matrix:
 [[497  39]
 [ 44 274]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.93      0.92       536
         1.0       0.88      0.86      0.87       318

    accuracy                           0.90       854
   macro avg       0.90      0.89      0.90       854
weighted avg       0.90      0.90      0.90       854


Cross-Validation Scores: [0.92388759 0.91920375 0.91451991 0.92154567 0.9073857 ]
Mean CV Accuracy: 0.9173085212406413


In [14]:
from sklearn.metrics import roc_curve, roc_auc_score, auc

# predict_proba returns one column per class; column 1 is the second class
# (here the 1.0 label of loan_status_rejected).
class_probabilities = model.predict_proba(X_test_scaled)
y_proba = class_probabilities[:, 1]

# Area under the ROC curve on the held-out rows.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

ROC-AUC Score: 0.9673


In [15]:
# Inspect the feature matrix.
X

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,education_notgraduate,self_employed_yes
0,2,9600000,29900000,12,778,0.0,0.0
1,0,4100000,12200000,8,417,1.0,1.0
2,3,9100000,29700000,20,506,0.0,0.0
3,3,8200000,30700000,8,467,0.0,0.0
4,5,9800000,24200000,20,382,1.0,1.0
...,...,...,...,...,...,...,...
4264,5,1000000,2300000,12,317,0.0,1.0
4265,0,3300000,11300000,20,559,1.0,1.0
4266,2,6500000,23900000,18,457,1.0,0.0
4267,1,4100000,12800000,8,780,1.0,0.0


In [16]:
# Display the feature matrix again (same expression as the previous cell).
X

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,education_notgraduate,self_employed_yes
0,2,9600000,29900000,12,778,0.0,0.0
1,0,4100000,12200000,8,417,1.0,1.0
2,3,9100000,29700000,20,506,0.0,0.0
3,3,8200000,30700000,8,467,0.0,0.0
4,5,9800000,24200000,20,382,1.0,1.0
...,...,...,...,...,...,...,...
4264,5,1000000,2300000,12,317,0.0,1.0
4265,0,3300000,11300000,20,559,1.0,1.0
4266,2,6500000,23900000,18,457,1.0,0.0
4267,1,4100000,12800000,8,780,1.0,0.0


In [17]:
# Display the target series again.
y

Unnamed: 0,loan_status_rejected
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
4264,1.0
4265,0.0
4266,1.0
4267,0.0


In [18]:
# Rebuild df from the model inputs only: the selected features plus the target column.
# This overwrites the earlier df, so columns not present in X or y are gone from here on.
df = pd.concat(objs=[X, y], axis="columns")

In [19]:
# Flip the indicator so that 1 means "graduate", then retire the inverted column.
# The new column is appended at the end of the frame.
df = (
    df
    .assign(education=1 - df['education_notgraduate'])
    .drop(columns='education_notgraduate')
)

In [20]:
# Drop the "_yes" suffix left over from one-hot encoding; values are unchanged.
df = df.rename(columns={'self_employed_yes': 'self_employed'})

In [21]:
# Flip the target so that 1 means "approved", then retire the "rejected" indicator.
# The new column is appended at the end of the frame.
df = (
    df
    .assign(loan_status=1 - df['loan_status_rejected'])
    .drop(columns='loan_status_rejected')
)

In [22]:
# Shorten the dependents column name for the exported file.
df = df.rename(columns={'no_of_dependents': 'dependents'})

In [23]:
# Review the renamed, re-encoded frame.
df

Unnamed: 0,dependents,income_annum,loan_amount,loan_term,cibil_score,self_employed,education,loan_status
0,2,9600000,29900000,12,778,0.0,1.0,1.0
1,0,4100000,12200000,8,417,1.0,0.0,0.0
2,3,9100000,29700000,20,506,0.0,1.0,0.0
3,3,8200000,30700000,8,467,0.0,1.0,0.0
4,5,9800000,24200000,20,382,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
4264,5,1000000,2300000,12,317,1.0,1.0,0.0
4265,0,3300000,11300000,20,559,1.0,0.0,1.0
4266,2,6500000,23900000,18,457,0.0,0.0,0.0
4267,1,4100000,12800000,8,780,0.0,0.0,1.0


In [24]:
# Add a 1-based sequential identifier as the first column.
# Note: insert() mutates df and raises if 'user_id' already exists, so run this cell once.
row_count = len(df)
df.insert(loc=0, column='user_id', value=range(1, row_count + 1))

In [25]:
# Write the preprocessed table to disk without the pandas index column.
df.to_csv(path_or_buf='loan_data_preprocc.csv', index=False)

In [26]:
from joblib import dump, load

# The classifier was fitted on standardized features, so raw inputs must pass through
# the same scaling before prediction. Persist a scaler next to the model; it is fitted
# on X_train here so the saved statistics match the data the model was trained on,
# independent of any later re-fitting of the shared `scaler` object.
dump(StandardScaler().fit(X_train), 'log_reg_scaler.pkl')

# Persist the trained classifier (kept last so the cell still displays ['log_reg.pkl']).
dump(model, 'log_reg.pkl')

['log_reg.pkl']