In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [2]:
sample_data = pd.read_excel('sample_data.xlsx')
sample_data.head()

Unnamed: 0,Application,loan_amount,loan_days,applied_at,gender_id,Unnamed: 5,birth_date,Marital status,children_count_id,education_id,...,closed_at,product_profile_id,credit_policy_id,user_id,face_id,prolongation_number,prolongation_total_days,wizard_type_id,step,created_at
0,1,3000,30,2021-02-01 00:21:56,1,,1995-02-03,2,1,5,...,2021-03-01 16:17:00,14,31,253430,,,,2,3,2021-02-01 00:20:01
1,2,1000,7,2021-02-01 00:24:08,1,,1984-01-19,2,2,4,...,2021-02-07 15:30:00,12,38,109618,,,,2,3,2021-02-01 00:20:04
2,3,1000,3,2021-02-01 00:36:35,2,,1994-08-02,2,1,4,...,2021-02-05 18:26:00,14,39,289130,,,,5,7,2021-02-01 00:22:13
3,4,1600,30,2021-02-01 00:34:22,1,,1992-11-12,1,1,3,...,2021-02-18 18:42:00,18,38,141625,226701.0,,,2,3,2021-02-01 00:22:15
4,5,2500,18,2021-02-01 23:22:57,2,,1997-10-22,1,1,5,...,2021-05-23 18:29:00,14,39,204249,,1.0,20.0,5,7,2021-02-01 00:23:03


In [3]:
sample_data.duplicated().sum()

0

In [4]:
current_year = datetime.now().year
sample_data.loc[:, 'age'] = current_year - pd.DatetimeIndex(sample_data['birth_date']).year
sample_dat = sample_data.drop(columns='birth_date')
sample_data['age'].head()

0    29
1    40
2    30
3    32
4    27
Name: age, dtype: int32

In [5]:
features = sample_data[
    ['loan_amount', 'loan_days', 'gender_id', 'children_count_id',
     'monthly_income', 'has_immovables', 'other_loans_active', 'income_frequency_id', 'seniority_years', 'age']].copy()
features.head()

Unnamed: 0,loan_amount,loan_days,gender_id,children_count_id,monthly_income,has_immovables,other_loans_active,income_frequency_id,seniority_years,age
0,3000,30,1,1,15000,0,1,2,5,29
1,1000,7,1,2,11000,0,2,2,8,40
2,1000,3,2,1,10000,0,0,2,5,30
3,1600,30,1,1,8000,0,0,2,3,32
4,2500,18,2,1,9000,1,2,2,1,27


In [6]:
features.isna().sum()

loan_amount            0
loan_days              0
gender_id              0
children_count_id      0
monthly_income         0
has_immovables         0
other_loans_active     0
income_frequency_id    0
seniority_years        0
age                    0
dtype: int64

In [7]:
sample_data['closed_in_time'] = ((sample_data['loan_closed'] == 1) & (sample_data['loan_overdue'] == 0)).astype(int)
target = sample_data['closed_in_time']
target.head()

0    1
1    1
2    0
3    1
4    0
Name: closed_in_time, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42, stratify=target)

In [9]:
decision_tree_gcv = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced'),
    param_grid={
        'max_depth': range(5, 11),
        "min_samples_split": [2,5,7,10],
        "min_samples_leaf": [1,2,5]
    },
    n_jobs=-1,
    cv=5,
    verbose=1
).fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [10]:
decision_tree_gcv.best_params_

{'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 10}

In [11]:
predictions = decision_tree_gcv.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.25      0.35      0.29        37
           1       0.75      0.65      0.70       113

    accuracy                           0.57       150
   macro avg       0.50      0.50      0.49       150
weighted avg       0.63      0.57      0.60       150


In [12]:
decision_tree_gcv.score(X_train, y_train)

0.7564469914040115

In [17]:
sample_data['loan_to_income_ratio'] = sample_data['loan_amount'] / sample_data['monthly_income']

In [18]:
clustering_features = ['loan_to_income_ratio', 'loan_days', 'age']
clustering_data = sample_data[clustering_features].copy()
clustering_data.head()

Unnamed: 0,loan_to_income_ratio,loan_days,age
0,0.2,30,29
1,0.090909,7,40
2,0.1,3,30
3,0.2,30,32
4,0.277778,18,27


In [19]:
scaler = StandardScaler()
clustering_data_scaled = scaler.fit_transform(clustering_data)

In [20]:
dbscan = DBSCAN()
clusters = dbscan.fit_predict(clustering_data_scaled)
sample_data['suspicious'] = clusters == -1
sample_data[sample_data['suspicious']].loc[:, clustering_features]

Unnamed: 0,loan_to_income_ratio,loan_days,age
77,200.0,15,62
111,0.188889,4,63


In [21]:
clustering_data.describe()

Unnamed: 0,loan_to_income_ratio,loan_days,age
count,499.0,499.0,499.0
mean,0.63014,22.124248,35.160321
std,8.945472,9.15159,8.29128
min,0.015789,3.0,23.0
25%,0.1,15.0,29.0
50%,0.166667,28.0,33.0
75%,0.3,30.0,39.0
max,200.0,30.0,65.0
