In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
data = pd.read_csv('data1.csv', delimiter=',')
data.head()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Id                                    1350 non-null   int64  
 1   SeriousDlqin2yrs                      1350 non-null   int64  
 2   RevolvingUtilizationOfUnsecuredLines  1350 non-null   float64
 3   age                                   1350 non-null   int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  1350 non-null   int64  
 5   DebtRatio                             1350 non-null   float64
 6   MonthlyIncome                         1094 non-null   float64
 7   NumberOfOpenCreditLinesAndLoans       1350 non-null   int64  
 8   NumberOfTimes90DaysLate               1350 non-null   int64  
 9   NumberRealEstateLoansOrLines          1350 non-null   int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  1350 non-null   int64  
 11  NumberOfDependent

Можно заметить, некоторые люди должны выплачивать долг в несколько раз превышающий их доход.

In [4]:
data.describe()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,1350.0,1350.0,1350.0,1350.0,1350.0,1350.0,1094.0,1350.0,1350.0,1350.0,1350.0,1307.0
mean,675.5,0.06,3.577895,52.048889,0.257778,356.123363,6438.473492,8.434074,0.08,0.986667,0.062222,0.737567
std,389.855743,0.237575,84.914699,15.009875,0.751718,1156.603074,7849.754675,5.129287,0.376634,1.008401,0.306555,1.086949
min,1.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,338.25,0.0,0.03114,40.0,0.0,0.175125,3300.0,5.0,0.0,0.0,0.0,0.0
50%,675.5,0.0,0.156891,52.0,0.0,0.367049,5222.5,8.0,0.0,1.0,0.0,0.0
75%,1012.75,0.0,0.543145,63.0,0.0,0.807001,8055.25,11.0,0.0,2.0,0.0,1.0
max,1350.0,1.0,2340.0,97.0,10.0,15466.0,208333.0,31.0,5.0,8.0,5.0,8.0


In [5]:
data[data['DebtRatio'] > 3500][['SeriousDlqin2yrs','MonthlyIncome']].describe()

Unnamed: 0,SeriousDlqin2yrs,MonthlyIncome
count,35.0,4.0
mean,0.085714,0.0
std,0.284029,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,0.0


Мне кажется, все люди долг которых слишком большой можно убрать из выборки.

Так как у большинства из них не было просрочек, а из-за больших значений им может быть присвоен больший вес.

In [6]:
data[data['DebtRatio'] > 3500].head(35)

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
6,7,0,0.305682,57,0,5710.0,,8,0,3,0,0.0
90,91,0,0.039388,51,0,15466.0,0.0,7,0,0,0,0.0
109,110,0,0.041258,61,0,4739.0,,11,0,4,0,
124,125,0,0.277957,39,0,4090.0,,9,0,2,0,0.0
214,215,1,0.655316,63,0,5223.0,,19,0,2,0,0.0
235,236,0,0.0,66,0,3730.0,,8,0,1,0,0.0
284,285,0,0.260971,57,0,5439.0,,16,0,3,0,
321,322,0,0.042607,58,0,3976.0,,13,0,3,0,0.0
326,327,0,0.013115,43,0,4211.0,0.0,21,0,2,0,2.0
351,352,0,0.0,58,0,8380.0,,20,0,5,0,0.0


In [7]:
data = data.drop(data[data['DebtRatio'] > 3500].index)
data.describe()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,1315.0,1315.0,1315.0,1315.0,1315.0,1315.0,1090.0,1315.0,1315.0,1315.0,1315.0,1275.0
mean,675.285932,0.059316,3.664126,52.012167,0.257795,213.76619,6462.100917,8.36654,0.082129,0.953612,0.062357,0.745098
std,389.403199,0.236304,86.036486,15.11246,0.754291,629.659664,7854.435471,5.089408,0.381388,0.983954,0.308291,1.090906
min,1.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,338.5,0.0,0.03055,40.0,0.0,0.167155,3300.0,5.0,0.0,0.0,0.0,0.0
50%,677.0,0.0,0.155742,52.0,0.0,0.353943,5250.0,8.0,0.0,1.0,0.0,0.0
75%,1012.5,0.0,0.541676,63.0,0.0,0.708352,8089.25,11.0,0.0,2.0,0.0,1.0
max,1350.0,1.0,2340.0,97.0,10.0,3400.0,208333.0,31.0,5.0,8.0,5.0,8.0


In [8]:
data['MonthlyIncome'].fillna(data['MonthlyIncome'].mean(), inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1315 entries, 0 to 1349
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Id                                    1315 non-null   int64  
 1   SeriousDlqin2yrs                      1315 non-null   int64  
 2   RevolvingUtilizationOfUnsecuredLines  1315 non-null   float64
 3   age                                   1315 non-null   int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  1315 non-null   int64  
 5   DebtRatio                             1315 non-null   float64
 6   MonthlyIncome                         1315 non-null   float64
 7   NumberOfOpenCreditLinesAndLoans       1315 non-null   int64  
 8   NumberOfTimes90DaysLate               1315 non-null   int64  
 9   NumberRealEstateLoansOrLines          1315 non-null   int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  1315 non-null   int64  
 11  NumberOfDependent

In [9]:
data['DebtRatio'] = data['DebtRatio'] * data['MonthlyIncome']
data.head()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,7323.197016,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,316.878123,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,258.914887,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,118.963951,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,1584.975094,63588.0,7,0,1,0,0.0


In [21]:
X = data[['age', 'DebtRatio', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfOpenCreditLinesAndLoans']].copy()
y = data['SeriousDlqin2yrs']
for i in ['age', 'DebtRatio', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfOpenCreditLinesAndLoans']:
    maxv = max(X[i])
    X[i] = X[i] / maxv

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
model = RandomForestClassifier(max_depth=3, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
predictionClass = np.array([1 if y_pred[i] > 0.5 else 0 for i in range(y_pred.shape[0])])
print(classification_report(y_test, predictionClass))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       247
           1       0.00      0.00      0.00        16

    accuracy                           0.94       263
   macro avg       0.47      0.50      0.48       263
weighted avg       0.88      0.94      0.91       263



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
predictionClass = np.array([1 if y_pred[i] > 0.1 else 0 for i in range(y_pred.shape[0])])
print(classification_report(y_test, predictionClass))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       247
           1       0.05      0.06      0.05        16

    accuracy                           0.87       263
   macro avg       0.49      0.49      0.49       263
weighted avg       0.88      0.87      0.88       263

