In [94]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump
import numpy as np
# Show floats with three decimal places whenever pandas renders a frame/series.
pd.options.display.float_format = lambda value: "%0.3f" % value
# Ask NumPy to print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)

def print_line():
    """Print a separator line made of 15 dashes to stdout."""
    separator = "-" * 15
    print(separator)

In [95]:
# Read the preprocessed dataset (CSV content, file has no extension) into a frame.
data = pd.read_csv(filepath_or_buffer="Data/processed_data")
# Bare last expression: the notebook renders the (truncated) frame as cell output.
data

Unnamed: 0,Montly income (RUB),int_rate,loan_amount (RUB),other loans,is_loss,MORTGAGE,OTHER,OWN,RENT,Debt consolidation,car,credit card,house,major purchase,other,small business,vacation
0,38022,0.153,200000,4,1,0,0,0,1,0,1,0,0,0,0,0,0
1,60836,0.186,240000,4,0,0,0,0,1,0,1,0,0,0,0,0,0
2,63371,0.160,960000,11,1,0,0,0,1,0,1,0,0,0,0,0,0
3,53231,0.106,360000,9,0,1,0,0,0,0,1,0,0,0,0,0,0
4,105196,0.060,280000,28,0,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38568,126742,0.130,1940000,33,0,1,0,0,0,0,0,0,0,0,1,0,0
38569,63371,0.135,2016000,18,0,0,0,0,1,0,0,0,0,0,1,0,0
38570,82382,0.175,2000000,20,0,0,0,0,1,0,0,0,0,0,1,0,0
38571,466413,0.182,1920000,9,0,0,0,0,1,0,0,0,0,0,1,0,0


In [96]:
# Separate the predictors from the target column "is_loss".
X = data.drop(columns="is_loss").reset_index(drop=True)
Y = data["is_loss"].reset_index(drop=True)

# 75/25 split, stratified on the target and seeded for reproducibility.
# train_test_split shuffles rows, so every part gets a fresh 0..n-1 index.
X_train, X_test, Y_train, Y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)
)

# The test target is kept as a one-column DataFrame (the train target stays a Series).
Y_test = Y_test.to_frame()
print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\nY_train: {Y_train.shape}\nY_test: {Y_test.shape}")

X_train: (28929, 16)
X_test: (9644, 16)
Y_train: (28929,)
Y_test: (9644, 1)


In [97]:
# Per-column minimum and maximum of the training features (pandas Series).
# NOTE(review): these names shadow the builtins `min` / `max` for the rest of
# the notebook; later cells read them under exactly these names, so they are
# kept as-is here.
min, max = X_train.min(), X_train.max()

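Преобразуем числовые данные в единый масштаб (нормализация). Расчёт выполняется долго, поэтому код ниже закомментирован, а готовый результат загружается из сохранённых файлов.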

In [98]:
# Disabled one-off step: min-max scaling of the first four columns of X_train and
# X_test, using the training `min` / `max` for both, then writing the results to
# Data/X_train_norm and Data/X_test_norm.
# NOTE(review): `min` / `max` are presumably the per-column Series computed from
# X_train, so the lambda performs Series arithmetic for every single value and
# `[col]` then selects one column of the expanded result — likely why this is slow.
# A vectorized form would be (frame[col] - min[col]) / (max[col] - min[col]);
# verify it reproduces the saved files before swapping it in.
# X_train_norm = X_train.copy()
# X_test_norm = X_test.copy()
# for col in X_train.columns[:4]:
#     X_train_norm[col] = X_train_norm[col].apply(lambda val: (val - min) / (max - min))[col]
#     X_test_norm[col] = X_test_norm[col].apply(lambda val: (val - min) / (max - min))[col]
# X_train_norm.to_csv("Data/X_train_norm", index= False)
# X_test_norm.to_csv("Data/X_test_norm", index= False)

In [99]:
# Load the previously saved normalized train/test feature frames from disk.
X_train_norm, X_test_norm = map(pd.read_csv, ("Data/X_train_norm", "Data/X_test_norm"))

In [146]:
def get_metric_results(learned_model, metric, X=None, y=None):
    """Score a fitted model with a metric function.

    Parameters
    ----------
    learned_model : object with a ``predict(X)`` method
        The already fitted estimator.
    metric : callable
        Called as ``metric(y_true, y_pred)``, e.g. ``recall_score``.
    X : optional
        Features to predict on. When omitted, the notebook-level
        ``X_test_norm`` is used (the original behaviour).
    y : optional
        True targets matching ``X``. When omitted, the notebook-level
        ``Y_test`` is used (the original behaviour).

    Returns
    -------
    Whatever ``metric`` returns for the true targets and the model's predictions.
    """
    # Globals are looked up only when needed, so the helper can also be used
    # on other data (e.g. the training set) without touching the test frames.
    features = X_test_norm if X is None else X
    targets = Y_test if y is None else y
    return metric(targets, learned_model.predict(features))

# --- Train a class-weighted random forest and report recall / precision ---

# "balanced" weights are n_samples / (n_classes * count(class)), so the rarer
# class receives the larger weight.
classes = np.unique(Y_train)
class_weights = compute_class_weight("balanced", classes=classes, y=Y_train)
print_line()
print(class_weights)

model = RandomForestClassifier(
    n_estimators=150,
    max_depth=5,
    random_state=42,
    # Key each weight by its actual class label instead of by list position,
    # so the mapping stays right even if the labels are not exactly 0..k-1.
    class_weight=dict(zip(classes.tolist(), class_weights)),
)
model.fit(X_train_norm, Y_train)

prediction_train = model.predict(X_train_norm)
recall_train = recall_score(Y_train, prediction_train)
# Predict the test set once and reuse the result for every test metric.
prediction_test = model.predict(X_test_norm)

# NOTE(review): plot_tree draws a single decision tree; to use the snippet below
# with the forest, pass one member such as model.estimators_[0] — confirm intent.
# plt.figure(figsize=(15, 10))
# plot_tree(model, feature_names=X_train.columns, filled=True, rounded=True)
# plt.title("Получившееся дерево: ")
# plt.show()
print(f"Recall на тренировочных данных: {recall_train}")
print_line()
print(f"Recall на тестовых данных: {recall_score(Y_test, prediction_test)}")
print_line()
print(f"Precision на тесте: {precision_score(Y_test, prediction_test)}")

# One hand-written sample, in the same column order as the training frame.
# Its four numeric values already lie in [0, 1], i.e. on the min-max scale of
# X_train_norm (raw incomes and loan amounts are in the tens of thousands and
# above), so it must NOT be scaled a second time with the raw training min/max:
# doing so pushes the point far away from anything the model was fitted on.
# If raw values are supplied here instead, scale the first four columns with the
# training min/max before predicting.
features = [0.013008762558239082, 0.2968179447052687, 0.2463768115942029, 0.29411764705882354,
            0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
# Positional shape argument works on every NumPy version (the `shape=` keyword
# is only accepted by recent releases).
features = np.reshape(features, (1, -1))
# Use the column names of the frame the model was fitted on.
sample = pd.DataFrame(features, columns=X_train_norm.columns)
# Computed outside the f-string: a replacement field spanning several lines in a
# single-quoted f-string is a SyntaxError before Python 3.12.
prediction_sample = model.predict(sample)
print_line()
print(f"Предсказание модели для значения 1: {prediction_sample}")

---------------
[0.58022785 3.616125  ]
Recall на тренировочных данных: 0.6995
---------------
Recall на тестовых данных: 0.6894223555888972
---------------
Precision на тесте: 0.20820117806977798
---------------
Предсказание модели для значения 1: [1]


In [130]:
# Put the model's training-set predictions next to the features and the true labels.
prediction = pd.DataFrame(model.predict(X_train_norm), columns=['Prediction'])
results = pd.concat(
    [X_train_norm, prediction["Prediction"], Y_train],
    axis=1,
    join="inner",
).rename(columns={"is_loss": "Real Value"})

# Display only the rows whose true label is 1.
is_real_positive = results["Real Value"] == 1
results.loc[is_real_positive]

Unnamed: 0,Montly income (RUB),int_rate,loan_amount (RUB),other loans,MORTGAGE,OTHER,OWN,RENT,Debt consolidation,car,credit card,house,major purchase,other,small business,vacation,Prediction,Real Value
1,0.009,0.441,0.275,0.271,1,0,0,0,1,0,0,0,0,0,0,0,1,1
7,0.007,0.510,0.043,0.200,1,0,0,0,0,0,0,0,0,1,0,0,1,1
20,0.011,0.491,0.565,0.376,0,0,0,1,0,0,0,0,0,0,1,0,1,1
25,0.002,0.871,0.072,0.035,0,0,1,0,0,0,0,0,0,1,0,0,1,1
30,0.006,0.379,0.203,0.200,0,0,0,1,1,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28921,0.010,0.516,0.217,0.059,0,0,0,1,1,0,0,0,0,0,0,0,1,1
28922,0.006,0.551,0.333,0.094,0,0,0,1,0,0,0,0,0,1,0,0,1,1
28923,0.030,0.606,0.565,0.235,0,0,0,1,0,0,0,0,0,0,1,0,1,1
28925,0.004,0.414,0.275,0.106,0,0,0,1,1,0,0,0,0,0,0,0,1,1


In [102]:
# Disabled: persist the fitted model to disk with joblib's dump.
# NOTE(review): the file name says "KNN", but the model fitted in the training
# cell of this notebook is a RandomForestClassifier — confirm the intended name
# before re-enabling.
# model_path = "KNN.sav"
# dump(model, model_path)