In [66]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from joblib import dump
import numpy as np
# Display configuration: render pandas floats with three decimals and stop
# NumPy from switching to scientific notation when printing arrays.
pd.set_option("display.float_format", lambda value: f"{value:.3f}")
np.set_printoptions(suppress=True)

def print_line():
    """Print a separator made of 15 dashes, followed by a newline, to stdout."""
    separator = "-" * 15
    print(separator)

In [57]:
# Load the processed loan table from disk; the bare name on the last line makes
# the notebook render the frame as the cell output.
data = pd.read_csv(filepath_or_buffer="Data/processed_data")
data

Unnamed: 0,Montly income (RUB),int_rate,loan_amount (RUB),other loans,is_loss,MORTGAGE,OTHER,OWN,RENT,Debt consolidation,car,credit card,house,major purchase,other,small business,vacation
0,38022,0.153,200000,4,1,0,0,0,1,0,1,0,0,0,0,0,0
1,60836,0.186,240000,4,0,0,0,0,1,0,1,0,0,0,0,0,0
2,63371,0.160,960000,11,1,0,0,0,1,0,1,0,0,0,0,0,0
3,53231,0.106,360000,9,0,1,0,0,0,0,1,0,0,0,0,0,0
4,105196,0.060,280000,28,0,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38568,126742,0.130,1940000,33,0,1,0,0,0,0,0,0,0,0,1,0,0
38569,63371,0.135,2016000,18,0,0,0,0,1,0,0,0,0,0,1,0,0
38570,82382,0.175,2000000,20,0,0,0,0,1,0,0,0,0,0,1,0,0
38571,466413,0.182,1920000,9,0,0,0,0,1,0,0,0,0,0,1,0,0


In [58]:
# Separate the target column ("is_loss") from the feature columns, giving both
# a fresh 0..n-1 index.
X = data.drop(columns="is_loss").reset_index(drop=True)
Y = data["is_loss"].reset_index(drop=True)

# Stratified 75/25 split with a fixed seed; every resulting piece is re-indexed
# from zero so that train/test objects can later be aligned positionally.
X_train, X_test, Y_train, Y_test = [
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)
]

# The test target is kept as a one-column DataFrame (column name "is_loss").
Y_test = Y_test.to_frame()
print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\nY_train: {Y_train.shape}\nY_test: {Y_test.shape}")

X_train: (28929, 16)
X_test: (9644, 16)
Y_train: (28929,)
Y_test: (9644, 1)


In [59]:
# Column-wise minimum and maximum of the TRAINING split only; the test split is
# scaled with the same statistics so no test information enters the scaling.
# NOTE: these two names shadow the built-in min()/max() for the rest of the
# kernel session. They are kept as-is because a later cell reads them as
# pandas Series when scaling a hand-written sample.
min = X_train.min()
max = X_train.max()

X_train_norm = X_train.copy()
X_test_norm = X_test.copy()

# Min-max scale the first four columns to the training range.
# (presumably these are the continuous features and the remaining columns are
# 0/1 indicators - confirm against the dataset schema.)
# The arithmetic is done on whole columns at once; the previous per-element
# `apply` built a full Series of every column's scaled value for each row and
# then kept only one column of it, which produced the same numbers at a much
# higher cost.
for col in X_train.columns[:4]:
    col_min = min[col]
    col_range = max[col] - col_min
    X_train_norm[col] = (X_train[col] - col_min) / col_range
    X_test_norm[col] = (X_test[col] - col_min) / col_range

# Persist the scaled splits as CSV without the index column.
X_train_norm.to_csv("Data/X_train_norm", index=False)
X_test_norm.to_csv("Data/X_test_norm", index=False)

In [None]:
def get_metric_results(learned_model, metric):
    """Score a fitted model on the notebook's scaled test split.

    Args:
        learned_model: object exposing ``predict``; it is called with the
            module-level ``X_test_norm``.
        metric: callable invoked as ``metric(Y_test, predictions)``.

    Returns:
        Whatever ``metric`` returns for the module-level ``Y_test`` and the
        model's predictions.
    """
    predictions = learned_model.predict(X_test_norm)
    score = metric(Y_test, predictions)
    return score

# Fit a single-tree random forest on the scaled training split and report the
# recall on the scaled test split.
model = RandomForestClassifier(n_estimators=1, random_state=42)
model.fit(X_train_norm, Y_train)
predict = model.predict(X_test_norm)
recall = recall_score(Y_test, predict)
# Optional tree visualisation, currently disabled.
# NOTE(review): plot_tree presumably needs a single fitted tree (e.g.
# model.estimators_[0]) rather than the forest itself - confirm before enabling.
# plt.figure(figsize=(15, 10))
# plot_tree(model, feature_names=X_train.columns, filled=True, rounded=True)
# plt.title("Получившееся дерево: ")
# plt.show()
print(recall)

# One hand-written sample in raw (unscaled) units, ordered like X_train's columns.
features = [63371,0.1596,960000,11,0,0,0,1,0,1,0,0,0,0,0,0]
if len(features) != len(X_train.columns):
    raise ValueError(
        f"expected {len(X_train.columns)} feature values, got {len(features)}"
    )

# Scale exactly the columns that were scaled for training (the first four),
# using the training min/max Series. Scaling every column would also divide the
# indicator columns by (max - min), which is 0/0 for any column that happens to
# be constant in the training split.
for key, col in enumerate(X_train.columns[:4]):
    features[key] = (features[key] - min[col]) / (max[col] - min[col])

# Positional shape argument: the `shape=` keyword is only accepted by newer
# NumPy releases, the positional form works on all of them.
features = np.reshape(features, (1, -1))
print_line()

# Wrap the row in a DataFrame carrying the training column names so the
# estimator, which was fitted on a DataFrame, receives matching feature names.
sample = pd.DataFrame(features, columns=X_train_norm.columns)
print(f"Предсказание модели для значения 1: {model.predict(sample)}")

0.06376594148537135
---------------
Предсказание модели для значения 1: [0]




In [61]:
# Predict on the SCALED test split: the model was fitted on X_train_norm, so
# feeding it the raw X_test would compare unscaled values against thresholds
# learned on scaled values.
prediction = pd.DataFrame(model.predict(X_test_norm), columns=["Prediction"])

# Show the raw (human-readable) test features next to the prediction and the
# true label. All three objects share a fresh 0..n-1 index, so the column-wise
# concat aligns them row by row.
results = pd.concat([X_test, prediction["Prediction"], Y_test], axis=1, join="inner")
results = results.rename(columns={"is_loss": "Real Value"})

# Display only the rows whose true label is 1.
results[results["Real Value"] == 1]

Unnamed: 0,Montly income (RUB),int_rate,loan_amount (RUB),other loans,MORTGAGE,OTHER,OWN,RENT,Debt consolidation,car,credit card,house,major purchase,other,small business,vacation,Prediction,Real Value
7,64892,0.160,400000,9,0,0,0,1,0,0,0,0,0,1,0,0,0,1
8,85551,0.153,1824000,48,0,0,0,1,1,0,0,0,0,0,0,0,0,1
19,108745,0.124,2720000,31,0,0,1,0,1,0,0,0,0,0,0,0,0,1
27,171102,0.197,2800000,27,1,0,0,0,0,0,0,0,0,1,0,0,0,1
32,95057,0.115,960000,21,0,0,0,1,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9622,101394,0.175,2800000,28,0,0,0,1,0,0,0,0,0,0,1,0,0,1
9624,46387,0.100,800000,25,0,0,0,1,1,0,0,0,0,0,0,0,0,1
9631,73003,0.119,960000,10,0,0,0,1,0,0,0,0,0,1,0,0,0,1
9635,88719,0.110,2464000,33,0,0,0,1,0,0,1,0,0,0,0,0,0,1


In [62]:
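# Saving the fitted model to disk is currently disabled.
# NOTE(review): `model` is a RandomForestClassifier at this point, so the
# "KNN.sav" file name looks stale - rename it before re-enabling.
# model_path = "KNN.sav"
# dump(model, model_path)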