In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump
import numpy as np
# Print NumPy arrays without scientific notation and render floats in pandas
# output with exactly three decimal places.
np.set_printoptions(suppress=True)
pd.set_option("display.float_format", "{:.3f}".format)

def print_line(width=15, char="-"):
    """Print a horizontal separator line to stdout.

    Args:
        width: Number of times ``char`` is repeated (default 15).
        char: String used to build the separator (default ``"-"``).
    """
    print(char * width)

In [5]:
# Load the preprocessed loan data set; the bare name on the last line makes the
# notebook render a preview of the frame.
data = pd.read_csv(filepath_or_buffer="Data/processed_data")
data

Unnamed: 0,Montly income (RUB),int_rate,loan_amount (RUB),other loans,is_loss,MORTGAGE,OTHER,OWN,RENT,Debt consolidation,car,credit card,house,major purchase,other,small business,vacation
0,38022,0.153,200000,4,1,0,0,0,1,0,1,0,0,0,0,0,0
1,60836,0.186,240000,4,0,0,0,0,1,0,1,0,0,0,0,0,0
2,63371,0.160,960000,11,1,0,0,0,1,0,1,0,0,0,0,0,0
3,53231,0.106,360000,9,0,1,0,0,0,0,1,0,0,0,0,0,0
4,105196,0.060,280000,28,0,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38568,126742,0.130,1940000,33,0,1,0,0,0,0,0,0,0,0,1,0,0
38569,63371,0.135,2016000,18,0,0,0,0,1,0,0,0,0,0,1,0,0
38570,82382,0.175,2000000,20,0,0,0,0,1,0,0,0,0,0,1,0,0
38571,466413,0.182,1920000,9,0,0,0,0,1,0,0,0,0,0,1,0,0


In [3]:
# Separate the target column ("is_loss") from the feature columns.
X = data.drop(columns="is_loss").reset_index(drop=True)
Y = data["is_loss"].reset_index(drop=True)

# Stratified 75/25 split with a fixed seed; every part gets a fresh 0..n-1 index
# so that later positional concatenation lines up.
X_train, X_test, Y_train, Y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)
)

# Y_test is kept as a one-column DataFrame (column "is_loss"); Y_train stays a Series.
Y_test = Y_test.to_frame()
print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\nY_train: {Y_train.shape}\nY_test: {Y_test.shape}")

X_train: (28929, 16)
X_test: (9644, 16)
Y_train: (28929,)
Y_test: (9644, 1)


In [4]:
# Per-column minima and maxima of the training features, saved to disk without the index.
# NOTE: the names `min` and `max` shadow the Python builtins for the rest of the
# notebook; they are kept because later cells read these exact names.
min, max = X_train.min(), X_train.max()
min.to_csv(path_or_buf="Data/min_vals", index=False)
max.to_csv(path_or_buf="Data/max_vals", index=False)

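Приводим числовые признаки к единому масштабу (min-max нормализация). Расчёт выполняется долго, поэтому код ниже закомментирован, а готовый результат загружается из файлов в следующей ячейке.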

In [5]:
# One-off min-max normalisation of the first four columns (X_train.columns[:4])
# of X_train and X_test, kept commented out because it is slow.
# For every value it evaluates (val - min) / (max - min) against the whole `min` / `max`
# Series and then picks the entry belonging to the current column; `min` and `max`
# are the per-column statistics of X_train computed in an earlier cell.
# The results are written to Data/X_train_norm and Data/X_test_norm.
# X_train_norm = X_train.copy()
# X_test_norm = X_test.copy()
# for col in X_train.columns[:4]:
#     X_train_norm[col] = X_train_norm[col].apply(lambda val: (val - min) / (max - min))[col]
#     X_test_norm[col] = X_test_norm[col].apply(lambda val: (val - min) / (max - min))[col]
# X_train_norm.to_csv("Data/X_train_norm", index= False)
# X_test_norm.to_csv("Data/X_test_norm", index= False)

In [6]:
# Load the cached min-max normalised features. If the cache files are missing
# (for example on a fresh checkout), rebuild them from X_train / X_test so that
# the notebook still runs top to bottom, and write the cache for the next run.
try:
    X_train_norm = pd.read_csv("Data/X_train_norm")
    X_test_norm = pd.read_csv("Data/X_test_norm")
except FileNotFoundError:
    # Only the first four columns are rescaled, using statistics of the training
    # split only; the remaining columns are copied unchanged.
    scaled_cols = X_train.columns[:4]
    col_min = X_train[scaled_cols].min()
    col_range = X_train[scaled_cols].max() - col_min
    X_train_norm = X_train.copy()
    X_test_norm = X_test.copy()
    X_train_norm[scaled_cols] = (X_train[scaled_cols] - col_min) / col_range
    X_test_norm[scaled_cols] = (X_test[scaled_cols] - col_min) / col_range
    X_train_norm.to_csv("Data/X_train_norm", index=False)
    X_test_norm.to_csv("Data/X_test_norm", index=False)

# A cache produced from a different split or feature set would silently misalign
# with Y_train / Y_test, so fail early when the shapes disagree.
if X_train_norm.shape != X_train.shape or X_test_norm.shape != X_test.shape:
    raise ValueError(
        "Cached normalised features do not match the current train/test split; "
        "delete Data/X_train_norm and Data/X_test_norm to rebuild them."
    )

In [7]:
def get_metric_results(learned_model, metric, X=None, y=None):
    """Score a fitted model with the given metric.

    Args:
        learned_model: Object exposing ``predict(X)``.
        metric: Callable invoked as ``metric(y_true, y_pred)``.
        X: Features to predict on. Defaults to the notebook-level ``X_test_norm``.
        y: True labels. Defaults to the notebook-level ``Y_test``.

    Returns:
        Whatever ``metric`` returns for the true labels and the model's predictions.
    """
    # Resolve the defaults at call time so the function follows the current
    # notebook state when no explicit evaluation data is passed.
    if X is None:
        X = X_test_norm
    if y is None:
        y = Y_test
    return metric(y, learned_model.predict(X))

# "balanced" class weights computed from the training labels.
classes = np.unique(Y_train)
class_weights = compute_class_weight("balanced", classes=classes, y=Y_train)
print_line()
print(class_weights)
# Key the weights by the actual class labels instead of by position, so the
# mapping stays correct even when the labels are not exactly 0..k-1.
# n_estimators / max_depth are the values reported by the hyperparameter search
# in this notebook.
model = RandomForestClassifier(n_estimators=106, max_depth=4,
                               random_state=42,
                               class_weight=dict(zip(classes.tolist(), class_weights)))
model.fit(X_train_norm, Y_train)
prediction_train = model.predict(X_train_norm)
recall_train = recall_score(Y_train, prediction_train)
# Predict on the test set once and reuse the result for every test metric.
prediction_test = model.predict(X_test_norm)
# plt.figure(figsize=(15, 10))
# for estimator in model.estimators_:
#     plot_tree(estimator, feature_names=X_train.columns, filled=True, rounded=True)
# plt.show()
print(f"Recall на тренировочных данных: {recall_train}")
print_line()
print(f"Recall на тестовых данных: {recall_score(Y_test, prediction_test)}")
print_line()
print(f"Precision на тесте: {precision_score(Y_test, prediction_test)}")
print_line()
# Label the importances with the columns of the frame the model was fitted on.
feature_importance = pd.DataFrame(
    {'feature': X_train_norm.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)
print(feature_importance)

# Min-max scale one hand-written feature vector with the training statistics
# (`min` / `max` are the per-column Series computed earlier) and shape it as a
# single-row 2-D array.
# NOTE(review): the literal values below already look like they lie in [0, 1];
# scaling them again produces values outside [0, 1] in the recorded output.
# Confirm whether raw (unscaled) feature values were intended here.
features = [0.013008762558239082,0.2968179447052687,0.2463768115942029,0.29411764705882354,0,0,1,0,0,0,0,0,0,1,0,0]
lower = np.asarray(min, dtype=float)
upper = np.asarray(max, dtype=float)
# Keep the strict length check of the original zip(..., strict=True).
if not (len(features) == lower.size == upper.size):
    raise ValueError("features, min and max must have the same length")
# ndarray.reshape with positional arguments works on every NumPy version,
# unlike np.reshape(..., shape=...), which older releases reject.
features = ((np.asarray(features, dtype=float) - lower) / (upper - lower)).reshape(1, -1)
print(features)

---------------
[0.58022785 3.616125  ]
Recall на тренировочных данных: 0.71175
---------------
Recall на тестовых данных: 0.7164291072768192
---------------
Precision на тесте: 0.20675470881143104
---------------
                feature  importance
1              int_rate       0.643
0   Montly income (RUB)       0.117
14       small business       0.088
2     loan_amount (RUB)       0.071
10          credit card       0.030
3           other loans       0.020
12       major purchase       0.010
4              MORTGAGE       0.005
7                  RENT       0.004
9                   car       0.003
8    Debt consolidation       0.002
6                   OWN       0.002
13                other       0.002
15             vacation       0.001
5                 OTHER       0.001
11                house       0.001
[[-0.00066702  1.26561265 -0.01449266 -0.0200692   0.          0.
   1.          0.          0.          0.          0.          0.
   0.          1.          0.          0. 

Подбираем лучшие гиперпараметры модели перебором n_estimators и max_depth по F-мере (β = 1.5). Выполняется ОЧЕНЬ ДОЛГО. Результат: n_estimators = 106, max_depth = 4

In [None]:
# Exhaustive search over n_estimators (10, 13, ..., 148) and max_depth (3..14),
# i.e. 47 * 12 = 564 forest fits, so this cell is very slow.
# Candidates are scored on a validation split carved out of the training data:
# selecting hyperparameters on the test set would leak it into model selection
# and make the reported test metrics optimistic.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train_norm, Y_train, test_size=0.25, random_state=42, stratify=Y_train
)
search_classes = np.unique(Y_fit)
search_class_weight = dict(zip(
    search_classes.tolist(),
    compute_class_weight("balanced", classes=search_classes, y=Y_fit),
))

best_f = 0
# Initialised up front so the final print cannot hit a NameError when no
# candidate scores above zero.
best_estimators = None
best_max_depth = None
for estimators in range(10, 150, 3):
    for depth in range(3, 15):
        test_model = RandomForestClassifier(n_estimators=estimators, max_depth=depth,
                                            random_state=42,
                                            class_weight=search_class_weight)
        test_model.fit(X_fit, Y_fit)
        # beta=1.5 weights recall more heavily than precision.
        f_score = fbeta_score(Y_val, test_model.predict(X_val), beta=1.5)
        if f_score > best_f:
            best_f = f_score
            best_estimators = estimators
            best_max_depth = depth

print(f"Лучший параметр estimators: {best_estimators}\nЛучший параметр max_depth: {best_max_depth}")

In [None]:
# Put the test features, the model's prediction and the true label side by side.
prediction = pd.DataFrame({"Prediction": model.predict(X_test_norm)})
results = pd.concat(
    [X_test_norm, prediction["Prediction"], Y_test],
    axis=1,
    join="inner",
).rename(columns={"is_loss": "Real Value"})
# Show the rows where both the true label and the prediction are 1.
results.loc[results["Real Value"].eq(1) & results["Prediction"].eq(1)]

In [8]:
# Persist the fitted random forest with joblib; the value returned by dump()
# is shown as the cell output.
model_path = "RandomForest.sav"
dump(value=model, filename=model_path)

['RandomForest.sav']