In [1]:
# Импортируем необходимые библиотеки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

In [2]:
# Коллекция оценок
def score_metrics(y_pred, y_true):
    print("ACCURACY: {}".format(accuracy_score(y_true, y_pred)))
    print("PRECISION: {}".format(precision_score(y_true, y_pred, labels=np.unique(y_true))))
    print("RECALL: {}".format(recall_score(y_true, y_pred, labels=np.unique(y_true))))
    print("CONFUSION MATRIX \n{}".format(confusion_matrix(y_true, y_pred)))

In [3]:
# Считывание данных
path = r"path"
train_data = pd.read_csv(path + "\\credit_train.csv")
additional = pd.read_csv(path + "\\additional_credit_train_wo_target.csv")
test_data = pd.read_csv(path + "\\credit_test.csv")
full_train_data = pd.concat([train_data, additional], ignore_index=True)

In [4]:
train_data

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,52372,M,38,MAR,SPC,1216800,10,1.1,0308454,GRD,ОБЛ ВОЛОГОДСКАЯ,25000.0,0.0,0.0,0
1,75213,F,36,MAR,SPC,1773800,10,1.1,0498147,GRD,ОБЛ БЕЛГОРОДСКАЯ,25000.0,4.0,0.0,0
2,119931,M,23,UNM,SPC,2648900,10,1.1,0431980,SCH,ЧЕЛЯБИНСКАЯ ОБЛ,25000.0,,,0
3,134365,M,24,UNM,SPC,2037900,12,1.6,0770546,GRD,РЕСП КАРАЧАЕВО-ЧЕРКЕССКАЯ,50000.0,,,1
4,138695,F,47,MAR,SPC,1201900,10,1.6,0617275,SCH,УЛЬЯНОВСКАЯ ОБЛ,18000.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136591,167442,F,55,MAR,WRP,3473200,6,1.1,0328384,UGR,АО ХАНТЫ-МАНСИЙСКИЙ АВТОНОМНЫЙ ОКРУГ - Ю,35000.0,4.0,0.0,0
136592,134582,M,30,UNM,SPC,2299000,7,1.6,0278594,SCH,ОБЛ МОСКОВСКАЯ,30000.0,1.0,0.0,0
136593,170180,F,36,DIV,SPC,1892900,10,1.1,0425852,GRD,ОМСКАЯ ОБЛАСТЬ,25000.0,0.0,0.0,0
136594,59295,F,32,MAR,UMN,1511280,10,1.2,0419264,GRD,САНКТ-ПЕТЕРБУРГ,50000.0,5.0,0.0,0


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34150 entries, 0 to 34149
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   client_id             34150 non-null  int64  
 1   gender                34150 non-null  object 
 2   age                   34150 non-null  int64  
 3   marital_status        34150 non-null  object 
 4   job_position          34150 non-null  object 
 5   credit_sum            34150 non-null  object 
 6   credit_month          34150 non-null  int64  
 7   tariff_id             34150 non-null  float64
 8   score_shk             34150 non-null  object 
 9   education             34150 non-null  object 
 10  living_region         34112 non-null  object 
 11  monthly_income        34150 non-null  float64
 12  credit_count          32285 non-null  float64
 13  overdue_credit_count  32285 non-null  float64
dtypes: float64(4), int64(3), object(7)
memory usage: 3.6+ MB


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136596 entries, 0 to 136595
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             136596 non-null  int64  
 1   gender                136596 non-null  object 
 2   age                   136596 non-null  int64  
 3   marital_status        136596 non-null  object 
 4   job_position          136596 non-null  object 
 5   credit_sum            136596 non-null  object 
 6   credit_month          136596 non-null  int64  
 7   tariff_id             136596 non-null  float64
 8   score_shk             136596 non-null  object 
 9   education             136596 non-null  object 
 10  living_region         136442 non-null  object 
 11  monthly_income        136595 non-null  float64
 12  credit_count          129231 non-null  float64
 13  overdue_credit_count  129231 non-null  float64
 14  open_account_flg      136596 non-null  int64  
dtype

In [7]:
# Предобработка данных:
# Заполнение пропусков
# Модели лучше работают с числовыми данными, поэтому нужно осуществить приведение типов

full_train_data.gender.replace({"F": 0, "M": 1}, inplace=True)
test_data.gender.replace({"F": 0, "M": 1}, inplace=True)


mar_stat = np.unique(full_train_data.marital_status)
mar_stat_dct = dict(zip(mar_stat, range(mar_stat.shape[0])))
full_train_data.marital_status.replace(mar_stat_dct, inplace=True)
test_data.marital_status.replace(mar_stat_dct, inplace=True)


job = np.unique(full_train_data.job_position)
job_dct = dict(zip(job, range(job.shape[0])))
full_train_data.job_position.replace(job_dct, inplace=True)
test_data.job_position.replace(job_dct, inplace=True)


full_train_data[["credit_sum"]] = full_train_data[["credit_sum"]].apply(lambda x: x.str.replace(',','.')).astype(np.float64)
credit_sum_mean = full_train_data.credit_sum.mean()
credit_sum_std = full_train_data.credit_sum.std() + 1e-7
test_data[["credit_sum"]] = test_data[["credit_sum"]].apply(lambda x: x.str.replace(',','.')).astype(np.float64)
full_train_data.credit_sum = (full_train_data.credit_sum.values - credit_sum_mean) / credit_sum_std
test_data.credit_sum = (test_data.credit_sum.values - credit_sum_mean) / credit_sum_std


credit_month_mean = full_train_data.credit_month.mean()
credit_month_std = full_train_data.credit_month.std()
full_train_data.credit_month = (full_train_data.credit_month.values - credit_month_mean) / credit_month_std
test_data.credit_month = (test_data.credit_month.values - credit_month_mean) / credit_month_std


full_train_data[["score_shk"]] = full_train_data[["score_shk"]].apply(lambda x: x.str.replace(',','.')).astype(np.float64)
test_data[["score_shk"]] = test_data[["score_shk"]].apply(lambda x: x.str.replace(',','.')).astype(np.float64)


edu = dict(zip(np.unique(full_train_data.education), range(full_train_data.education.unique().shape[0])))
full_train_data.education.replace(edu, inplace=True)
test_data.education.replace(edu, inplace=True)


reg = full_train_data.living_region.value_counts().index.values
reg_dct = dict(zip(reg, range(reg.shape[0])))
full_train_data.living_region.replace(reg_dct, inplace=True)
test_data.living_region.replace(reg_dct, inplace=True)
mode_reg = full_train_data.living_region.value_counts().index[0]
full_train_data.living_region.fillna(mode_reg, inplace=True)
test_data.living_region.fillna(mode_reg, inplace=True)


mon_income_mean = full_train_data.monthly_income.mean()
full_train_data.monthly_income.fillna(mon_income_mean, inplace=True)
test_data.monthly_income.fillna(mon_income_mean, inplace=True)
mon_income_mean = full_train_data.monthly_income.mean()
mon_income_std = full_train_data.monthly_income.std()
full_train_data.monthly_income = (full_train_data.monthly_income.values - mon_income_mean) / mon_income_std
test_data.monthly_income = (test_data.monthly_income.values - mon_income_mean) / mon_income_std

mode_credit_count = full_train_data.credit_count.value_counts().index[0]
full_train_data.credit_count.fillna(mode_credit_count, inplace=True)
test_data.credit_count.fillna(mode_credit_count, inplace=True)

mode_overdue_credit_count = full_train_data.overdue_credit_count.value_counts().index[0]
full_train_data.overdue_credit_count.fillna(mode_overdue_credit_count, inplace=True)
test_data.overdue_credit_count.fillna(mode_overdue_credit_count, inplace=True)

In [8]:
# Проверка на отсутствующие значения
print("TRAIN NA: \n{}".format(np.sum(full_train_data.isna())), end='\n\n')
print("TEST NA: \n{}".format(np.sum(test_data.isna())))

TRAIN NA: 
client_id                   0
gender                      0
age                         0
marital_status              0
job_position                0
credit_sum                  0
credit_month                0
tariff_id                   0
score_shk                   0
education                   0
living_region               0
monthly_income              0
credit_count                0
overdue_credit_count        0
open_account_flg        91940
dtype: int64

TEST NA: 
client_id               0
gender                  0
age                     0
marital_status          0
job_position            0
credit_sum              0
credit_month            0
tariff_id               0
score_shk               0
education               0
living_region           0
monthly_income          0
credit_count            0
overdue_credit_count    0
dtype: int64


In [9]:
# Работаем с данными, которые без целевого значения
no_target = full_train_data.iloc[136596:, 1:-1]

In [10]:
# Используем K-Means - алгоритм кластеризации
tmp_res = []
for n in [7, 10, 30, 50, 75]:
    k_means = KMeans(n_clusters=2, n_init=n)
    k_means_targets = k_means.fit_predict(no_target.values)
    tmp_res.append(k_means_targets)
    print("{}:\n{}".format(n, k_means_targets.sum()))

7:
19681
10:
19681
30:
19681
50:
19681
75:
19681


In [11]:
# Заполним неизвестные таргеты средним относительно предсказания K_Means
full_train_data.iloc[136596:, -1] = np.round(np.sum(np.array(tmp_res[:]), axis=0) / 5)

In [12]:
full_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228536 entries, 0 to 228535
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             228536 non-null  int64  
 1   gender                228536 non-null  int64  
 2   age                   228536 non-null  int64  
 3   marital_status        228536 non-null  int64  
 4   job_position          228536 non-null  int64  
 5   credit_sum            228536 non-null  float64
 6   credit_month          228536 non-null  float64
 7   tariff_id             228536 non-null  float64
 8   score_shk             228536 non-null  float64
 9   education             228536 non-null  int64  
 10  living_region         228536 non-null  float64
 11  monthly_income        228536 non-null  float64
 12  credit_count          228536 non-null  float64
 13  overdue_credit_count  228536 non-null  float64
 14  open_account_flg      228536 non-null  float64
dtype

In [13]:
X_train_full = full_train_data.drop(["client_id", "living_region", "open_account_flg"], axis=1).values
y_train_full = full_train_data.values[:, -1]
y_train_full[y_train_full == 0] = -1
X_test = test_data.values[:, 1:]

In [14]:
# попробуем только данные с файла credit_train.csv
X_train, X_val, y_train, y_val = train_test_split(X_train_full[:136596], y_train_full[:136596], test_size=0.2) # Без дополнения

In [15]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((109276, 12), (27320, 12), (109276,), (27320,))

In [16]:
# добавления 1, чтобы разделяющая плоскость не крутилась вокруг наачала координат
X_b = np.c_[np.ones(shape=(X_train.shape[0], 1)), X_train]
X_val = np.c_[np.ones(shape=(X_val.shape[0], 1)), X_val]

In [17]:
def custom_svm(X_train, y_train):
    """
    Алгоритм бинарной классификации с помощью опорных векторов.
    Основная идея: Алгоритм найдёт такую разделительную плоскость, что расстояние между плоскостью и опорными векторами
    будет максимальным."""
    weights = np.random.randn(X_b.shape[1], 1) # рандомная инициализация весов
    C = 10 # параметр регуляризации, от переобучения
    epochs = 100
    t = 1
    errors = []
    b_error = float('inf')
    b_weights = np.NaN
    for epoch in range(1, epochs + 1):
        eta = t / (epoch + 1e-7)
        #eta = 0.01
        hinge_error = 0
        grad = np.zeros((X_b.shape[1], 1))
        for point, target in zip(X_b, y_train):
            hinge_error += max(0, (1 - point.dot(weights) * target)) + weights.T.dot(weights) / 2 / C # петлевая функция потерь
            #hinge_error += C * max(0, (1 - point.dot(weights) * target))
            grad += (weights / C - point.reshape(-1, 1) * target) if point.dot(weights) * target < 1 else weights / C #градиент
            #grad += (- C * point.reshape(-1, 1) * target) if point.dot(weights) * target < 1 else 0
        if hinge_error < b_error:
            b_error = hinge_error
            b_weights = weights.copy()
        errors.append(hinge_error) # Для визуализации графика потерь
        weights -= eta * grad / X_b.shape[0] # оптимизация весов при помощи градиентного спуска
        print(f"Epoch: {epoch} --- Error: {hinge_error / X_b.shape[0]}")
    print("ACCURACY: {}".format(np.sum(np.sign(X_b.dot(weights)).flatten() == y_train)/X_b.shape[0] * 100) + '%')
    return b_weights

In [18]:
weights = custom_svm(X_b, y_train)

Epoch: 1 --- Error: [[5.3524106]]
Epoch: 2 --- Error: [[197.01827209]]
Epoch: 3 --- Error: [[74.2914833]]
Epoch: 4 --- Error: [[54.62454781]]
Epoch: 5 --- Error: [[40.78290014]]
Epoch: 6 --- Error: [[30.22967747]]
Epoch: 7 --- Error: [[21.76936254]]
Epoch: 8 --- Error: [[14.74895518]]
Epoch: 9 --- Error: [[8.77498782]]
Epoch: 10 --- Error: [[3.59310108]]
Epoch: 11 --- Error: [[7.18082818]]
Epoch: 12 --- Error: [[18.93822273]]
Epoch: 13 --- Error: [[14.87769824]]
Epoch: 14 --- Error: [[11.19054476]]
Epoch: 15 --- Error: [[7.81830065]]
Epoch: 16 --- Error: [[4.71493412]]
Epoch: 17 --- Error: [[1.84359707]]
Epoch: 18 --- Error: [[6.39480744]]
Epoch: 19 --- Error: [[11.24179025]]
Epoch: 20 --- Error: [[8.75409707]]
Epoch: 21 --- Error: [[6.41529151]]
Epoch: 22 --- Error: [[4.20981311]]
Epoch: 23 --- Error: [[2.12435958]]
Epoch: 24 --- Error: [[1.83098916]]
Epoch: 25 --- Error: [[7.6675304]]
Epoch: 26 --- Error: [[5.80422323]]
Epoch: 27 --- Error: [[4.02674919]]
Epoch: 28 --- Error: [[2.328

In [19]:
val_pred = np.sign(X_val.dot(weights)).flatten()
print("VAL_ACCURACY: {}".format(np.sum(val_pred == y_val)/X_val.shape[0] * 100) + '%')

VAL_ACCURACY: 81.0285505124451%


In [20]:
score_metrics(val_pred, y_val)

ACCURACY: 0.810285505124451
PRECISION: 0.25720384204909286
RECALL: 0.05097292724196278
CONFUSION MATRIX 
[[21896   696]
 [ 4487   241]]


In [21]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2) # c дополнениями

In [22]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((182828, 12), (45708, 12), (182828,), (45708,))

In [23]:
X_b = np.c_[np.ones(shape=(X_train.shape[0], 1)), X_train]
X_val = np.c_[np.ones(shape=(X_val.shape[0], 1)), X_val]

In [24]:
weights = custom_svm(X_b, y_train)

Epoch: 1 --- Error: [[20.74676198]]
Epoch: 2 --- Error: [[274.18198454]]
Epoch: 3 --- Error: [[221.13545874]]
Epoch: 4 --- Error: [[188.94634258]]
Epoch: 5 --- Error: [[166.29154692]]
Epoch: 6 --- Error: [[149.01895706]]
Epoch: 7 --- Error: [[135.17185514]]
Epoch: 8 --- Error: [[123.68146772]]
Epoch: 9 --- Error: [[113.90380157]]
Epoch: 10 --- Error: [[105.42254356]]
Epoch: 11 --- Error: [[97.95392404]]
Epoch: 12 --- Error: [[91.29634406]]
Epoch: 13 --- Error: [[85.30173752]]
Epoch: 14 --- Error: [[79.85834359]]
Epoch: 15 --- Error: [[74.87985458]]
Epoch: 16 --- Error: [[70.29831312]]
Epoch: 17 --- Error: [[66.05931047]]
Epoch: 18 --- Error: [[62.11864917]]
Epoch: 19 --- Error: [[58.43996708]]
Epoch: 20 --- Error: [[54.99300997]]
Epoch: 21 --- Error: [[51.7523521]]
Epoch: 22 --- Error: [[48.69643286]]
Epoch: 23 --- Error: [[45.80682053]]
Epoch: 24 --- Error: [[43.06764222]]
Epoch: 25 --- Error: [[40.46513697]]
Epoch: 26 --- Error: [[37.9873017]]
Epoch: 27 --- Error: [[35.62360776]]
Epo

In [25]:
val_pred = np.sign(X_val.dot(weights)).flatten()
print("VAL_ACCURACY: {}".format(np.sum(val_pred == y_val)/X_val.shape[0] * 100) + '%')

VAL_ACCURACY: 71.32449461800998%


In [26]:
score_metrics(val_pred, y_val)

ACCURACY: 0.7132449461800998
PRECISION: 0.20231525104320905
RECALL: 0.17307692307692307
CONFUSION MATRIX 
[[31098  5926]
 [ 7181  1503]]


In [27]:
svm_mod = SVC(C=0.1)
svm_mod.fit(X_b, y_train)
val_pred = svm_mod.predict(X_val)
score_metrics(val_pred, y_val)

ACCURACY: 0.8100113765642776
PRECISION: 0.0
RECALL: 0.0
CONFUSION MATRIX 
[[37024     0]
 [ 8684     0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
rnd_clf = RandomForestClassifier(300, criterion='entropy')
rnd_clf.fit(X_b, y_train)
val_pred = rnd_clf.predict(X_val)
score_metrics(val_pred, y_val)

ACCURACY: 0.8064452612234182
PRECISION: 0.3961783439490446
RECALL: 0.035812989405803776
CONFUSION MATRIX 
[[36550   474]
 [ 8373   311]]
