In [None]:
!unzip drive/MyDrive/default+of+credit+card+clients.zip -d data

Archive:  drive/MyDrive/default+of+credit+card+clients.zip
 extracting: data/default of credit card clients.xls  


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the spreadsheet extracted by the unzip cell. The path is relative to the
# working directory, i.e. the same base the unzip cell extracts into ("data"),
# so both cells agree on where the file lives.
# skiprows=1 drops the first spreadsheet row before the header is read.
df = pd.read_excel('data/default of credit card clients.xls', skiprows=1)

In [None]:
# Columns that get standardised: credit limit, age and the six monthly
# BILL_AMT / PAY_AMT columns.
columns_for_normalize = (
    ["LIMIT_BAL", "AGE"]
    + [f"BILL_AMT{month}" for month in range(1, 7)]
    + [f"PAY_AMT{month}" for month in range(1, 7)]
)

scaller = StandardScaler()

# Build the working frame without the "ID" column; `df` itself stays untouched.
df_norm = df.loc[:, df.columns != "ID"].copy()
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook.
df_norm[columns_for_normalize] = scaller.fit_transform(df[columns_for_normalize])

In [None]:
# Hand-written feature list: LIMIT_BAL, AGE, PAY_0 and the six monthly
# BILL_AMT / PAY_AMT columns. A later cell reassigns `feautures` from the
# decision-tree importance ranking.
feautures = (
    ["LIMIT_BAL", "AGE", "PAY_0"]
    + [f"BILL_AMT{month}" for month in range(1, 7)]
    + [f"PAY_AMT{month}" for month in range(1, 7)]
)

In [None]:
def library_calculation(data):
    """Rank the columns of `data` by decision-tree feature importance.

    A ``DecisionTreeClassifier(random_state=42)`` is fitted on every column of
    `data` except "default payment next month", which serves as the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature columns and the
        "default payment next month" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "Importance", sorted by importance in
        descending order.
    """
    target = "default payment next month"
    predictors = data.loc[:, data.columns != target]
    labels = data[target]

    tree = DecisionTreeClassifier(random_state=42).fit(predictors, labels)

    ranking = pd.DataFrame(
        {"Feature": predictors.columns, "Importance": tree.feature_importances_}
    )
    return ranking.sort_values(by="Importance", ascending=False)

In [None]:
# Keep the names of the 15 columns ranked highest by the decision tree.
# NOTE(review): the ranking is computed on all of df_norm, before the
# train/test split done later in the notebook.
feautures = list(library_calculation(df_norm)["Feature"].iloc[:15])
feautures

['PAY_0',
 'AGE',
 'BILL_AMT1',
 'LIMIT_BAL',
 'BILL_AMT6',
 'BILL_AMT2',
 'PAY_AMT3',
 'PAY_AMT1',
 'BILL_AMT3',
 'PAY_AMT5',
 'PAY_AMT2',
 'PAY_AMT6',
 'BILL_AMT5',
 'BILL_AMT4',
 'PAY_AMT4']

In [None]:
def calculate_gain_ratio(X, y, target_column="default payment next month"):
    """Compute the gain ratio of every feature column of `X` with respect to `y`.

    Every distinct value of a column is treated as one branch of a split::

        gain_ratio = (H(y) - H(y | feature)) / split_info(feature)

    where ``H`` is the Shannon entropy in bits and ``split_info`` is the
    entropy of the branch-size distribution. Entropies are computed exactly
    from count tables (0 * log2(0) is taken as 0); no epsilon is added inside
    the logarithm, so a pure subset has entropy exactly 0 and a constant
    feature has split info exactly 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A column named `target_column` is ignored if present.
    y : array-like of shape (n_samples,)
        Class labels, matched to the rows of `X` by position.
    target_column : str, default "default payment next month"
        Name of a label column to exclude from `X`.

    Returns
    -------
    dict
        Maps each feature name to its gain ratio (float). The ratio is 0.0
        when the split info is zero (e.g. a constant column).

    Raises
    ------
    ValueError
        If `X` and `y` do not contain the same number of rows.

    Notes
    -----
    Rows whose feature value is missing (NaN) belong to no branch; they still
    count towards the total number of samples used for the branch weights.
    """

    def entropy_rows(counts):
        """Return the Shannon entropy (bits) of each row of a 2-D count table."""
        totals = counts.sum(axis=1, keepdims=True)
        # Zero counts produce 0 * log2(0); silence the warnings and map those
        # cells to 0 explicitly.
        with np.errstate(divide="ignore", invalid="ignore"):
            probabilities = counts / totals
            terms = np.where(counts > 0, probabilities * np.log2(probabilities), 0.0)
        return -terms.sum(axis=1)

    features = X.loc[:, X.columns != target_column]
    labels = np.asarray(y).ravel()
    n_samples = labels.shape[0]
    if len(features) != n_samples:
        raise ValueError(
            f"X has {len(features)} rows but y has {n_samples} labels"
        )

    # Encode the labels once as integer class codes 0..n_classes-1.
    classes, class_codes = np.unique(labels, return_inverse=True)
    n_classes = classes.shape[0]
    class_counts = np.bincount(class_codes, minlength=n_classes)
    total_entropy = float(entropy_rows(class_counts[np.newaxis, :])[0])

    gain_ratios = {}
    # Iterate by position so duplicated column labels cannot yield a 2-D slice.
    for position, feature in enumerate(features.columns):
        column = features.iloc[:, position].to_numpy()
        present = ~pd.isna(column)

        values, value_codes = np.unique(column[present], return_inverse=True)
        n_values = values.shape[0]

        # Contingency table: rows = feature values, columns = classes.
        table = np.bincount(
            value_codes * n_classes + class_codes[present],
            minlength=n_values * n_classes,
        ).reshape(n_values, n_classes)

        weights = table.sum(axis=1) / n_samples
        conditional_entropy = float(np.sum(weights * entropy_rows(table)))
        split_info = float(-np.sum(weights * np.log2(weights)))
        information_gain = total_entropy - conditional_entropy

        if split_info == 0:
            gain_ratios[feature] = 0.0
        else:
            gain_ratios[feature] = information_gain / split_info

    return gain_ratios

In [None]:
# Score every column of df_norm against the label column. The target column is
# passed in as part of X as well; calculate_gain_ratio drops it from X itself.
# Each distinct value of a column is treated as its own branch, which includes
# the standardised continuous columns.
gain_ratios = calculate_gain_ratio(df_norm, df_norm['default payment next month'])

In [None]:
gain_ratios

{'LIMIT_BAL': np.float64(0.004731211965701588),
 'SEX': np.float64(0.0011810595525375365),
 'EDUCATION': np.float64(0.002798633042210645),
 'MARRIAGE': np.float64(0.0008091198463126944),
 'AGE': np.float64(0.0007279586790713947),
 'PAY_0': np.float64(0.052937699626075405),
 'PAY_2': np.float64(0.038402374899222456),
 'PAY_3': np.float64(0.029459342312626334),
 'PAY_4': np.float64(0.026760156434092266),
 'PAY_5': np.float64(0.025486803342993452),
 'PAY_6': np.float64(0.021587253921446052),
 'BILL_AMT1': np.float64(0.04287204548191936),
 'BILL_AMT2': np.float64(0.042841885877182795),
 'BILL_AMT3': np.float64(0.042804017348320296),
 'BILL_AMT4': np.float64(0.04245733698373595),
 'BILL_AMT5': np.float64(0.04193617754100063),
 'BILL_AMT6': np.float64(0.04180148382269046),
 'PAY_AMT1': np.float64(0.021453406714947925),
 'PAY_AMT2': np.float64(0.020471277450910083),
 'PAY_AMT3': np.float64(0.0208783228652973),
 'PAY_AMT4': np.float64(0.019915771020802213),
 'PAY_AMT5': np.float64(0.0199684213

In [None]:
# Baseline: class-weighted logistic regression on the selected features.
# No dummy encoding is applied; the model is fitted on the columns as they are
# (the previous `pd.get_dummies(X)` call discarded its result and did nothing).
X = df_norm[feautures]
X_train, X_test, y_train, y_test = train_test_split(
    X, df_norm['default payment next month'], test_size=0.2, random_state=42
)
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")
print(f'F1 score: {f1_score(y_test, y_pred):.2f}')

Accuracy: 0.69
Recall: 0.64
F1 score: 0.47


In [None]:
class Perceptron:
    """Single-layer perceptron for binary classification with 0/1 labels.

    Training uses the classic perceptron rule: samples are visited one at a
    time in the given order and the weights are adjusted only when a sample is
    misclassified. Training stops early once a full pass makes no mistakes.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every weight/bias correction.
    n_iters : int
        Maximum number of passes over the training data.
    random_state : int or None
        Seed for the random weight initialisation; None gives a different
        initialisation on every fit.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000, random_state=None):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        self.random_state = random_state

    def fit(self, X, y):
        """Learn weights and bias from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples. DataFrames and nested lists are accepted and
            converted to a float array.
        y : array-like of shape (n_samples,)
            Labels (0 or 1), matched to the rows of X by position.

        Raises
        ------
        ValueError
            If X is not two-dimensional or X and y differ in sample count.
        """
        # Convert to plain arrays so iteration goes over rows and labels are
        # looked up by position, regardless of any pandas index.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )

        rgen = np.random.RandomState(self.random_state)
        n_features = X.shape[1]

        # Small random weights, zero bias.
        self.weights = rgen.normal(loc=0.0, scale=0.01, size=n_features)
        self.bias = 0.0

        for _ in range(self.n_iters):
            errors = 0
            for x_i, target in zip(X, y):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_pred = self.step_function(linear_output)

                # Non-zero only when the sample is misclassified.
                update = self.lr * (target - y_pred)
                if update != 0:
                    errors += 1
                    self.weights += update * x_i
                    self.bias += update

            # A pass without mistakes means nothing is left to correct.
            if errors == 0:
                break

    def step_function(self, x):
        """Heaviside step: 1 where x >= 0, otherwise 0."""
        return np.where(x >= 0, 1, 0)

    def predict(self, X):
        """Return the predicted 0/1 label for every row of X."""
        X = np.asarray(X, dtype=float)
        linear_output = np.dot(X, self.weights) + self.bias
        y_pred = self.step_function(linear_output)
        return y_pred


In [None]:
# Plain NumPy copies of the split, used by the hand-written models below.
X_train_np = X_train.values.astype(np.float64)
y_train_np = y_train.values.astype(np.int64).flatten()
X_test_np = X_test.values.astype(np.float64)
y_test_np = y_test.values.astype(np.int64)

# Fix the seed so the random weight initialisation, and therefore the metrics
# printed below, are the same on every run.
perseptron = Perceptron(random_state=42)
perseptron.fit(X_train_np, y_train_np)
y_pred_np = perseptron.predict(X_test_np)

print(f"Accuracy: {accuracy_score(y_test_np, y_pred_np):.2f}")
print(f"Recall: {recall_score(y_test_np, y_pred_np):.2f}")
print(f'F1 score: {f1_score(y_test_np, y_pred_np):.2f}')

Accuracy: 0.68
Recall: 0.13
F1 score: 0.16


In [None]:
class MLP:
    """Fully connected network trained with full-batch gradient descent.

    Hidden layers use leaky ReLU, the output layer uses a sigmoid, and the
    weight updates follow the gradient of the squared error between the
    sigmoid output and the labels.

    Parameters
    ----------
    layer_sizes : sequence of int
        Number of units per layer, input layer first, output layer last.
    learning_rate : float
        Gradient-descent step size.
    epochs : int
        Number of full-batch updates performed by `fit`.
    random_state : int or None
        Seed for the weight initialisation. With None the weights are drawn
        from NumPy's global generator (so `np.random.seed` still applies).
    """

    def __init__(self, layer_sizes, learning_rate=0.01, epochs=1000, random_state=None):
        self.layer_sizes = layer_sizes
        self.lr = learning_rate
        self.epochs = epochs
        self.weights = []
        self.biases = []

        # Keep using the global generator unless a seed is requested.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Scale by sqrt(2 / fan_in) so wider layers start with smaller weights.
            scale = np.sqrt(2 / fan_in)
            self.weights.append(rng.randn(fan_in, fan_out) * scale)
            self.biases.append(np.zeros(fan_out))

    def leaky_relu(self, x, alpha=0.01):
        """Leaky ReLU: x where x > 0, alpha * x elsewhere."""
        return np.where(x > 0, x, alpha * x)

    def leaky_relu_derivative(self, x, alpha=0.01):
        """Derivative of leaky ReLU: 1 where x > 0, alpha elsewhere."""
        return np.where(x > 0, 1, alpha)

    def sigmoid(self, x):
        """Logistic sigmoid, written via tanh so large |x| cannot overflow."""
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def sigmoid_derivative(self, x):
        """Sigmoid derivative expressed in terms of the sigmoid OUTPUT `x`."""
        return x * (1 - x)

    def forward(self, X):
        """Run a forward pass and cache pre-activations and activations.

        Returns the output-layer activations, shape (n_samples, layer_sizes[-1]).
        """
        self.activations = [np.asarray(X, dtype=float)]
        self.z_values = []
        last_layer = len(self.weights) - 1
        for i in range(len(self.weights)):
            z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            self.z_values.append(z)
            activation = self.leaky_relu(z) if i < last_layer else self.sigmoid(z)
            self.activations.append(activation)
        return self.activations[-1]

    def backward(self, y):
        """Back-propagate the error of the last forward pass and update weights.

        `y` holds the labels for the batch last passed to `forward`.
        """
        y = np.asarray(y).reshape(-1, 1)
        m = y.shape[0]
        error = self.activations[-1] - y
        deltas = [error * self.sigmoid_derivative(self.activations[-1])]

        # Walk the hidden layers from last to first. All deltas are computed
        # before any weight is changed.
        for i in reversed(range(len(self.weights) - 1)):
            error = np.dot(deltas[-1], self.weights[i + 1].T)
            # Leaky ReLU keeps the sign of its input, so evaluating the
            # derivative on the pre-activation matches evaluating it on the
            # activation.
            delta = error * self.leaky_relu_derivative(self.z_values[i])
            deltas.append(delta)

        deltas.reverse()

        for i in range(len(self.weights)):
            dW = np.dot(self.activations[i].T, deltas[i]) / m
            db = np.sum(deltas[i], axis=0) / m
            self.weights[i] -= self.lr * dW
            self.biases[i] -= self.lr * db

    def fit(self, X, y):
        """Train for `epochs` full-batch steps, printing the MSE every 100 epochs.

        X and y may be any array-like; they are converted to arrays once.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        for epoch in range(self.epochs):
            y_pred = self.forward(X)
            self.backward(y)
            if epoch % 100 == 0:
                # Loss of the predictions made before this epoch's update.
                loss = np.mean((y_pred - y.reshape(-1, 1))**2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions of shape (n_samples, layer_sizes[-1])."""
        return (self.forward(X) >= threshold).astype(int)

In [None]:
# MLP draws its initial weights from NumPy's global generator; seed it so the
# training log and the metrics below are the same on every run.
np.random.seed(42)
mlp = MLP(layer_sizes=[len(feautures), 4, 1], learning_rate=0.1)

mlp.fit(X_train_np, y_train_np)
# predict() returns a column vector of shape (n, 1); flatten it to the 1-D
# shape of y_test_np before passing it to the metric functions.
y_pred_np = mlp.predict(X_test_np).ravel()

print(f"Accuracy: {accuracy_score(y_test_np, y_pred_np):.2f}")
print(f"Recall: {recall_score(y_test_np, y_pred_np):.2f}")
print(f'F1 score: {f1_score(y_test_np, y_pred_np):.2f}')

Epoch 0, Loss: 0.3726
Epoch 100, Loss: 0.2077
Epoch 200, Loss: 0.1875
Epoch 300, Loss: 0.1735
Epoch 400, Loss: 0.1631
Epoch 500, Loss: 0.1554
Epoch 600, Loss: 0.1506
Epoch 700, Loss: 0.1481
Epoch 800, Loss: 0.1468
Epoch 900, Loss: 0.1460
Accuracy: 0.82
Recall: 0.31
F1 score: 0.43


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
def create_mlp(input_shape, num_classes=1):
    """Build and compile a small Keras MLP classifier.

    Architecture: Dense(input_shape, relu) -> BatchNorm -> Dropout(0.3)
    -> Dense(4, relu) -> BatchNorm -> Dropout(0.2) -> output layer.

    Parameters
    ----------
    input_shape : int
        Number of input features; also used as the width of the first hidden
        layer.
    num_classes : int, default 1
        Number of output units. 1 gives a sigmoid output trained with binary
        cross-entropy; any other value gives a softmax output trained with
        categorical cross-entropy.

    Returns
    -------
    A compiled Sequential model (Adam, learning rate 0.001) that tracks the
    'accuracy' and 'Recall' metrics.
    """
    # Local import: the notebook's import cell does not bring in `Input`.
    from tensorflow.keras import Input

    model = Sequential([
        # Declare the input with an explicit Input layer; passing
        # `input_shape=` to the first Dense layer makes Keras emit a warning.
        Input(shape=(input_shape,)),
        Dense(input_shape, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(4, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(num_classes, activation='sigmoid' if num_classes == 1 else 'softmax')
    ])

    # Compile the model
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy' if num_classes == 1 else 'categorical_crossentropy'
    metrics = ['accuracy', 'Recall']

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model


def train_model(model, X_train, y_train, X_val, y_val, epochs=1000,
                batch_size=32, monitor='val_Recall'):
    """Fit `model` with early stopping and learning-rate reduction.

    Parameters
    ----------
    model : compiled Keras model
    X_train, y_train : training data
    X_val, y_val : validation data evaluated after every epoch
    epochs : int
        Upper bound on the number of epochs; early stopping may end sooner.
    batch_size : int, default 32
        Mini-batch size. Batches of more than one sample are needed for the
        BatchNormalization layers built by create_mlp to have meaningful
        batch statistics.
    monitor : str, default 'val_Recall'
        Logged quantity watched by EarlyStopping (higher is better). It must
        match the key Keras logs exactly: create_mlp compiles the model with
        the metric name 'Recall', which is logged as 'val_Recall'. With a key
        that is never logged, early stopping cannot trigger.

    Returns
    -------
    The History object returned by `model.fit`.
    """
    callbacks = [
        # Stop when the monitored validation metric has not improved for 10 epochs.
        EarlyStopping(monitor=monitor, patience=10, mode='max', verbose=1),
        # Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
    ]

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1
    )
    return history

# 4. Model evaluation
def evaluate_model(model, X_test, y_test):
    """Print a classification report and the recall of `model` on the test data.

    The outputs of `model.predict` are turned into 0/1 labels with a fixed
    0.5 threshold before being scored against `y_test`.
    """
    scores = model.predict(X_test)
    predicted_labels = (scores > 0.5).astype(int)

    report = classification_report(y_test, predicted_labels)
    print(report)

    test_recall = recall_score(y_test, predicted_labels)
    print(f"Test Recall: {test_recall:.4f}")

In [None]:
model = create_mlp(len(feautures))

# Early stopping and learning-rate scheduling are driven by the validation
# data, so hold out part of the TRAINING set for that purpose. The test set is
# used only for the final evaluation.
X_fit_np, X_val_np, y_fit_np, y_val_np = train_test_split(
    X_train_np, y_train_np, test_size=0.2, random_state=42, stratify=y_train_np
)
train_model(model, X_fit_np, y_fit_np, X_val_np, y_val_np)
evaluate_model(model, X_test_np, y_test_np)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - Recall: 0.4096 - accuracy: 0.7312 - loss: 0.5791 - val_Recall: 0.3016 - val_accuracy: 0.8188 - val_loss: 0.4473 - learning_rate: 0.0010
Epoch 2/100


  current = self.get_monitor_value(logs)


[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - Recall: 0.2914 - accuracy: 0.8040 - loss: 0.4729 - val_Recall: 0.3062 - val_accuracy: 0.8195 - val_loss: 0.4458 - learning_rate: 0.0010
Epoch 3/100
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - Recall: 0.3211 - accuracy: 0.8124 - loss: 0.4587 - val_Recall: 0.3199 - val_accuracy: 0.8195 - val_loss: 0.4423 - learning_rate: 0.0010
Epoch 4/100
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - Recall: 0.3298 - accuracy: 0.8159 - loss: 0.4497 - val_Recall: 0.3115 - val_accuracy: 0.8187 - val_loss: 0.4436 - learning_rate: 0.0010
Epoch 5/100
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - Recall: 0.3341 - accuracy: 0.8185 - loss: 0.4450 - val_Recall: 0.3123 - val_accuracy: 0.8195 - val_loss: 0.4418 - learning_rate: 0.0010
Epoch 6/100
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - Recall: 0.3375 - accura