In [1]:
import pandas as pd
import numpy as np

# Работа с датасетом

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')

In [3]:
trn = train.copy()
tst = test.copy()

In [4]:
trn['Gender'] = trn['Gender'].replace({'Male': 0, 'Female': 1})
trn['Vehicle_Damage'] = trn['Vehicle_Damage'].replace({'No': 0, 'Yes': 1})
trn['Vehicle_Age'] = trn['Vehicle_Age'].replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})
tst['Gender'] = tst['Gender'].replace({'Male': 0, 'Female': 1})
tst['Vehicle_Damage'] = tst['Vehicle_Damage'].replace({'No': 0, 'Yes': 1})
tst['Vehicle_Age'] = tst['Vehicle_Age'].replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})

In [None]:
trn['Response'].value_counts()

In [None]:
trn.head()

In [8]:
X = trn.iloc[:, 0:-1]
y = trn.iloc[:, -1]

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Решение проблемы дисбаланся

## 1) Сокращение элементов до равного колличества


In [17]:
from sklearn.utils import resample

In [18]:
y_train.value_counts()

Response
0    8071963
1    1131875
Name: count, dtype: int64

In [19]:
tmp = pd.concat([x_train, y_train], axis=1)

In [None]:
tmp

In [27]:
t_0 = tmp[tmp['Response'] == 0]
t_1 = tmp[tmp['Response'] == 1]

new_t_0 = resample(t_0, replace=True, n_samples=2263750, random_state=42)

X_t = pd.concat([new_t_0, t_1])

In [28]:
X_t = X_t.sample(frac=1)

In [29]:
x_train, y_train = X_t.iloc[:, 0:-1], X_t.iloc[:, -1]

## 2) Уравновешивание весов

In [None]:
weights = {0: 0.11, 1: 0.89}

# Модели обучения

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, precision_score
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Стандартизация
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.fit_transform(x_val)

## 1) Logistic regression

In [None]:
model = LogisticRegression()
model.fit(x_train, y_train)

< 1 min

In [None]:
y_pred = model.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(rec * 100))
print("Precision: {:.2f}%".format(prec * 100))

In [None]:
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, 
            annot=True,
            fmt='g', 
            xticklabels=['true','false'],
            yticklabels=['true','false'])
plt.ylabel('Actual', fontsize=13)
plt.title('Confusion Matrix', fontsize=17, pad=20)
plt.gca().xaxis.set_label_position('top') 
plt.xlabel('Prediction', fontsize=13)
plt.gca().xaxis.tick_top()

plt.gca().figure.subplots_adjust(bottom=0.2)
plt.gca().figure.text(0.5, 0.05, 'Prediction', ha='center', fontsize=13)
plt.show()

## 2) Bayes classifier

In [None]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

< 1 min

In [None]:
y_pred = gnb.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(rec * 100))
print("Precision: {:.2f}%".format(prec * 100))

In [None]:
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, 
            annot=True,
            fmt='g', 
            xticklabels=['true','false'],
            yticklabels=['true','false'])
plt.ylabel('Actual', fontsize=13)
plt.title('Confusion Matrix', fontsize=17, pad=20)
plt.gca().xaxis.set_label_position('top') 
plt.xlabel('Prediction', fontsize=13)
plt.gca().xaxis.tick_top()

plt.gca().figure.subplots_adjust(bottom=0.2)
plt.gca().figure.text(0.5, 0.05, 'Prediction', ha='center', fontsize=13)
plt.show()

## 3) Decision tree

In [None]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)

4 minutes

In [None]:
y_pred = clf.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(rec * 100))
print("Precision: {:.2f}%".format(prec * 100))

In [None]:
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, 
            annot=True,
            fmt='g', 
            xticklabels=['true','false'],
            yticklabels=['true','false'])
plt.ylabel('Actual', fontsize=13)
plt.title('Confusion Matrix', fontsize=17, pad=20)
plt.gca().xaxis.set_label_position('top') 
plt.xlabel('Prediction', fontsize=13)
plt.gca().xaxis.tick_top()

plt.gca().figure.subplots_adjust(bottom=0.2)
plt.gca().figure.text(0.5, 0.05, 'Prediction', ha='center', fontsize=13)
plt.show()

## 4) PyCaret

In [None]:
pip install pycaret

In [None]:
pip install --user scikit-learn

In [5]:
from sklearn.metrics._scorer import _SCORERS

In [6]:
from pycaret.classification import *
s = setup(trn, target = 'Response', session_id = 1)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,Response
2,Target type,Binary
3,Original data shape,"(11504798, 12)"
4,Transformed data shape,"(11504798, 12)"
5,Transformed train set shape,"(8053358, 12)"
6,Transformed test set shape,"(3451440, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


In [32]:
from catboost import CatBoostRegressor

cbr = CatBoostRegressor(iterations=100)
cbr.fit(x_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4047826	total: 432ms	remaining: 42.8s
1:	learn: 0.3855670	total: 887ms	remaining: 43.4s
2:	learn: 0.3797721	total: 1.28s	remaining: 41.5s
3:	learn: 0.3777844	total: 1.66s	remaining: 39.9s
4:	learn: 0.3762370	total: 1.92s	remaining: 36.5s
5:	learn: 0.3755187	total: 2.18s	remaining: 34.2s
6:	learn: 0.3746061	total: 2.43s	remaining: 32.3s
7:	learn: 0.3741498	total: 2.71s	remaining: 31.2s
8:	learn: 0.3736551	total: 2.98s	remaining: 30.2s
9:	learn: 0.3733561	total: 3.25s	remaining: 29.3s
10:	learn: 0.3730453	total: 3.53s	remaining: 28.6s
11:	learn: 0.3727914	total: 3.85s	remaining: 28.2s
12:	learn: 0.3722915	total: 4.18s	remaining: 28s
13:	learn: 0.3721109	total: 4.43s	remaining: 27.2s
14:	learn: 0.3718420	total: 4.69s	remaining: 26.6s
15:	learn: 0.3716183	total: 4.97s	remaining: 26.1s
16:	learn: 0.3714039	total: 5.26s	remaining: 25.7s
17:	learn: 0.3712951	total: 5.5s	remaining: 25.1s
18:	learn: 0.3710238	total: 5.81s	remaining: 24.8s
19:	learn: 0.370925

<catboost.core.CatBoostRegressor at 0x7d363471d0c0>

In [34]:
y_pred = cbr.predict(x_val)

In [50]:
y_pred = np.array(y_pred).astype('float32')

In [47]:
y_val = np.array(y_val.tolist()).astype('float32')

In [48]:

accuracy = accuracy_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(rec * 100))
print("Precision: {:.2f}%".format(prec * 100))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

обучение происходит на укороченном датасете, так как полный датасет слишком крупный, его обработка занимает много времени (> 20 min)

In [None]:
best = s.compare_models()

## 5) Простая сеть на Keras

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
model = keras.Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='softmax')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'recall', 'precision'])

history = model.fit(x_train,
                    y_train,
                    batch_size=256,
                    epochs=30,
                    validation_data=(x_val, y_val))

In [None]:
hist_dict = history.history
print(hist_dict.keys())
loss_val = hist_dict['loss']
val_loss_values = hist_dict['val_loss']
epochs = range(1, len(loss_val)+1)
plt.plot(epochs, loss_val, 'bo', label='Потери на обучении')
plt.plot(epochs, val_loss_values, 'b', label='Потери при проверке')
plt.xlabel('Эпохи')
plt.ylabel('Потери')
plt.legend()

plt.figure()
acc = hist_dict['accuracy']
val_acc = hist_dict['val_accuracy']
epochs = range(1, len(loss_val)+1)
plt.plot(epochs, acc, 'bo', label='Точность на обучении')
plt.plot(epochs, val_acc, 'b', label='Точность при проверке')
plt.xlabel('Эпохи')
plt.ylabel('Точность')
plt.legend()
plt.show()

plt.figure()
acc = hist_dict['recall']
val_acc = hist_dict['val_recall']
epochs = range(1, len(loss_val)+1)
plt.plot(epochs, acc, 'bo', label='Точность на обучении')
plt.plot(epochs, val_acc, 'b', label='Точность при проверке')
plt.xlabel('Эпохи')
plt.ylabel('Точность')
plt.legend()
plt.show()

plt.figure()
acc = hist_dict['precision']
val_acc = hist_dict['val_precision']
epochs = range(1, len(loss_val)+1)
plt.plot(epochs, acc, 'bo', label='Точность на обучении')
plt.plot(epochs, val_acc, 'b', label='Точность при проверке')
plt.xlabel('Эпохи')
plt.ylabel('Точность')
plt.legend()
plt.show()