In [75]:
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
import numpy as np

Классификация с PyTorch

In [76]:
df = pd.read_csv("neo_task_compleated.csv")
df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0.016016,0.035813,56014.078517,1.024333e+06,26.10,False
1,0.030518,0.068240,7864.348060,3.268186e+07,24.70,False
2,0.055533,0.124177,55257.544508,6.538636e+07,23.40,False
3,0.019256,0.043057,41531.404722,1.260796e+07,25.70,False
4,0.139494,0.311918,67639.394481,7.130590e+07,21.40,False
...,...,...,...,...,...,...
90831,0.017561,0.039268,23264.740825,1.635007e+06,25.90,False
90832,0.110804,0.247765,24802.519406,3.351901e+07,21.90,False
90833,0.035039,0.078350,116288.999548,5.471396e+07,24.40,False
90834,0.044112,0.098637,45763.317060,2.694877e+07,23.90,False


In [77]:
X = df.drop(["hazardous"], axis=1)
y = df["hazardous"]
X

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
0,0.016016,0.035813,56014.078517,1.024333e+06,26.10
1,0.030518,0.068240,7864.348060,3.268186e+07,24.70
2,0.055533,0.124177,55257.544508,6.538636e+07,23.40
3,0.019256,0.043057,41531.404722,1.260796e+07,25.70
4,0.139494,0.311918,67639.394481,7.130590e+07,21.40
...,...,...,...,...,...
90831,0.017561,0.039268,23264.740825,1.635007e+06,25.90
90832,0.110804,0.247765,24802.519406,3.351901e+07,21.90
90833,0.035039,0.078350,116288.999548,5.471396e+07,24.40
90834,0.044112,0.098637,45763.317060,2.694877e+07,23.90


In [78]:
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)
print(X.shape, y.shape)

(17680, 5) (17680,)


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True)

In [80]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [81]:
from torch.utils.data import TensorDataset, DataLoader

In [82]:
# а теперь преобразуем обучающую выборку в объект Dataset
train_ds = TensorDataset(torch.from_numpy(X_train).type(torch.float32), torch.from_numpy(np.array(y_train).reshape(-1, 1)).type(torch.float32))
train_dl = DataLoader(train_ds, batch_size=260)

In [83]:
import torch.nn as nn

In [84]:
class ClassificationModel(nn.Module):
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.first_linear = nn.Linear(5, 120)
        self.first_relu = nn.ReLU()
        self.second_linear = nn.Linear(120, 240)
        self.second_relu = nn.ReLU()
        self.third_linear = nn.Linear(240, 60)
        self.third_relu = nn.ReLU()
        self.fourth_linear = nn.Linear(60, 20)
        self.fourth_relu = nn.ReLU()
        self.fifth_linear = nn.Linear(20, 1)
        self.fifth_sigmoid = nn.Sigmoid()

    def forward(self, x):
        y = self.first_linear(x)
        y = self.first_relu(y)
        y = self.second_linear(y)
        y = self.second_relu(y)
        y = self.third_linear(y)
        y = self.third_relu(y)
        y = self.fourth_linear(y)
        y = self.fourth_relu(y)
        y = self.fifth_linear(y)
        y = self.fifth_sigmoid(y)
        return y

In [85]:
model = ClassificationModel()

In [86]:
print(model)

ClassificationModel(
  (first_linear): Linear(in_features=5, out_features=120, bias=True)
  (first_relu): ReLU()
  (second_linear): Linear(in_features=120, out_features=240, bias=True)
  (second_relu): ReLU()
  (third_linear): Linear(in_features=240, out_features=60, bias=True)
  (third_relu): ReLU()
  (fourth_linear): Linear(in_features=60, out_features=20, bias=True)
  (fourth_relu): ReLU()
  (fifth_linear): Linear(in_features=20, out_features=1, bias=True)
  (fifth_sigmoid): Sigmoid()
)


In [87]:
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)

In [88]:
epochs = 50
for epoch in range(epochs):
    # за одну эпоху смотрим все батчи (по batch_size элементов)
    for x_b, y_b in train_dl:
        # делаем прямое распространение (получаем предсказание)
        outputs = model(x_b)
        # вычисляем значение функции потерь
        loss_value = loss(outputs, y_b)
        # делаем backward - вычисляются значения .grad у слоев модели
        loss_value.backward()
        # делаем шаг градиентного спуска с заданным у оптимизатора learning_rate
        optimizer.step()
        # зануляем .grad у слоев модели - для нового батча будем акумулировать новый .grad
        optimizer.zero_grad()

    # в конце эпохи выводим значение функции потерь для последнего рассмотренного батча
    print(f'Эпоха {epoch + 1}, Значение функции потерь: {loss_value.item()}')

Эпоха 1, Значение функции потерь: 0.08142730593681335
Эпоха 2, Значение функции потерь: 0.07998696714639664
Эпоха 3, Значение функции потерь: 0.07933244854211807
Эпоха 4, Значение функции потерь: 0.07915818691253662
Эпоха 5, Значение функции потерь: 0.07858816534280777
Эпоха 6, Значение функции потерь: 0.07845388352870941
Эпоха 7, Значение функции потерь: 0.0786895677447319
Эпоха 8, Значение функции потерь: 0.07852181792259216
Эпоха 9, Значение функции потерь: 0.07836280763149261
Эпоха 10, Значение функции потерь: 0.07824348658323288
Эпоха 11, Значение функции потерь: 0.07824599742889404
Эпоха 12, Значение функции потерь: 0.07812630385160446
Эпоха 13, Значение функции потерь: 0.07804616540670395
Эпоха 14, Значение функции потерь: 0.07784207165241241
Эпоха 15, Значение функции потерь: 0.07770325243473053
Эпоха 16, Значение функции потерь: 0.07756882160902023
Эпоха 17, Значение функции потерь: 0.07737623155117035
Эпоха 18, Значение функции потерь: 0.07750768214464188
Эпоха 19, Значение ф

In [89]:
y_pred = np.around(model(torch.from_numpy(X_test).type(torch.float32)).detach().numpy())

In [92]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.76      0.97      0.85      2066
         1.0       0.98      0.80      0.88      3238

    accuracy                           0.87      5304
   macro avg       0.87      0.89      0.87      5304
weighted avg       0.89      0.87      0.87      5304

