In [1]:
import numpy as np
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
import requests
from io import BytesIO

In [2]:
url = "https://github.com/KopylovAlexey/Hackaton/blob/main/selected4000_target_dial_trx_fin.parquet?raw=true"

response = requests.get(url)
response.raise_for_status()  # Проверка успешности запроса

file_content = BytesIO(response.content)
target_dial_trx = pd.read_parquet(file_content)

target_dial_trx.shape

(3860, 19)

In [3]:
target_dial_trx.columns

Index(['target_1', 'target_2', 'target_3', 'target_4', 'month',
       'embedding_sum', 'amount', 'event_type', 'event_subtype', 'currency',
       'src_type11', 'src_type12', 'dst_type11', 'dst_type12', 'src_type21',
       'src_type22', 'src_type31', 'src_type32', 'target_tensor'],
      dtype='object')

In [4]:
target_dial_trx.head()

Unnamed: 0,target_1,target_2,target_3,target_4,month,embedding_sum,amount,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32,target_tensor
0,0,0,1,1,9,7.354199,8953.072754,54.0,55.0,11.0,19.0,344.0,433.0,10049.0,43634.0,28.0,1810.0,51.0,"[0, 0, 1, 1]"
1,0,0,0,1,3,9.266455,1324.851074,37.0,18.0,11.0,19.0,344.0,433.0,10049.0,44828.0,60.0,189.0,4.0,"[0, 0, 0, 1]"
2,0,0,0,0,4,16.050846,3507.233887,25.0,47.0,11.0,19.0,344.0,1166.0,30836.0,19200.0,63.0,2202.0,76.0,"[0, 0, 0, 0]"
3,0,0,1,0,9,16.712837,5543.849609,37.0,18.0,11.0,19.0,344.0,364.0,22652.0,3191.0,70.0,2388.0,74.0,"[0, 0, 1, 0]"
4,0,0,0,0,9,17.876408,7202.513184,5.0,1.0,11.0,1.0,189.0,433.0,8693.0,4820.0,11.0,844.0,4.0,"[0, 0, 0, 0]"


In [5]:
target_dial_trx.describe()

Unnamed: 0,target_1,target_2,target_3,target_4,embedding_sum,amount,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32
count,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0,3860.0
mean,0.088083,0.004145,0.053886,0.031088,14.556624,58635.06,43.864508,40.356995,11.0,25.202591,395.861917,635.648705,19601.424093,24220.192746,52.665285,1278.151295,49.011658
std,0.283452,0.064257,0.225822,0.173578,4.376096,299194.5,12.839372,17.164102,0.0,27.344356,179.322032,314.77842,7038.031391,13986.994476,22.947926,751.470493,28.69025
min,0.0,0.0,0.0,0.0,-4.716033,0.1973093,3.0,1.0,11.0,1.0,45.0,364.0,1348.0,6.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,12.204848,4704.632,37.0,19.0,11.0,19.0,344.0,364.0,10049.0,12069.0,37.0,568.0,24.0
50%,0.0,0.0,0.0,0.0,15.295784,12858.07,51.0,49.0,11.0,19.0,344.0,433.0,22652.0,25145.5,56.0,1295.0,53.0
75%,0.0,0.0,0.0,0.0,17.382257,36399.88,54.0,55.0,11.0,19.0,344.0,869.0,22652.0,35894.0,70.0,1930.0,77.0
max,1.0,1.0,1.0,1.0,26.893627,8344542.0,56.0,59.0,11.0,180.0,1095.0,1637.0,31488.0,48141.0,88.0,2496.0,89.0


# Построение модели при помощи Torch, предсказание меток продукта 1

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.optim as optim
from torch import nn
torch.manual_seed(42)
from sklearn.metrics import accuracy_score  # Импорт функции f1_score

In [7]:
target_dial_trx.columns

Index(['target_1', 'target_2', 'target_3', 'target_4', 'month',
       'embedding_sum', 'amount', 'event_type', 'event_subtype', 'currency',
       'src_type11', 'src_type12', 'dst_type11', 'dst_type12', 'src_type21',
       'src_type22', 'src_type31', 'src_type32', 'target_tensor'],
      dtype='object')

In [8]:
# Сплит на X и y
X = target_dial_trx.drop(['target_tensor', 'target_1', 'target_2', 'target_3', 'target_4'], axis=1)
y = target_dial_trx[['target_1']]

In [9]:
y

Unnamed: 0,target_1
0,0
1,0
2,0
3,0
4,0
...,...
3855,0
3856,1
3857,0
3858,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
X_train

Unnamed: 0,month,embedding_sum,amount,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32
370,04,14.330950,3871.135498,51.0,29.0,11.0,19.0,344.0,433.0,10049.0,32160.0,33.0,1397.0,71.0
1419,02,3.372370,10510.201904,25.0,47.0,11.0,19.0,902.0,1166.0,30836.0,46676.0,37.0,1835.0,26.0
1697,10,19.292397,45158.107422,54.0,55.0,11.0,19.0,344.0,364.0,22652.0,9382.0,62.0,1221.0,4.0
932,10,10.230025,38.805630,6.0,12.0,11.0,128.0,959.0,1302.0,8693.0,16346.0,58.0,1810.0,51.0
729,06,17.469090,1980.236328,37.0,55.0,11.0,19.0,344.0,433.0,10049.0,33972.0,70.0,439.0,81.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,03,12.445700,1262.562134,51.0,29.0,11.0,19.0,344.0,988.0,14906.0,8876.0,51.0,1721.0,4.0
1294,06,10.071744,12124.925781,25.0,47.0,11.0,19.0,344.0,1166.0,30836.0,27976.0,54.0,1626.0,68.0
860,11,18.728542,74.293770,36.0,12.0,11.0,128.0,456.0,433.0,8693.0,30736.0,84.0,815.0,4.0
3507,07,13.110930,248244.703125,54.0,55.0,11.0,19.0,344.0,364.0,22652.0,46571.0,41.0,567.0,81.0


In [12]:
# Стандартизация
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Преобразование данных в тензоры
X_train = torch.Tensor(X_train).float()
y_train = torch.Tensor(y_train.values).long().squeeze()
X_test = torch.Tensor(X_test).float()
y_test = torch.Tensor(y_test.values).long().squeeze()

In [14]:
X_train

tensor([[-1.3145, -0.0465, -0.2625,  ..., -0.8844,  0.1525,  0.7713],
        [-2.0911, -2.5147, -0.2238,  ..., -0.7079,  0.7372, -0.8070],
        [ 1.0152,  1.0709, -0.0217,  ...,  0.3952, -0.0824, -1.5786],
        ...,
        [ 1.4035,  0.9439, -0.2846,  ...,  1.3658, -0.6243, -1.5786],
        [-0.1497, -0.3213,  1.1626,  ..., -0.5314, -0.9553,  1.1221],
        [-1.7028,  1.6800, -0.1588,  ..., -0.3108, -0.6257, -0.8070]])

In [15]:
len(X_train)

2895

In [16]:
y_train

tensor([0, 0, 0,  ..., 0, 0, 0])

In [17]:
y_test

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [18]:
print(len(X_train), len(y_train), len(X_test), len(y_test))

2895 2895 965 965


In [19]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(X_train.dtype, y_train.dtype)

torch.Size([2895, 14]) torch.Size([2895])
torch.Size([965, 14]) torch.Size([965])
torch.float32 torch.int64


In [20]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(in_features=14, out_features=6)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=6, out_features=2) # всего два признака на выходе
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        x = self.softmax(x)
        return x

model = Classifier()

In [21]:
optimizer = torch.optim.Adam(params=model.parameters(), # оптимизируем вновь созданные параметры
                            lr=0.004)
loss_fn = nn.CrossEntropyLoss()

In [22]:
def train(model, X_train, y_train, loss_fn, optimizer, epochs=3000):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = loss_fn(output, y_train)
        loss.backward()
        optimizer.step()
        if epoch % 100 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')
    return model

def evaluate(model, X_test, y_test, loss_fn):
    model.eval()
    with torch.no_grad():
        output = model(X_test)
        loss = loss_fn(output, y_test)
        y_preds = torch.argmax(output, dim=1)
        accuracy = accuracy_score(y_test, y_preds)
        print(f'Test Loss: {loss.item()}, Test Accuracy: {accuracy}')
    return y_preds


In [23]:
trained_model = train(model, X_train, y_train, loss_fn, optimizer, epochs=3000)
y_preds = evaluate(trained_model, X_test, y_test, loss_fn)
y_preds

Epoch 0, Loss: 0.6448355913162231
Epoch 100, Loss: 0.41509202122688293
Epoch 200, Loss: 0.40268373489379883
Epoch 300, Loss: 0.4007340669631958
Epoch 400, Loss: 0.4000353217124939
Epoch 500, Loss: 0.39968785643577576
Epoch 600, Loss: 0.3994764983654022
Epoch 700, Loss: 0.39934056997299194
Epoch 800, Loss: 0.3992266356945038
Epoch 900, Loss: 0.3991149663925171
Epoch 1000, Loss: 0.39888766407966614
Epoch 1100, Loss: 0.39867669343948364
Epoch 1200, Loss: 0.3985312283039093
Epoch 1300, Loss: 0.398375928401947
Epoch 1400, Loss: 0.39805424213409424
Epoch 1500, Loss: 0.39740678668022156
Epoch 1600, Loss: 0.39661556482315063
Epoch 1700, Loss: 0.39599841833114624
Epoch 1800, Loss: 0.395485520362854
Epoch 1900, Loss: 0.39505478739738464
Epoch 2000, Loss: 0.39460065960884094
Epoch 2100, Loss: 0.3942180275917053
Epoch 2200, Loss: 0.39390456676483154
Epoch 2300, Loss: 0.39358142018318176
Epoch 2400, Loss: 0.39330539107322693
Epoch 2500, Loss: 0.39309436082839966
Epoch 2600, Loss: 0.3929172158241272

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
# Вызов функции evaluate на обученной нейронной сети и тестовых данных
y_pred = evaluate(trained_model, X_test, y_test, loss_fn)
y_pred

Test Loss: 0.4109188914299011, Test Accuracy: 0.9025906735751296


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [25]:
# Вызов функции f1_score на векторе предсказанных меток и векторе истинных меток
acc = accuracy_score(y_test, y_pred)

# Вывод значения f1_score
print(f'accuracy: {acc}')

accuracy: 0.9025906735751296
