In [1]:
import pandas as pd
import matplotlib.pylab as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
frame_test = pd.read_csv("hq_markup_train.csv")
frame_test.fillna(-2, inplace=True)
frame = pd.read_csv("markup_train.csv")

def get_sample(index, frame):
    uuid = frame.iloc[index, 0]
    with open(f'data/{uuid}', 'r') as f:
        content = f.read()
    rows = [line.split('\t') for line in content.strip().split('\n')]
    data = pd.DataFrame(rows, columns=['time', 'delta_p', 'p_'], dtype=float)
    return data


def find_empty_indexes_2(data):
    """
    Поиск id файлов из строк датафрейма для их исключения на этапе удаления данных из датафрейма
    """
    B=[]
    for i in range(data.shape[0]):
        try:
            df=get_sample(i,data)
        except ValueError:
            B.append(data.iloc[i,0])
    return B


# class SiamDataset(Dataset):
#     def __init__(self, siam_dataset_describe:pd.DataFrame):
#         super().__init__()
#         self.siam_dataset_describe = siam_dataset_describe

#     def __len__(self):
#         return self.siam_dataset_describe.shape[0]

#     def __getitem__(self, idx):
#         x = get_sample(idx, self.siam_dataset_describe) #.to_numpy(dtype=np.float64)
#         # t = x["time"].to_numpy(dtype=np.float64)
#         x = x[["delta_p", "p_"]].to_numpy(dtype=np.float64)
        


#         # 7) Возвращаем (X, Y)                                                  # Давление(атм) ([:, 1]) Давление(атм) ([:,1]) Давление(атм) ([:,1]) Давление(атм) ([:,1]) Давление(атм) ([:,1]) В какой момент? [:, 0]   В какой момент? [:, 0]
#         return x, self.siam_dataset_describe.iloc[idx][['Влияние ствола скважины_details', 'Радиальный режим_details', 'Линейный режим_details', 'Билинейный режим_details', 'Сферический режим_details', 'Граница постоянного давления_details', 'Граница непроницаемый разлом_details']].to_numpy(dtype=np.float32)

# B=find_empty_indexes_2(frame)#Поиск id с отсутствующими файлами для исключения
# frame=frame[~frame['file_name'].isin(B)]#Фильтрация данных
# frame = frame[~frame["file_name"].isin(frame_test["file_name"])]

In [19]:
frame_test.columns[11:]

Index(['Влияние ствола скважины_details', 'Радиальный режим_details',
       'Линейный режим_details', 'Билинейный режим_details',
       'Сферический режим_details', 'Граница постоянного давления_details',
       'Граница непроницаемый разлом_details'],
      dtype='object')

In [2]:

class PositionalEncoding(nn.Module):
    """
    Классическое позиционное кодирование (NLP-стиль).
    """
    def __init__(self, d_model, max_len=4000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() 
                             * -(torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe / (d_model ** 0.5)
        pe = pe.unsqueeze(0)    # [1, max_len, d_model]
        self.register_buffer('pe', pe) 

    def forward(self, x):
        """
        x shape: [B, T, d_model]
        Добавляем позиционное кодирование к x.
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        # print(f"Encoded x: {x}")
        return x

class TransformerFlowModel(nn.Module):
    """
    Пример трансформера для задач:
      - 8 бинарных признаков (классификация)
      - 7 регрессионных выходов
    С учётом attention mask (padding).
    """
    def __init__(self,
                 input_dim=3,     # (delta_p, p_, log_time) например
                 d_model=64,
                 nhead=4,
                 num_layers=2,
                 dim_feedforward=128,
                 dropout=0.1,
                 n_class=8,
                 n_reg=5):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        # self.pos_encoder = PositionalEncoding(d_model)
        torch.nn.init.xavier_uniform_(self.input_proj.weight, gain=1.0)
        torch.nn.init.zeros_(self.input_proj.bias)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                   nhead=nhead,
                                                   dim_feedforward=dim_feedforward,
                                                   dropout=dropout,
                                                   activation="relu",
                                                   batch_first=True)  
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers)

        self.pool = nn.AdaptiveAvgPool1d(1)  # mean-pool по временной оси

        self.class_head = nn.Linear(d_model, n_class)
        self.reg_head   = nn.Linear(d_model, n_reg)

    def forward(self, x, src_key_padding_mask=None):
        """
        x: [B, T, input_dim]
        src_key_padding_mask: [B, T], True = игнорируем (паддинг)
        """
        # 1) Линейная проекция входа -> d_model
        x_proj = self.input_proj(x)  # [B, T, d_model]
        if torch.isnan(x_proj).any():
            print("NaN after input projection")
        # 2) Позиционное кодирование
        # x_encoded = self.pos_encoder(x_proj)  # [B, T, d_model]
        # if torch.isnan(x_encoded).any():
            # print("NaN after positional encoding")
        # 3) Пропускаем через энкодер
        #    Важно: указываем mask=... (или src_key_padding_mask=...).
        #    Маска должна быть типа bool, shape [B, T]
        x_trans = self.transformer_encoder(
            # x_encoded,
            x_proj,
            src_key_padding_mask=src_key_padding_mask
        )  # [B, T, d_model]
        if torch.isnan(x_trans).any():
            print("NaN after transformer")

        # 4) Pooling, [B, d_model]
        x_trans_perm = x_trans.permute(0, 2, 1)  # -> [B, d_model, T]
        pooled = self.pool(x_trans_perm).squeeze(-1)  # [B, d_model]
        if torch.isnan(pooled).any():
            print("NaN after pooling")

        # 5) Выход
        class_logits = self.class_head(pooled)  # [B, 8]
        reg_output   = self.reg_head(pooled)    # [B, 7]
        if torch.isnan(pooled).any():
            print("NaN after class_logits")
        return class_logits, reg_output


class ConvTransformer(nn.Module):
    def __init__(self, d_model=32, nhead=2, num_layers=1, dim_feedforward = 128):
        super().__init__()
        # 1) Conv1d: in_channels=2 (delta_p, p_), out_channels=d_model
        #    kernel_size=3 -> можно менять
        self.conv = nn.Conv1d(in_channels=2, out_channels=d_model, kernel_size=51, padding=25)

        # 2) TransformerEncoder (упрощённый)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=0.1,
            activation='relu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )

        # 3) Heads для классификации (8) и регрессии (7)
        self.class_head = nn.Linear(d_model, 8)
        self.reg_head   = nn.Linear(d_model, 5)

    def forward(self, x, src_key_padding_mask=None):
        """
        x: [B, T, 2]
        src_key_padding_mask: [B, T] (bool), True=игнорировать позицию
        """
        # -- (A) Свёртка --
        # Conv1d ожидает [B, C, T], значит permute:
        x = x.permute(0, 2, 1)  # -> [B, 2, T]

        # Прогон через conv:
        # Выход будет [B, d_model, T]
        x_conv = self.conv(x)  # [B, d_model, T]

        # -- (B) Для Transformer делаем [B, T, d_model]
        x_conv = x_conv.permute(0, 2, 1)  # [B, T, d_model]

        # -- (C) Прогон через TransformerEncoder, с учётом mask
        x_trans = self.transformer_encoder(
            x_conv,
            src_key_padding_mask=src_key_padding_mask
        )  # [B, T, d_model]

        # -- (D) Возьмём, например, средний вектор по времени
        x_pooled = x_trans.mean(dim=1)  # [B, d_model]

        # -- (E) Предсказываем 8 бинарных признаков + 7 регрессий
        class_logits = self.class_head(x_pooled)  # [B, 8]
        reg_out      = self.reg_head(x_pooled)    # [B, 7]

        return class_logits, reg_out



class SiamDataset(Dataset):
    def __init__(self, siam_dataset_describe:pd.DataFrame):
        super().__init__()
        self.siam_dataset_describe = siam_dataset_describe

    def __len__(self):
        return self.siam_dataset_describe.shape[0]

    def __getitem__(self, idx):
        x = get_sample(idx, self.siam_dataset_describe)
        # x = torch.from_numpy(get_sample(idx, self.siam_dataset_describe).to_numpy(dtype=np.float32)) #.to_numpy(dtype=np.float64)
        # t = x["time"].to_numpy(dtype=np.float64)
        
        x =  torch.from_numpy(x[["delta_p", "p_"]].to_numpy(dtype=np.float32))
        if torch.isnan(x).any():
            print(get_sample(idx, self.siam_dataset_describe))
            print(x)
            raise Exception("ахтунг в данных")
        


        # 7) Возвращаем (X, Y)
        return x, torch.from_numpy(self.siam_dataset_describe.iloc[idx][self.siam_dataset_describe.columns[3:11]].to_numpy(dtype=np.float32)), torch.from_numpy(self.siam_dataset_describe.iloc[idx][self.siam_dataset_describe.columns[11:-2]].to_numpy(dtype=np.float32))



def collate_fn_with_padding(batch):
    """
    batch: список из (X, y_class, y_reg), где X shape [T_i, input_dim].
    Нужно вернуть:
      padded_X: [B, max_len, input_dim]
      src_key_padding_mask: [B, max_len] (bool)
      y_class: [B, 8]
      y_reg:   [B, 7]
    """
    # 1) Определяем batch_size
    batch_size = len(batch)
    # 2) Находим максимальную длину среди X
    lengths = [sample[0].shape[0] for sample in batch]  # T_i для каждого
    max_len = max(lengths)
    input_dim = batch[0][0].shape[1]

    # 3) Создаём тензоры под результирующие данные
    padded_X = torch.zeros((batch_size, max_len, input_dim), dtype=torch.float)
    # Маска: True = игнорируем => паддинг
    # Изначально False (значит реальная точка), затем выставим True там, где нет реальных данных
    src_key_padding_mask = torch.zeros((batch_size, max_len), dtype=torch.bool)

    y_class_list = []
    y_reg_list = []

    # 4) Копируем данные в паддинг-тензоры
    for i, (X, y_class, y_reg) in enumerate(batch):
        length = X.shape[0]
        padded_X[i, :length, :] = X
        # Для элементов после length делаем mask = True
        if length < max_len:
            src_key_padding_mask[i, length:] = True

        y_class_list.append(y_class)
        y_reg_list.append(y_reg)

    # 5) Склеиваем метки
    y_class_tensor = torch.stack(y_class_list, dim=0)  # [B, 8]
    y_reg_tensor   = torch.stack(y_reg_list, dim=0)   # [B, 7]

    return padded_X, src_key_padding_mask, y_class_tensor, y_reg_tensor


from torch.utils.data import DataLoader

# Создаём датасет
dataset = SiamDataset(frame_test)
# Создаём DataLoader
loader = DataLoader(dataset, 
                    batch_size=8, 
                    shuffle=True, 
                    collate_fn=collate_fn_with_padding)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device is {device}")
# device = 'cpu'

model = ConvTransformer(d_model=96, nhead=16, num_layers=4, dim_feedforward=128)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
bce_loss = nn.BCEWithLogitsLoss()
mse_loss = nn.L1Loss()
model.train()
for epoch in range(10):
    total_loss = 0.0
    for batch_data in loader:
        # batch_data = (padded_X, src_key_padding_mask, y_class, y_reg)
        padded_X, src_mask, y_class, y_reg = batch_data

        padded_X = padded_X.to(device)
        src_mask = src_mask.to(device)
        y_class  = y_class.to(device)
        y_reg    = y_reg.to(device)

        optimizer.zero_grad()

        # Прогон через модель
        logits, reg_out = model(padded_X, src_key_padding_mask=src_mask)
        # print(reg_out)
        # print(y_reg)

        # Лосс по классификации
        loss_class = bce_loss(logits, y_class)
        # Лосс по регрессии
        loss_reg = mse_loss(reg_out, y_reg)
        l1_norm = sum(p.abs().sum() for p in model.parameters())

        loss = loss_reg + loss_class #+ l1_norm #+  loss_reg
        # print(loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, loss={total_loss:.4f}")



Device is cuda
Epoch 1, loss=95.8200
Epoch 2, loss=86.2205
Epoch 3, loss=82.6232
Epoch 4, loss=82.7902
Epoch 5, loss=79.1305
Epoch 6, loss=78.7678
Epoch 7, loss=79.9139
Epoch 8, loss=78.2297
Epoch 9, loss=77.7657
Epoch 10, loss=74.7703


In [3]:
model.train()
for epoch in range(50):
    total_loss = 0.0
    for batch_data in loader:
        # batch_data = (padded_X, src_key_padding_mask, y_class, y_reg)
        padded_X, src_mask, y_class, y_reg = batch_data

        padded_X = padded_X.to(device)
        src_mask = src_mask.to(device)
        y_class  = y_class.to(device)
        y_reg    = y_reg.to(device)

        optimizer.zero_grad()

        # Прогон через модель
        logits, reg_out = model(padded_X, src_key_padding_mask=src_mask)
        # print(logits)
        # print(y_class)

        # Лосс по классификации
        loss_class = bce_loss(logits, y_class)
        # Лосс по регрессии
        loss_reg = mse_loss(reg_out, y_reg)
        l1_norm = sum(p.abs().sum() for p in model.parameters())

        loss = loss_class + loss_reg   #+ l1_norm #+ loss_reg loss_class
        # print(loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, loss={total_loss:.4f}")

Epoch 1, loss=74.7691
Epoch 2, loss=75.7576
Epoch 3, loss=73.3730
Epoch 4, loss=72.6190
Epoch 5, loss=71.3210
Epoch 6, loss=72.8127
Epoch 7, loss=73.1888
Epoch 8, loss=73.7166
Epoch 9, loss=73.4740
Epoch 10, loss=71.8188
Epoch 11, loss=71.1186
Epoch 12, loss=71.3280
Epoch 13, loss=69.3700
Epoch 14, loss=69.8520
Epoch 15, loss=69.2301
Epoch 16, loss=68.3598
Epoch 17, loss=66.3077
Epoch 18, loss=66.6130
Epoch 19, loss=65.3759
Epoch 20, loss=65.8718
Epoch 21, loss=65.6590
Epoch 22, loss=64.7795
Epoch 23, loss=65.5825
Epoch 24, loss=65.4996
Epoch 25, loss=69.0720
Epoch 26, loss=68.0735
Epoch 27, loss=65.6466
Epoch 28, loss=63.5809
Epoch 29, loss=65.2960
Epoch 30, loss=62.1465
Epoch 31, loss=62.2389
Epoch 32, loss=61.2505
Epoch 33, loss=66.0273
Epoch 34, loss=65.9966
Epoch 35, loss=64.1354
Epoch 36, loss=63.5731
Epoch 37, loss=63.7302
Epoch 38, loss=63.6070
Epoch 39, loss=62.8273
Epoch 40, loss=64.3297
Epoch 41, loss=63.4204
Epoch 42, loss=64.6651
Epoch 43, loss=61.3085
Epoch 44, loss=62.58

In [10]:
torch.cuda.empty_cache()
import gc
torch.cuda.reset_max_memory_allocated()  # Сброс максимального использования памяти
torch.cuda.reset_max_memory_cached()     # Сброс кэшированной памяти

gc.collect()

# Очистка кэша CUDA
torch.cuda.empty_cache()



In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"
# model.to(device=device)
print(f"rinning at {device}")
# dataset = SiamDataset(frame_test)
loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn_with_padding)
all_proba = []
all_real_y = []
all_reg = []
all_y_reg = []
model.eval()
with torch.no_grad():
    for padded_X, src_mask, y_class, y_reg in loader:

        padded_X = padded_X.to(device)
        src_mask = src_mask.to(device)
        y_class  = y_class.to(device)
        y_reg    = y_reg.to(device)

        optimizer.zero_grad()

        # Прогон через модель
        logits, reg_out = model(padded_X, src_key_padding_mask=src_mask)
        # print(logits)
        # print(y_class)

        # Лосс по классификации
        loss_class = bce_loss(logits, y_class)
        all_reg.append(reg_out.cpu())
        all_y_reg.append(y_reg.cpu())
        
        pred_proba = torch.sigmoid(reg_out)
        all_proba.append(pred_proba.cpu())
        all_real_y.append(y_class.cpu())

rinning at cuda


  output = torch._nested_tensor_from_mask(


In [5]:

all_real_y_2 = [tensor[0, 1:-2].cpu().detach().numpy() for tensor in all_real_y]
pred_proba_2 = [tensor.cpu().detach().numpy() for tensor in all_proba]
all_answers_2 = [(pred_proba[0] > 0.5).int().cpu().detach().numpy() for pred_proba in all_proba]
# # all_answers_2 = 
print(len(all_real_y_2))
print(len(all_answers_2))
print(all_real_y_2)
print(all_answers_2)

500
500
[array([1., 1., 0., 0., 0.], dtype=float32), array([1., 1., 1., 0., 0.], dtype=float32), array([0., 0., 0., 0., 0.], dtype=float32), array([1., 1., 0., 0., 0.], dtype=float32), array([1., 1., 1., 0., 0.], dtype=float32), array([0., 0., 0., 0., 0.], dtype=float32), array([0., 1., 0., 1., 0.], dtype=float32), array([1., 1., 0., 1., 0.], dtype=float32), array([0., 0., 0., 0., 0.], dtype=float32), array([0., 0., 0., 0., 0.], dtype=float32), array([1., 1., 0., 1., 0.], dtype=float32), array([1., 0., 1., 1., 0.], dtype=float32), array([1., 0., 0., 0., 1.], dtype=float32), array([1., 1., 1., 0., 0.], dtype=float32), array([1., 0., 0., 0., 1.], dtype=float32), array([1., 0., 0., 0., 1.], dtype=float32), array([1., 1., 1., 1., 0.], dtype=float32), array([1., 0., 0., 1., 0.], dtype=float32), array([1., 1., 0., 0., 0.], dtype=float32), array([0., 1., 0., 0., 1.], dtype=float32), array([1., 0., 0., 1., 0.], dtype=float32), array([0., 0., 0., 0., 0.], dtype=float32), array([1., 0., 1., 0., 

In [6]:
all_reg_2 = np.array([tmp_reg[0].cpu().detach().numpy() for tmp_reg in all_reg])
all_y_reg_2 = np.array([tmp_reg[0].cpu().detach().numpy() for tmp_reg in all_y_reg])

# np.mean(np.abs(np.array(all_y_reg_2) - np.array(all_reg_2)), axis=0)
mask = all_y_reg_2 < 0
all_reg_2[mask] = 0
all_y_reg_2[mask] = 0

np.mean(np.abs(np.array(all_y_reg_2) - np.array(all_reg_2)), axis=0)

array([0.5612721 , 0.52519006, 0.20582251, 0.38274506, 0.31886393],
      dtype=float32)

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss
precision = precision_score(all_real_y_2, all_answers_2, average='macro')

recall = recall_score(all_real_y_2, all_answers_2, average='macro')

f1 = f1_score(all_real_y_2, all_answers_2, average='macro')

hamming = hamming_loss(all_real_y_2, all_answers_2)
print(np.sum(np.all(np.array(all_answers_2) == np.array(all_real_y_2), axis=1))/len(all_answers_2))
print(f"Precision: {precision}, Recall: {recall}, F1: {f1}, Hamming Loss: {hamming}")

0.272
Precision: 0.4772761149176584, Recall: 0.24669944090996726, F1: 0.28721941192834927, Hamming Loss: 0.2476


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
