In [1]:
import pandas as pd
import matplotlib.pylab as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
frame = pd.read_csv("hq_markup_train.csv")

def get_sample(index, frame):
    uuid = frame.iloc[index, 0]
    with open(f'data/{uuid}', 'r') as f:
        content = f.read()
    rows = [line.split('\t') for line in content.strip().split('\n')]
    data = pd.DataFrame(rows, columns=['time', 'delta_p', 'p_'], dtype=float)
    return data

In [2]:
class SiamDataset(Dataset):
    def __init__(self, siam_dataset_describe:pd.DataFrame):
        super().__init__()
        self.siam_dataset_describe = siam_dataset_describe

    def __len__(self):
        return self.siam_dataset_describe.shape[0]

    def __getitem__(self, idx):
        x = get_sample(idx, self.siam_dataset_describe) #.to_numpy(dtype=np.float64)
        # t = x["time"].to_numpy(dtype=np.float64)
        x = x[["delta_p", "p_"]].to_numpy(dtype=np.float64)
        


        # 7) Возвращаем (X, Y)
        return x, self.siam_dataset_describe.iloc[idx][['Некачественное ГДИС', 'Влияние ствола скважины', 'Радиальный режим', 'Линейный режим', 'Билинейный режим', 'Сферический режим', 'Граница постоянного давления', 'Граница непроницаемый разлом']].to_numpy(dtype=np.int8)

In [9]:
def collate_fn_batch1(batch):
    """
    collate_fn, который просто возвращает список длиной batch_size=1,
    так как иначе нужно продумывать, как совмещать разные временные метки в одном батче.
    """
    # batch - это список из [(t_i, x_i, y_i), (t_j, x_j, y_j), ...]
    # При batch_size=1 там будет ровно 1 элемент.
    return batch[0]  # вернём (t, x, y) без упаковки в дополнительные измерения


In [74]:
torch.save(trained_model.state_dict(), './TransformerV2.pth')
print('Model saved!')

Model saved!


In [3]:
def collate_fn_varlen(batch):
    """
    batch - список из [(x_i, y_i), (x_j, y_j), ...] размера batch_size.
      x_i: [T_i, input_dim]
      y_i: [num_classes]
    
    Задача: паддить x_i до [max_len, input_dim] и собрать в один тензор.
            Сформировать маску pad_mask для вычисления attention.
    """
    # 1) Определяем максимальную длину в батче
    max_len = 1800
    input_dim = batch[0][0].shape[1]
    # num_classes = batch[0][1].shape[0]
    batch_size = len(batch)
    
    # 2) Подготовим тензоры для x, y, mask
    x_padded = torch.zeros(batch_size, max_len, input_dim, dtype=torch.float32)
    pad_mask = torch.ones(batch_size, max_len, dtype=torch.bool)  # True = padded (будем инвертировать при attention)
    y_tensor = torch.zeros(batch_size, 8, dtype=torch.float32)
    
    # 3) Заполним их
    for i, (x_i, y_i) in enumerate(batch):
        length = min(x_i.shape[0], max_len)
        # print(x_i)
        # raise Exception("STOP")
        x_padded[i, :length, :] = torch.from_numpy(x_i[:length])
        pad_mask[i, :length] = False  # False => там реальные данные
        y_tensor[i] = torch.from_numpy(y_i)
    
    # Возвращаем (x_padded, pad_mask, y_tensor)
    return x_padded, pad_mask, y_tensor


In [4]:
class AttentionClassifier(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, num_classes):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_classes = num_classes
        
        # Проекция входа в d_model
        self.embedding = nn.Linear(input_dim, d_model)
        
        # Мульти-хед внимание
        self.attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            batch_first=True  # сделаем batch_first=True (поддерживается в новых версиях PyTorch)
        )
        
        # Финальный классификатор (multi-label => просто линейный + BCEWithLogitsLoss в конце)
        self.classifier = nn.Linear(d_model, num_classes)
    
    def forward(self, x_padded, pad_mask):
        """
        x_padded: [batch_size, seq_len, input_dim]
        pad_mask: [batch_size, seq_len] (True => позиция паддинга)
        
        Возвращает logits: [batch_size, num_classes]
        """
        # 1) Проекция
        x_proj = self.embedding(x_padded)  # [batch_size, seq_len, d_model]
        
        # 2) Применяем MultiheadAttention
        #    В self-attention Q=K=V = x_proj, но нужно передать их в MultiheadAttention
        #    т.к. batch_first=True, форма входа: [batch_size, seq_len, d_model]
        #    pad_mask: [batch_size, seq_len], но для MHA часто нужна shape [batch_size, 1, seq_len, seq_len]
        #    Однако с batch_first=True с недавних пор PyTorch поддерживает форму [batch_size, seq_len].
        
        # query, key, value = x_proj (self-attention)
        # attn_mask => None, но key_padding_mask => pad_mask
        attn_out, attn_weights = self.attention(
            x_proj, x_proj, x_proj,
            key_padding_mask=pad_mask
        )
        # attn_out: [batch_size, seq_len, d_model]
        
        # 3) Pooling по временной оси, чтобы получить 1 вектор на объект
        #    Возьмём mean pooling по тем позициям, которые не паддинг.
        #    Для корректного усреднения учтём число реальных токенов.
        #    (Можно использовать маску, чтобы не считать паддинг.)
        
        # Считаем количество реальных (не паддинговых) токенов по каждому объекту
        real_lengths = (~pad_mask).sum(dim=1).unsqueeze(-1)  # [batch_size, 1]
        
        # Суммируем embeddings по seq_len
        sum_embeddings = torch.sum(attn_out * (~pad_mask).unsqueeze(-1), dim=1)  
        # sum_embeddings: [batch_size, d_model]
        #  т.к. (~pad_mask) имеет форму [batch_size, seq_len], мы делаем broadcast по последней оси
        
        # Теперь усредним
        avg_embeddings = sum_embeddings / real_lengths.clamp(min=1e-9)  # [batch_size, d_model]
        
        # 4) Применяем линейный классификатор
        logits = self.classifier(avg_embeddings)  # [batch_size, num_classes]
        return logits


In [5]:
def train_attention_model_demo():
    # Параметры
    # N = 50            # Размер датасета (демонстрация)
    input_dim = 2    # Размерность входа в момент времени
    num_classes = 8   # Кол-во меток (multi-label)
    d_model = 20      # Размерность скрытого представления
    num_heads = 2     # Число голов внимания
    batch_size = 32
    num_epochs = 30
    lr = 1e-3
    
    # 1) Создаём датасет и лоадер
    dataset = SiamDataset(frame)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_varlen)
    
    # 2) Модель
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device is {device}")
    model = AttentionClassifier(input_dim, d_model, num_heads, num_classes).to(device)
    
    criterion = nn.BCEWithLogitsLoss()  # multi-label
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # 3) Тренировочный цикл
    for epoch in range(num_epochs):
        total_loss = 0.0
        for x_padded, pad_mask, y in loader:
            x_padded = x_padded.to(device)  # [batch_size, seq_len, input_dim]
            pad_mask = pad_mask.to(device)  # [batch_size, seq_len] (bool)
            y = y.to(device)                # [batch_size, num_classes]
            
            optimizer.zero_grad()
            
            logits = model(x_padded, pad_mask)  # [batch_size, num_classes]
            loss = criterion(logits, y)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")
    
    print("Обучение завершено.")
    return model

if __name__ == "__main__":
    trained_model = train_attention_model_demo()


Device is cuda
Epoch [1/30] - Loss: 30.6023
Epoch [2/30] - Loss: 10.2743
Epoch [3/30] - Loss: 4.3890
Epoch [4/30] - Loss: 3.3025
Epoch [5/30] - Loss: 3.1909
Epoch [6/30] - Loss: 2.3690
Epoch [7/30] - Loss: 1.9273
Epoch [8/30] - Loss: 2.3237
Epoch [9/30] - Loss: 2.2423
Epoch [10/30] - Loss: 1.8242
Epoch [11/30] - Loss: 2.4121
Epoch [12/30] - Loss: 1.8252
Epoch [13/30] - Loss: 1.7926
Epoch [14/30] - Loss: 1.6189
Epoch [15/30] - Loss: 1.8774
Epoch [16/30] - Loss: 1.5470
Epoch [17/30] - Loss: 1.2401
Epoch [18/30] - Loss: 2.1239
Epoch [19/30] - Loss: 1.4779
Epoch [20/30] - Loss: 1.6902
Epoch [21/30] - Loss: 1.6867
Epoch [22/30] - Loss: 1.3742
Epoch [23/30] - Loss: 1.5401
Epoch [24/30] - Loss: 1.6030
Epoch [25/30] - Loss: 2.4357
Epoch [26/30] - Loss: 1.9523
Epoch [27/30] - Loss: 1.6392
Epoch [28/30] - Loss: 1.3505
Epoch [29/30] - Loss: 1.0310
Epoch [30/30] - Loss: 1.1334
Обучение завершено.


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"
# trained_model.to(device=device)
print(f"rinning at {device}")
dataset = SiamDataset(frame)
loader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn_varlen)
all_proba = []
all_real_y = []
for x_padded, pad_mask, y in loader:
    x_padded = x_padded.to(device)  # [batch_size, seq_len, input_dim]
    pad_mask = pad_mask.to(device)  # [batch_size, seq_len] (bool)
    # y = y.to(device)                # [batch_size, num_classes]
    
    logits = trained_model(x_padded, pad_mask)  # [batch_size, num_classes]
    pred_proba = torch.sigmoid(logits)
    all_proba.append(pred_proba)
    all_real_y.append(y)

rinning at cuda


In [17]:
# all_real_y_2 = np.array([tensor[0].cpu().tolist() for tensor in all_real_y]).ravel()
# pred_proba_2 = [tensor.cpu() for tensor in all_proba]
# all_answers_2 = np.array([(pred_proba[0] > 0.5).int().cpu().tolist() for pred_proba in all_proba]).ravel()

all_real_y_2 = [tensor[0].cpu() for tensor in all_real_y]
pred_proba_2 = [tensor.cpu() for tensor in all_proba]
all_answers_2 = [(pred_proba[0] > 0.5).int().cpu() for pred_proba in all_proba]
# # all_answers_2 = 
print(len(all_real_y_2))
print(len(all_answers_2))
print(all_real_y_2)
print(all_answers_2)

500
500
[tensor([1., 0., 0., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 0., 0., 0., 0.]), tensor([0., 1., 0., 1., 0., 0., 0., 0.]), tensor([0., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 1., 1., 0., 0., 0., 0.]), tensor([1., 0., 0., 0., 0., 0., 0., 0.]), tensor([0., 1., 0., 0., 0., 1., 1., 0.]), tensor([0., 1., 0., 1., 1., 0., 0., 0.]), tensor([0., 1., 1., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 0., 0., 0., 1.]), tensor([0., 1., 1., 0., 0., 0., 0., 0.]), tensor([0., 1., 0., 0., 0., 0., 1., 0.]), tensor([0., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 0., 0., 0., 1.]), tensor([1., 1., 0., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 0., 0., 1., 0.]), tensor([0., 1., 1., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 1., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 1., 0., 0., 0.]), tensor([1., 1., 0., 0., 0., 0., 0., 0.]), tensor([0., 1., 1., 0., 1., 1., 0., 0.]), tensor([0., 1., 0., 0., 0., 1., 0., 0.]), tensor([0., 1., 1., 0., 0

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss
precision = precision_score(all_real_y_2, all_answers_2, average='macro')

recall = recall_score(all_real_y_2, all_answers_2, average='macro')

f1 = f1_score(all_real_y_2, all_answers_2, average='macro')

hamming = hamming_loss(all_real_y_2, all_answers_2)

print(f"Precision: {precision}, Recall: {recall}, F1: {f1}, Hamming Loss: {hamming}")

Precision: 0.4065487054329372, Recall: 0.26189187762238375, F1: 0.21893089918707404, Hamming Loss: 0.24575


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:

def collate_fn_varlen(batch):
    """
    batch: список из [(x_i, y_i), (x_j, y_j), ...] длиной batch_size
      - x_i: np.ndarray shape [T_i, input_dim]
      - y_i: np.ndarray shape [num_classes]

    Задача: 
      1) Определить max_len в батче
      2) Паддить все x_i до [max_len, input_dim]
      3) Сохранить длины T_i
      4) Объединить y_i в тензор [batch_size, num_classes]
    """
    # Определяем максимальную длину в данном батче
    max_len = max(x.shape[0] for x, _ in batch)
    batch_size = len(batch)
    input_dim = batch[0][0].shape[1]
    num_classes = batch[0][1].shape[0]
    num_classes = 2
    # print(num_classes)
    
    # Создаём пустой тензор для x
    x_padded = torch.zeros(batch_size, max_len, input_dim, dtype=torch.float32)
    lengths = torch.zeros(batch_size, dtype=torch.long)
    y_tensor = torch.zeros(batch_size, num_classes, dtype=torch.float32)
    
    for i, (x_i, y_i) in enumerate(batch):
        T_i = x_i.shape[0]
        if y_i[0] == 1:
            y_tensor[i] = torch.Tensor([0., 1.])
        else:
            y_tensor[i] = torch.Tensor([1., 0.])
        x_padded[i, :T_i, :] = torch.from_numpy(x_i)
        lengths[i] = T_i
        # y_tensor[i] = torch.from_numpy(y_i)[0]
         
    
    return x_padded, lengths, y_tensor


#######################################################
# 3. Модель на базе LSTM + классификатор
#######################################################

class LSTMClassifier(nn.Module):
    """
    LSTM, которая принимает последовательность переменной длины с помощью
    pack_padded_sequence. Финальное скрытое состояние (или усреднение) идёт
    в линейный классификатор для multi-label (BCEWithLogits).
    """
    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=1, bidirectional=False):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        
        dir_factor = 2 if bidirectional else 1
        self.classifier = nn.Sequential(nn.Linear(hidden_dim * dir_factor, num_classes), nn.Softmax(dim=1))
    
    def forward(self, x_padded, lengths):
        """
        x_padded: [batch_size, max_len, input_dim]
        lengths: [batch_size], длины для pack_padded_sequence
        """
        batch_size, max_len, _ = x_padded.size()
        
        # Упаковываем (pack) последовательность, чтобы LSTM пропускала "пустые" шаги быстрее
        packed_input = nn.utils.rnn.pack_padded_sequence(
            x_padded, 
            lengths.cpu(),  # lengths лучше передать на cpu, если они на gpu
            batch_first=True, 
            enforce_sorted=False  # Позволяет не сортировать batch по длине
        )
        
        # Прогоняем через LSTM
        packed_output, (h_n, c_n) = self.lstm(packed_input)
        
        # Можно получить все выходные вектора:
        # output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        # shape output: [batch_size, max_len, hidden_dim * dir_factor]

        # Но для классификации обычно берут последнее скрытое состояние
        # h_n: [num_layers * dir_factor, batch_size, hidden_dim]
        if self.bidirectional:
            # Для bidirectional нужно склеить h_n из прямого и обратного прохода
            # Обычно берём h_n[-2] и h_n[-1], если num_layers=1
            # Или можно взять h_n с последнего слоя: h_n[-2,...], h_n[-1,...]
            # Ниже вариант для общего случая num_layers:
            h_last_layer = h_n.view(self.num_layers, 2, batch_size, self.hidden_dim)
            # h_last_layer[-1]: shape [2, batch_size, hidden_dim]
            h_forward = h_last_layer[-1, 0, :, :]
            h_backward = h_last_layer[-1, 1, :, :]
            h_final = torch.cat([h_forward, h_backward], dim=1)  # [batch_size, 2*hidden_dim]
        else:
            # Если одна направленность, берём последний слой h_n[-1]
            h_final = h_n[-1]  # shape [batch_size, hidden_dim]
        
        # Линейная классификация
        logits = self.classifier(h_final)  # [batch_size, num_classes]
        return logits


#######################################################
# 4. Пример тренировки (multi-label)
#######################################################

def train_varlen_demo():
    # Гиперпараметры для примера
    input_dim = 2
    hidden_dim = 128
    num_classes = 2
    batch_size = 16
    num_epochs = 10
    lr = 1e-3
    
    # Создаём датасет и лоадер
    dataset = SiamDataset(siam_dataset_describe=frame)
    loader = DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=collate_fn_varlen
    )
    
    # Создаём модель
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LSTMClassifier(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_classes=num_classes,
        num_layers=2,
        bidirectional=True  # для примера включим двунаправленную LSTM
    ).to(device)
    
    # Лосс и оптимизатор
    criterion = nn.CrossEntropyLoss()  # multi-label
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Тренировочный цикл
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for x_padded, lengths, y_true in loader:
            x_padded = x_padded.to(device)
            lengths = lengths.to(device)
            y_true = y_true.to(device)
            
            optimizer.zero_grad()
            
            logits = model(x_padded, lengths)  # [batch_size, num_classes]
            # print(logits)
            # print(y_true)
            loss = criterion(logits, y_true)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")
    
    print("Обучение завершено.")
    return model

if __name__ == "__main__":
    trained_model = train_varlen_demo()

Epoch [1/10] - Loss: 0.5107
Epoch [2/10] - Loss: 0.4391
Epoch [3/10] - Loss: 0.4407
Epoch [4/10] - Loss: 0.4262
Epoch [5/10] - Loss: 0.4432
Epoch [6/10] - Loss: 0.4245
Epoch [7/10] - Loss: 0.4242
Epoch [8/10] - Loss: 0.4032
Epoch [9/10] - Loss: 0.3986
Epoch [10/10] - Loss: 0.3899
Обучение завершено.


In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"
# trained_model.to(device=device)
print(f"rinning at {device}")
dataset = SiamDataset(frame)
loader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn_varlen)
all_proba = []
all_real_y = []
for x_padded, lengths, y in loader:
    x_padded = x_padded.to(device)  # [batch_size, seq_len, input_dim]
    pad_mask = lengths.to(device)  # [batch_size, seq_len] (bool)
    # y = y.to(device)                # [batch_size, num_classes]
    
    logits = trained_model(x_padded, lengths)  # [batch_size, num_classes]
    pred_proba = torch.sigmoid(logits)
    all_proba.append(pred_proba)
    all_real_y.append(y)

rinning at cuda


In [51]:
# all_real_y_2 = np.array([tensor[0].cpu().tolist() for tensor in all_real_y]).ravel()
# pred_proba_2 = [tensor.cpu() for tensor in all_proba]
# all_answers_2 = np.array([(pred_proba[0] > 0.5).int().cpu().tolist() for pred_proba in all_proba]).ravel()


all_real_y_2 = [tensor[0].argmax().cpu() for tensor in all_real_y]
pred_proba_2 = [tensor.cpu() for tensor in all_proba]
all_answers_2 = [(pred_proba[0] > 0.5).int().cpu() for pred_proba in all_proba]
all_answers_2 =  [pred_proba.argmax().cpu() for pred_proba in all_proba]
print(len(all_real_y_2))
print(len(all_answers_2))
print(all_real_y_2)
print(all_answers_2)

500
500
[tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(0), t

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss
precision = precision_score(all_real_y_2, all_answers_2, average='macro')

recall = recall_score(all_real_y_2, all_answers_2, average='macro')

f1 = f1_score(all_real_y_2, all_answers_2, average='macro')

hamming = hamming_loss(all_real_y_2, all_answers_2)


print(f"Precision: {precision}, Recall: {recall}, F1: {f1}, Hamming Loss: {hamming}")


Precision: 0.9174914350970689, Recall: 0.8366759612186948, F1: 0.8694968220712618, Hamming Loss: 0.074
