In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn.functional as F
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split


# ---------------------------------------------------------------------------
# Notebook-wide configuration. Everything a reader might want to tune lives
# here so the rest of the notebook can be re-run without edits.
# ---------------------------------------------------------------------------

# Expected number of target classes; the data-loading cell compares this with
# the number of distinct 'Folder Path' values and overrides it on mismatch.
NUM_CLASSES = 5

# NOTE(review): not referenced by any code visible in this notebook.
D_MODEL = 48

# Every time series is truncated or zero-padded to this many time steps.
MAX_SEQ_LEN = 256

# Mini-batch size used for both the training and validation loaders.
BATCH_SIZE = 16

# Number of full passes over the training set.
EPOCHS = 10

# Learning rate handed to the Adam optimiser.
LR = 3.4e-4

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
class Attention(nn.Module):
    """Attention pooling that collapses a sequence into a single vector.

    Each time step is scored by one linear layer applied to the
    ``tanh``-squashed features. The scores are softmax-normalised along the
    sequence axis and used as weights for a weighted sum of the (unsquashed)
    inputs.
    """

    def __init__(self, feature_dim):
        """Create the scoring layer.

        Args:
            feature_dim: size of the last dimension of the tensors that will
                be passed to :meth:`forward`.
        """
        super().__init__()
        # Projects every feature vector onto one unnormalised score.
        self.attn_weights_layer = nn.Linear(feature_dim, 1)

    def forward(self, rnn_outputs):
        """Pool ``rnn_outputs`` over its sequence axis.

        Args:
            rnn_outputs: tensor of shape ``(batch, seq_len, feature_dim)``.

        Returns:
            A pair ``(context, weights)`` where ``context`` has shape
            ``(batch, feature_dim)`` and ``weights`` has shape
            ``(batch, seq_len)`` with each row summing to one.
        """
        squashed = torch.tanh(rnn_outputs)
        # (batch, seq_len, 1) -> (batch, seq_len)
        scores = self.attn_weights_layer(squashed).squeeze(-1)
        weights = F.softmax(scores, dim=-1)

        # (batch, 1, seq_len) @ (batch, seq_len, feature_dim)
        #   -> (batch, 1, feature_dim)
        pooled = torch.bmm(weights.unsqueeze(1), rnn_outputs)
        return pooled.squeeze(1), weights


class Hybrid_CNN_RNN_Model(nn.Module):
    """Classifier combining a 1-D CNN, a bidirectional LSTM and a metadata MLP.

    The time-series input goes through three Conv1d/BatchNorm/ReLU/MaxPool
    stages (each pool halves the time axis), then through a bidirectional
    LSTM whose outputs are pooled by ``Attention``. A two-layer MLP embeds
    the tabular metadata. Both representations are concatenated and passed
    through a small fully connected head that emits class logits.
    """

    def __init__(self, num_features_ts, num_meta_features, num_classes,
                 cnn_out_channels=64, rnn_hidden_size=128, rnn_layers=1,
                 meta_hidden_dim=32, dropout_rate=0.3):
        """Build all sub-modules.

        Args:
            num_features_ts: number of input channels of the time series.
            num_meta_features: width of the metadata vector.
            num_classes: number of output logits.
            cnn_out_channels: channel count of the second conv stage; the
                first stage uses half of it and the third uses double.
            rnn_hidden_size: hidden size of the LSTM, per direction.
            rnn_layers: number of stacked LSTM layers.
            meta_hidden_dim: width of the final metadata embedding.
            dropout_rate: dropout probability for the classifier head, and
                between LSTM layers when ``rnn_layers > 1``.
        """
        super().__init__()

        # Channel widths of the three convolutional stages.
        narrow = cnn_out_channels // 2
        medium = cnn_out_channels
        wide = cnn_out_channels * 2
        # Forward and backward LSTM states are concatenated.
        rnn_feature_dim = rnn_hidden_size * 2

        # Convolutional feature extractor; kernel shrinks 7 -> 5 -> 3 and
        # the padding keeps the sequence length unchanged before pooling.
        self.conv1 = nn.Conv1d(num_features_ts, narrow, kernel_size=7, padding=3)
        self.bn1 = nn.BatchNorm1d(narrow)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(narrow, medium, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(medium)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv1d(medium, wide, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(wide)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        # nn.LSTM only applies dropout between stacked layers, so it is
        # switched off for a single-layer network.
        inter_layer_dropout = 0 if rnn_layers <= 1 else dropout_rate
        self.lstm = nn.LSTM(
            input_size=wide,
            hidden_size=rnn_hidden_size,
            num_layers=rnn_layers,
            bidirectional=True,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.attention = Attention(feature_dim=rnn_feature_dim)

        # Metadata branch.
        self.meta_fc1 = nn.Linear(num_meta_features, meta_hidden_dim * 2)
        self.meta_fc2 = nn.Linear(meta_hidden_dim * 2, meta_hidden_dim)

        # Classifier head on the concatenated representation.
        classifier_input_dim = rnn_feature_dim + meta_hidden_dim
        head_hidden_dim = classifier_input_dim // 2
        self.fc_combine1 = nn.Linear(classifier_input_dim, head_hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc_final = nn.Linear(head_hidden_dim, num_classes)

    def forward(self, x_ts, x_meta):
        """Compute class logits.

        Args:
            x_ts: time series of shape ``(batch, num_features_ts, seq_len)``.
            x_meta: metadata of shape ``(batch, num_meta_features)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        conv_stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        features = x_ts
        for conv, norm, pool in conv_stages:
            features = pool(F.relu(norm(conv(features))))

        # Conv1d is channels-first, the batch_first LSTM wants time second:
        # (batch, channels, time) -> (batch, time, channels).
        sequence = features.transpose(1, 2)
        lstm_out, _ = self.lstm(sequence)

        # The attention weights are not needed for classification.
        context_vector, _ = self.attention(lstm_out)

        meta_hidden = F.relu(self.meta_fc1(x_meta))
        meta_embedding = F.relu(self.meta_fc2(meta_hidden))

        fused = torch.cat((context_vector, meta_embedding), dim=1)
        hidden = self.dropout(F.relu(self.fc_combine1(fused)))
        return self.fc_final(hidden)

In [None]:
class PatientDataset(Dataset):
    """Dataset pairing one time-series CSV per manifest row with metadata.

    Each item is a triple ``(ts, meta, label)``:

    * ``ts``    - float32 tensor of shape ``(num_ts_features, max_seq_len)``
      read from the CSV named in the manifest, truncated / zero-padded on
      both axes.
    * ``meta``  - float32 tensor with one value per entry of ``meta_cols``
      (categorical columns label-encoded, numerical columns standardised).
    * ``label`` - 0-dim int64 tensor, the encoded 'Folder Path'.

    Encoders and scalers are fitted when ``is_train=True`` and exposed as
    ``meta_label_encoders`` / ``meta_scalers`` so that a validation dataset
    can reuse them via the constructor arguments of the same name.

    NOTE(review): the metadata includes a column whose name translates to
    "diagnosis determined by the physician (0-5)". If that value is related
    to the target ('Folder Path'), feeding it to the model leaks the label -
    confirm with the data owner.
    """

    def __init__(self, manifest_df, data_dir, target_encoder,
                 meta_label_encoders=None,
                 meta_scalers=None,
                 max_seq_len=MAX_SEQ_LEN,
                 is_train=True,
                 num_ts_features=64):
        """Prepare the manifest (encoding, scaling) for item access.

        Args:
            manifest_df: one row per sample; must contain the data-file-name
                column, 'Folder Path' and every column in ``meta_cols``.
                It is copied, the caller's frame is not modified.
            data_dir: directory that contains the time-series CSV files.
            target_encoder: already fitted encoder exposing ``transform``;
                applied to 'Folder Path'.
            meta_label_encoders: ``{column: fitted LabelEncoder}``; required
                when ``is_train`` is False, ignored otherwise.
            meta_scalers: ``{column: fitted StandardScaler}``; required when
                ``is_train`` is False, ignored otherwise.
            max_seq_len: number of time steps every series is forced to.
            is_train: fit new encoders/scalers on this manifest when True,
                reuse the supplied ones when False.
            num_ts_features: number of columns every series is forced to.

        Raises:
            ValueError: if ``is_train`` is False and either
                ``meta_label_encoders`` or ``meta_scalers`` is missing.
        """
        self.manifest_df = manifest_df.copy()
        self.data_dir = data_dir
        self.max_seq_len = max_seq_len
        self.num_ts_features = num_ts_features
        self.target_encoder = target_encoder

        # Column names are Russian: sex, age in full years, patient off/on
        # state, physician-determined diagnosis (0-5).
        self.meta_cols = ['пол', 'полных лет', 'пациент off/on', 'определенный диагноз врачом (0-5)']
        self.categorical_meta_cols = ['пол', 'пациент off/on']
        self.numerical_meta_cols = ['полных лет', 'определенный диагноз врачом (0-5)']

        if is_train:
            self.meta_label_encoders = {col: LabelEncoder() for col in self.categorical_meta_cols}
            # Real scalers: the previous placeholder mapped each column name
            # to itself, so numerical metadata was never standardised.
            self.meta_scalers = {col: StandardScaler() for col in self.numerical_meta_cols}
        else:
            if meta_label_encoders is None or meta_scalers is None:
                raise ValueError("meta_label_encoders and meta_scalers must be provided when is_train=False")
            self.meta_label_encoders = meta_label_encoders
            self.meta_scalers = meta_scalers

        self._encode_categorical(is_train)
        self._scale_numerical(is_train)

        self.manifest_df['Folder Path Encoded'] = self.target_encoder.transform(self.manifest_df['Folder Path'])

    def _encode_categorical(self, is_train):
        """Label-encode the categorical metadata columns in place."""
        for col in self.categorical_meta_cols:
            le = self.meta_label_encoders[col]
            as_text = self.manifest_df[col].astype(str)
            if is_train:
                self.manifest_df[col] = le.fit_transform(as_text)
            else:
                # One dictionary lookup per row instead of one
                # ``le.transform`` call per row; categories that were not
                # seen during fitting are encoded as -1.
                code_of = {cls: code for code, cls in enumerate(le.classes_)}
                self.manifest_df[col] = as_text.map(code_of).fillna(-1).astype(int)

    def _scale_numerical(self, is_train):
        """Coerce, standardise and NaN-fill the numerical metadata columns."""
        for col in self.numerical_meta_cols:
            numeric = pd.to_numeric(self.manifest_df[col], errors='coerce')
            values = numeric.to_numpy(dtype=np.float64).reshape(-1, 1)
            scaler = self.meta_scalers[col]
            # Objects without ``transform`` (e.g. the column-name
            # placeholders older versions of this class produced) are
            # tolerated: the column is then only coerced and NaN-filled.
            if len(values) > 0 and hasattr(scaler, 'transform'):
                # StandardScaler ignores NaN when fitting and keeps it when
                # transforming, so missing values survive until the fill.
                values = scaler.fit_transform(values) if is_train else scaler.transform(values)
            # With a fitted scaler, 0 is the training mean of the column.
            self.manifest_df[col] = np.nan_to_num(values.ravel(), nan=0.0)

    def __len__(self):
        """Return the number of manifest rows."""
        return len(self.manifest_df)

    def _zero_series(self):
        """Return the one-row all-zero fallback series."""
        return np.zeros((1, self.num_ts_features), dtype=np.float32)

    def _load_time_series(self, ts_data_path):
        """Read one CSV into a 2-D float32 array.

        Non-numeric and missing cells become 0. Any read failure is logged
        and answered with a single all-zero row, so one bad file does not
        abort an epoch.
        """
        try:
            frame = pd.read_csv(ts_data_path, header=None, na_values=['NA', '?', '', ' '])
            frame = frame.apply(pd.to_numeric, errors='coerce').fillna(0)
            # ``DataFrame.values`` is always 2-D, no rank check is needed.
            return frame.values.astype(np.float32)
        except FileNotFoundError:
            print(f"ERROR: File not found {ts_data_path}. Returning zeros.")
        except pd.errors.EmptyDataError:
            print(f"ERROR: File {ts_data_path} is empty. Returning zeros.")
        except ValueError as ve:
            print(f"ERROR: ValueError converting data in {ts_data_path} to numeric: {ve}. Check for non-numeric values. Returning zeros.")
        except Exception as e:
            print(f"ERROR: Could not read or process {ts_data_path}: {e}. Returning zeros.")
        return self._zero_series()

    def _fit_to_shape(self, ts_data):
        """Truncate / zero-pad to ``(max_seq_len, num_ts_features)``."""
        rows = min(ts_data.shape[0], self.max_seq_len)
        cols = min(ts_data.shape[1], self.num_ts_features)
        fitted = np.zeros((self.max_seq_len, self.num_ts_features), dtype=np.float32)
        fitted[:rows, :cols] = ts_data[:rows, :cols]
        return fitted

    def __getitem__(self, idx):
        """Return ``(ts_tensor, meta_tensor, label_tensor)`` for row ``idx``."""
        row = self.manifest_df.iloc[idx]
        file_name = row['имя файла данных']  # "data file name"
        label = row['Folder Path Encoded']

        ts_data = self._load_time_series(os.path.join(self.data_dir, file_name))
        ts_data = self._fit_to_shape(ts_data)

        # (time, features) -> (features, time), the channels-first layout
        # expected by Conv1d.
        ts_data_tensor = torch.from_numpy(ts_data).T.contiguous()

        meta_features_values = row[self.meta_cols].values
        try:
            meta_features = torch.from_numpy(meta_features_values.astype(np.float32))
        except (ValueError, TypeError):
            print(f"Warning: Could not convert meta features to float for row {idx}: {meta_features_values}. Using zeros.")
            meta_features = torch.zeros(len(self.meta_cols), dtype=torch.float32)

        return ts_data_tensor, meta_features, torch.tensor(int(label), dtype=torch.long)


In [None]:


def train_model(model, train_loader, criterion, optimizer, device, epochs, val_loader=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: module called as ``model(ts_inputs, meta_inputs)``.
        train_loader: sized iterable yielding ``(ts, meta, labels)`` batches,
            or ``None`` for a batch that must be skipped.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser over the model parameters.
        device: device every batch is moved to.
        epochs: number of passes over ``train_loader``.
        val_loader: optional loader; when given, ``evaluate_model`` is run
            on it after every epoch.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``
        (mean per processed batch, ``inf`` if none was processed) and
        ``train_acc`` (percent), plus ``val_loss`` / ``val_acc`` when a
        validation loader was supplied.
    """
    history = []
    num_batches = len(train_loader)
    # Report progress roughly five times per epoch.
    log_every = max(1, num_batches // 5)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        # Counted while training. The previous implementation re-iterated
        # the whole loader twice at the end of each epoch just to count the
        # ``None`` batches, which tripled the data-loading work.
        batches_used = 0

        for i, batch_data in enumerate(train_loader):
            if batch_data is None:
                print(f"Skipping a problematic batch at index {i}")
                continue
            ts_inputs, meta_inputs, labels = batch_data
            ts_inputs, meta_inputs, labels = ts_inputs.to(device), meta_inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(ts_inputs, meta_inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batches_used += 1
            running_loss += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            if (i + 1) % log_every == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Batch [{i+1}/{num_batches}], Loss: {loss.item():.4f}")

        epoch_loss = running_loss / batches_used if batches_used > 0 else float('inf')
        epoch_acc = 100 * correct_train / total_train if total_train > 0 else 0
        print(f"Epoch [{epoch+1}/{epochs}] Training Loss: {epoch_loss:.4f}, Training Acc: {epoch_acc:.2f}%")

        record = {'epoch': epoch + 1, 'train_loss': epoch_loss, 'train_acc': epoch_acc}

        # ``is not None`` rather than truthiness: the truth value of a
        # loader depends on ``len()``, which not every loader supports.
        if val_loader is not None:
            val_loss, val_acc = evaluate_model(model, val_loader, criterion, device, epoch, epochs)
            record['val_loss'] = val_loss
            record['val_acc'] = val_acc

        history.append(record)

    return history


def evaluate_model(model, val_loader, criterion, device, epoch=None, epochs=None):
    """Compute mean loss and accuracy of ``model`` on ``val_loader``.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: module called as ``model(ts_inputs, meta_inputs)``.
        val_loader: iterable yielding ``(ts, meta, labels)`` batches, or
            ``None`` for a batch that must be skipped.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device every batch is moved to.
        epoch: zero-based epoch index, used only to prefix the printed line.
        epochs: total number of epochs, used only in the printed line.

    Returns:
        ``(avg_loss, accuracy)``: the mean loss per processed batch
        (``inf`` if no batch was processed) and the accuracy in percent.
    """
    model.eval()
    running_loss = 0.0
    correct_val = 0
    total_val = 0
    # Counted in the loop. The previous implementation iterated the loader
    # two more times afterwards only to count the ``None`` batches.
    batches_used = 0

    with torch.no_grad():
        for batch_data in val_loader:
            if batch_data is None:
                continue
            ts_inputs, meta_inputs, labels = batch_data
            ts_inputs, meta_inputs, labels = ts_inputs.to(device), meta_inputs.to(device), labels.to(device)
            outputs = model(ts_inputs, meta_inputs)
            loss = criterion(outputs, labels)

            batches_used += 1
            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_loss = running_loss / batches_used if batches_used > 0 else float('inf')
    accuracy = 100 * correct_val / total_val if total_val > 0 else 0
    epoch_str = f"Epoch [{epoch+1}/{epochs}] " if epoch is not None else ""
    print(f"{epoch_str}Validation Loss: {avg_loss:.4f}, Validation Acc: {accuracy:.2f}%")
    return avg_loss, accuracy



In [None]:
# ---------------------------------------------------------------------------
# Load the manifest, split it 80/20 (stratified by class) and build the
# training and validation datasets.
# ---------------------------------------------------------------------------
MANIFEST_FILE = "train.csv"
DATA_DIR = "data"

# Errors below stop the cell with ``raise SystemExit``. Calling ``exit()``
# from a notebook tears down the kernel instead of just aborting the cell.
try:
    manifest_df = pd.read_csv(MANIFEST_FILE, encoding='utf-8')
    print(f"Successfully loaded '{MANIFEST_FILE}'. Shape: {manifest_df.shape}")

except FileNotFoundError:
    print(f"'{MANIFEST_FILE}' not found.")
    user_choice = input("Create dummy files for testing? (yes/no): ").strip().lower()
    if user_choice != 'yes':
        raise SystemExit(
            "Exiting. Please provide a 'train.csv' and data files in './data/' directory.")

    # The dummy-data generator is not defined in this part of the notebook;
    # look it up so that its absence gives a clear message, not a NameError.
    _make_dummy_files = globals().get('create_dummy_files_for_testing')
    if _make_dummy_files is None:
        raise SystemExit(
            "create_dummy_files_for_testing() is not defined; run the cell that defines it "
            f"or provide '{MANIFEST_FILE}' and the data files.")

    manifest_df = _make_dummy_files(data_dir=DATA_DIR, manifest_file=MANIFEST_FILE)
    # No ``global`` statement here: this is module scope, where it is a
    # no-op in a notebook cell and a SyntaxError if the notebook is exported
    # to one script (the name is assigned before the declaration).
    NUM_CLASSES = manifest_df['Folder Path'].nunique()
    print(f"Updated NUM_CLASSES to {NUM_CLASSES} based on dummy data.")
except Exception as e:
    raise SystemExit(f"Error loading '{MANIFEST_FILE}': {e}") from e

print(f"Using {len(manifest_df)} samples from manifest.")
if 'Folder Path' not in manifest_df.columns:
    raise SystemExit("ERROR: 'Folder Path' column missing in manifest. Exiting.")

# The manifest, not the configured constant, decides the number of logits.
actual_num_classes = manifest_df['Folder Path'].nunique()
if NUM_CLASSES != actual_num_classes:
    print(
        f"Warning: Initial NUM_CLASSES was {NUM_CLASSES}, but found {actual_num_classes} unique 'Folder Path' values. Adjusting NUM_CLASSES.")
    NUM_CLASSES = actual_num_classes

# The target encoder is fitted on the full manifest so that the training
# and validation datasets share one class-to-index mapping.
target_label_encoder = LabelEncoder()
target_label_encoder.fit(manifest_df['Folder Path'])

# Stratified split keeps the class proportions equal in both parts.
train_df, val_df = train_test_split(
    manifest_df,
    test_size=0.2,
    random_state=42,
    stratify=manifest_df['Folder Path']
)

train_dataset = PatientDataset(
    manifest_df=train_df,
    data_dir=DATA_DIR,
    target_encoder=target_label_encoder,
    max_seq_len=MAX_SEQ_LEN,
    is_train=True
)

# Metadata preprocessing objects come from the training split only and are
# reused for validation, so no validation statistics leak into them.
fitted_meta_label_encoders = train_dataset.meta_label_encoders
fitted_meta_scalers = train_dataset.meta_scalers

val_dataset = PatientDataset(
    manifest_df=val_df,
    data_dir=DATA_DIR,
    target_encoder=target_label_encoder,
    meta_label_encoders=fitted_meta_label_encoders,
    meta_scalers=fitted_meta_scalers,
    max_seq_len=MAX_SEQ_LEN,
    is_train=False
)


def collate_fn_skip_none(batch):
    """Collate a list of samples, ignoring entries that are ``None``.

    Args:
        batch: list of samples as produced by the dataset; individual
            entries may be ``None``.

    Returns:
        The result of PyTorch's default collate on the remaining samples,
        or ``None`` when no sample remains.
    """
    usable = [sample for sample in batch if sample is not None]
    if len(usable) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(usable)


# ---------------------------------------------------------------------------
# Build the data loaders and the model, then run training.
# ---------------------------------------------------------------------------

# Seed PyTorch so that weight initialisation and the shuffling order are
# repeatable across "Restart & Run All"; the value matches the random_state
# used for the train/validation split.
torch.manual_seed(42)

# Page-locked host memory only speeds up host-to-GPU copies. On a CPU-only
# machine it brings no benefit and PyTorch warns about it.
_pin_memory = DEVICE.type == "cuda"

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=2, pin_memory=_pin_memory, collate_fn=collate_fn_skip_none)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=2, pin_memory=_pin_memory, collate_fn=collate_fn_skip_none)

print(
    f"Number of training samples (approx, after potential skips): {len(train_df)}")
print(
    f"Number of validation samples (approx, after potential skips): {len(val_df)}")

num_meta_features = len(train_dataset.meta_cols)
model = Hybrid_CNN_RNN_Model(
    # Must equal the feature width the dataset pads/truncates every series to.
    num_features_ts=64,
    num_meta_features=num_meta_features,
    num_classes=NUM_CLASSES

).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

print(
    f"Model created. Number of classes: {NUM_CLASSES}. Using device: {DEVICE}")
print(
    f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

print("Starting training...")
train_model(model, train_loader, criterion, optimizer,
            DEVICE, EPOCHS, val_loader=val_loader)

print("Training finished.")