In [None]:
# Colab-only cell: opens the browser upload dialog. The cell output below shows the
# C-MAPSS RUL_/test_/train_ FD001..FD004 text files being saved by this call.
from google.colab import files
uploaded = files.upload()

Saving RUL_FD001.txt to RUL_FD001.txt
Saving RUL_FD002.txt to RUL_FD002.txt
Saving RUL_FD003.txt to RUL_FD003.txt
Saving RUL_FD004.txt to RUL_FD004.txt
Saving test_FD001.txt to test_FD001.txt
Saving test_FD002.txt to test_FD002.txt
Saving test_FD003.txt to test_FD003.txt
Saving test_FD004.txt to test_FD004.txt
Saving train_FD001.txt to train_FD001.txt
Saving train_FD002.txt to train_FD002.txt
Saving train_FD003.txt to train_FD003.txt
Saving train_FD004.txt to train_FD004.txt


In [None]:
#STAR implementation (encoder+two-stage masked decoder+residuals+cross-attention)
#Colab-ready version: expects you to manually upload FD train/test txt files to /content/
import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F

In [None]:
# ---------------- Utilities ----------------
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # The CUDA generators are only touched when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def rmse(y_true, y_pred):
    """Root-mean-square error between targets and predictions.

    Args:
        y_true: Ground-truth values; any array-like (NumPy array, list, tuple, scalar).
        y_pred: Predicted values; any array-like that broadcasts against ``y_true``.

    Returns:
        The RMSE as a built-in ``float``.
    """
    # np.asarray lets plain Python sequences through; `list - list` would raise TypeError.
    err = np.asarray(y_true) - np.asarray(y_pred)
    return float(np.sqrt(np.mean(err ** 2)))

def nasa_score(y_true, y_pred):
    """Asymmetric exponential score summed over all (target, prediction) pairs.

    With ``d = y_pred - y_true``, a negative ``d`` contributes ``exp(-d / 13) - 1``
    and a non-negative ``d`` contributes ``exp(d / 10) - 1``.

    Args:
        y_true: Iterable of ground-truth values.
        y_pred: Iterable of predictions, paired element-wise with ``y_true``.

    Returns:
        The accumulated score as a built-in ``float`` (0.0 for empty input).
    """
    total = 0.0
    for target, estimate in zip(y_true, y_pred):
        err = estimate - target
        exponent = -err / 13.0 if err < 0 else err / 10.0
        total += math.exp(exponent) - 1.0
    return float(total)

def positional_encoding(max_len, d_model, device):
    """Build a sinusoidal positional-encoding table.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold ``cos(pos * w_k)``
    with ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        max_len: Number of positions (rows).
        d_model: Feature width (columns); may be even or odd.
        device: Device on which the table is created.

    Returns:
        Float tensor of shape ``(max_len, d_model)``.
    """
    pe = torch.zeros(max_len, d_model, device=device)
    position = torch.arange(0, max_len, dtype=torch.float32, device=device).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32, device=device) * (-math.log(10000.0) / d_model)
    )
    angles = position * div_term  # (max_len, ceil(d_model / 2))
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one cosine column fewer than sine columns; slicing keeps
    # the assignment shape-compatible (the unsliced version raised a size mismatch).
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe  # (max_len, d_model)


In [None]:
# ---------------- Data helpers ----------------
def load_cmapss(path):
    """Read a whitespace-separated, header-less C-MAPSS file and keep the columns used for modelling.

    The 26 raw columns are named ``id``, ``cycle``, ``op1..op3`` and ``s1..s21``.

    Args:
        path: Path of the text file to read.

    Returns:
        DataFrame with ``id``, ``cycle``, ``op1..op3`` and the 14 selected sensor columns.
    """
    op_names = ["op1", "op2", "op3"]
    frame = pd.read_csv(path, sep=r'\s+', header=None)
    frame.columns = ["id", "cycle"] + op_names + ["s%d" % k for k in range(1, 22)]

    # Only these 14 sensors are kept (sensor numbers as chosen in the article).
    selected_sensors = (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21)

    # Keep id, cycle, op1-3 and the selected sensors, in that order.
    kept = ["id", "cycle"] + op_names + ["s%d" % k for k in selected_sensors]
    return frame[kept]

def create_train_windows(df, window, max_rul=125, stride=1):
    """Cut every engine's run into fixed-length sliding windows with capped RUL targets.

    Engines are processed in order of first appearance in ``df``; rows of an engine are
    sorted by ``cycle``. Every column whose name starts with ``'s'`` is used as a feature.
    The target of a window is the capped RUL of its last row, where the RUL of row ``i``
    (0-based) of an engine with ``T`` rows is ``min(T - i, max_rul)``.

    Args:
        df: DataFrame with ``id``, ``cycle`` and feature columns starting with ``'s'``.
        window: Window length in cycles (>= 1).
        max_rul: Upper cap applied to the RUL targets.
        stride: Step between consecutive window end positions (>= 1).

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_windows, window, n_features)`` and ``y`` is a
        float32 vector of length ``n_windows``.

    Raises:
        ValueError: If ``window`` or ``stride`` is smaller than 1, or if no engine has at
            least ``window`` cycles (so no window can be built).
    """
    if window < 1 or stride < 1:
        raise ValueError(f"window and stride must be >= 1 (got window={window}, stride={stride})")

    sensor_cols = [c for c in df.columns if c.startswith('s')]
    X_windows, y_windows = [], []
    # sort=False keeps engines in order of first appearance, like iterating df['id'].unique().
    for _, sub in df.groupby('id', sort=False):
        sub = sub.sort_values('cycle')
        T = len(sub)
        # NOTE(review): the last recorded cycle gets RUL 1 (T - i with i = T - 1), not 0;
        # confirm this is the intended labelling convention.
        rul_all = np.minimum(np.arange(T, 0, -1), max_rul)
        sensors = sub[sensor_cols].values
        for end in range(window, T + 1, stride):
            X_windows.append(sensors[end - window:end, :])
            y_windows.append(rul_all[end - 1])

    if not X_windows:
        # np.stack([]) would fail with an unrelated-looking message; say what is wrong.
        raise ValueError(f"no engine has at least window={window} cycles; cannot build training windows")
    return np.stack(X_windows), np.array(y_windows, dtype=np.float32)

class CMapssWindowDataset(Dataset):
    """In-memory dataset pairing each window with its target; both are stored as float32."""

    def __init__(self, X, y):
        # astype returns converted copies, so the caller's arrays are left untouched.
        self.X, self.y = X.astype(np.float32), y.astype(np.float32)

    def __len__(self):
        # One sample per target value.
        return len(self.y)

    def __getitem__(self, idx):
        window, target = self.X[idx], self.y[idx]
        return window, target


In [None]:
# ---------------- Patch embedding (dimension-wise) ----------------
class PatchEmbedDimWise(nn.Module):
    """Per-sensor patch embedding.

    The time axis of each sensor is cut into patches of ``patch_size`` steps and every
    patch is projected to ``d_model`` features by one shared linear layer. An optional
    learnable positional table of shape ``(n_sensors, n_patches, d_model)`` is added.
    """

    def __init__(self, window, n_sensors, patch_size, d_model, pos_learnable=True):
        super().__init__()
        self.window = window
        self.n_sensors = n_sensors
        self.P = patch_size
        self.d_model = d_model
        # Number of time patches; a trailing partial patch is padded in forward().
        self.n_patches = math.ceil(window / patch_size)
        # Shared projection: patch of P time steps -> d_model features.
        self.patch_proj = nn.Linear(self.P, d_model, bias=True)
        if not pos_learnable:
            self.pos_embed = None
        else:
            self.pos_embed = nn.Parameter(torch.zeros(self.n_sensors, self.n_patches, d_model))
            nn.init.trunc_normal_(self.pos_embed, std=0.02)

    def forward(self, x):
        """Embed ``x`` of shape ``(B, W, S)`` into ``(B, S, n_patches, d_model)``."""
        batch, steps, sensors = x.shape
        patch_len, n_patch = self.P, self.n_patches

        # Fill an incomplete last patch by repeating the final time step.
        missing = n_patch * patch_len - steps
        if missing > 0:
            tail = x[:, -1:, :].expand(-1, missing, -1)
            x = torch.cat((x, tail), dim=1)

        # (B, n_patches, P, S) -> (B, S, n_patches, P), then one row per patch.
        patches = x.view(batch, n_patch, patch_len, sensors).permute(0, 3, 1, 2).contiguous()
        projected = self.patch_proj(patches.view(batch * sensors * n_patch, patch_len))
        emb = projected.view(batch, sensors, n_patch, self.d_model)

        if self.pos_embed is None:
            return emb
        return emb + self.pos_embed.unsqueeze(0)  # broadcast over the batch axis

# ---------------- STAR Attention Block (two-stage) ----------------
class STARAttentionBlock(nn.Module):
    """Two-stage attention block: temporal attention, then sensor-wise attention, then FFN.

    Each stage is wrapped in a residual connection followed by LayerNorm (post-norm).
    Input and output have shape ``(B, S, T, d)``.
    """

    def __init__(self, d_model, nhead, ffn_dim=256, dropout=0.1):
        super().__init__()
        self.temporal_mha = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.temporal_norm = nn.LayerNorm(d_model)
        self.sensor_mha = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.sensor_norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ffn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ffn_dim, d_model)
        )
        self.final_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(B, S, T, d)``; returns the same shape."""
        B, S, T, d = x.shape

        # --- Stage 1: temporal attention (each sensor's patch sequence attends to itself) ---
        res1 = x
        # reshape (not view): merging the B and S axes fails with view() when x is
        # non-contiguous, e.g. when the caller passes a permuted tensor.
        x_flat = x.reshape(B * S, T, d)
        temp_out, _ = self.temporal_mha(x_flat, x_flat, x_flat)
        x = self.temporal_norm(res1 + temp_out.reshape(B, S, T, d))

        # --- Stage 2: sensor-wise attention (sensors attend to each other per time patch) ---
        res2 = x
        x_flat = x.permute(0, 2, 1, 3).reshape(B * T, S, d)
        sensor_out, _ = self.sensor_mha(x_flat, x_flat, x_flat)
        x = self.sensor_norm(res2 + sensor_out.reshape(B, T, S, d).permute(0, 2, 1, 3))

        # --- Position-wise feed-forward ---
        res3 = x
        ffn_out = self.ffn(x)
        x_out = self.final_norm(res3 + ffn_out)

        return x_out

# ---------------- Patch merging ----------------
class PatchMerging(nn.Module):
    """Halve the time axis by fusing each pair of neighbouring patches with a linear layer."""

    def __init__(self, d_model):
        super().__init__()
        self.proj = nn.Linear(d_model * 2, d_model)

    def forward(self, x):
        """Merge ``x`` of shape ``(B, S, T, d)`` into ``(B, S, T // 2, d)``.

        A single-patch (or empty) time axis is returned unchanged. For odd ``T`` the last
        patch is dropped before pairing.
        """
        B, S, T, d = x.shape
        if T <= 1:
            # Nothing to pair up.
            return x
        n_pairs = T // 2
        # Keeping 2 * n_pairs steps drops the unpaired last patch when T is odd. Folding
        # the pair axis into the feature axis puts patch 2k in the first d features and
        # patch 2k + 1 in the last d features of output row k.
        paired = x[:, :, : 2 * n_pairs, :].reshape(B, S, n_pairs, 2 * d)
        return self.proj(paired)  # (B, S, T // 2, d)

# ---------------- Encoder ----------------
class STAREncoder(nn.Module):
    """Multi-scale encoder: a stack of attention blocks per scale, with patch merging in between."""

    def __init__(self, n_scales, d_model, nhead, ffn_dim, dropout, n_layers_per_scale=4):
        super().__init__()
        self.n_scales = n_scales
        # layers[i] is the stack of attention blocks applied at scale i.
        self.layers = nn.ModuleList()
        for _ in range(n_scales):
            stack = nn.ModuleList()
            for _ in range(n_layers_per_scale):
                stack.append(STARAttentionBlock(d_model, nhead, ffn_dim, dropout))
            self.layers.append(stack)
        # patch_merging[i] moves features from scale i to scale i + 1.
        self.patch_merging = nn.ModuleList()
        for _ in range(n_scales - 1):
            self.patch_merging.append(PatchMerging(d_model))

    def forward(self, x):
        """Return the list of per-scale feature maps, first scale first."""
        features = []
        last_scale = self.n_scales - 1
        hidden = x
        for scale_idx in range(self.n_scales):
            for block in self.layers[scale_idx]:
                hidden = block(hidden)
            features.append(hidden)
            # No merge after the final scale.
            if scale_idx != last_scale:
                hidden = self.patch_merging[scale_idx](hidden)
        return features

# ---------------- Decoder two-stage block ----------------
class DecoderBlockTwoStage(nn.Module):
    """Decoder block: two-stage self-attention, cross-attention to the encoder, then FFN.

    Every sub-layer is followed by a residual connection and a LayerNorm. The decoder
    stream has shape ``(B, S, T_dec, d)``; the encoder memory is ``(B, L, d)``.
    """

    def __init__(self, d_model, nhead, ffn_dim=256, dropout=0.25):
        super().__init__()
        # Two-stage self-attention (temporal, then sensor-wise) sharing a single norm.
        self.temporal_mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True, dropout=dropout)
        self.sensor_mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True, dropout=dropout)
        self.self_attn_norm = nn.LayerNorm(d_model)

        # Cross-attention to the encoder features, with its own norm.
        self.cross_mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True, dropout=dropout)
        self.cross_attn_norm = nn.LayerNorm(d_model)

        # Position-wise feed-forward network, with its own norm.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ffn_dim), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(ffn_dim, d_model)
        )
        self.ffn_norm = nn.LayerNorm(d_model)

    def forward(self, dec, enc_kv, temporal_causal_mask=None):
        """Run one decoder block.

        Args:
            dec: Decoder stream of shape ``(B, S, T_dec, d)``.
            enc_kv: Encoder memory of shape ``(B, L, d)``, used as keys and values.
            temporal_causal_mask: Optional attention mask for the temporal stage.

        Returns:
            Tensor of shape ``(B, S, T_dec, d)``.
        """
        batch, n_sens, n_steps, width = dec.shape

        # --- 1. Self-attention: temporal stage feeds the sensor stage, one residual around both ---
        per_sensor = dec.reshape(batch * n_sens, n_steps, width)
        temporal_ctx, _ = self.temporal_mha(per_sensor, per_sensor, per_sensor,
                                            attn_mask=temporal_causal_mask)
        per_step = (
            temporal_ctx.view(batch, n_sens, n_steps, width)
            .permute(0, 2, 1, 3)
            .reshape(batch * n_steps, n_sens, width)
        )
        sensor_ctx, _ = self.sensor_mha(per_step, per_step, per_step)
        self_attn_out = sensor_ctx.view(batch, n_steps, n_sens, width).permute(0, 2, 1, 3)
        dec = self.self_attn_norm(dec + self_attn_out)

        # --- 2. Cross-attention: every (sensor, time) token queries the encoder memory ---
        queries = dec.reshape(batch, n_sens * n_steps, width)
        cross_ctx, _ = self.cross_mha(queries, enc_kv, enc_kv)
        dec = self.cross_attn_norm(dec + cross_ctx.view(batch, n_sens, n_steps, width))

        # --- 3. Feed-forward ---
        return self.ffn_norm(dec + self.ffn(dec))

# ---------------- Decoder ----------------
class STARDecoder(nn.Module):
    """Container for the per-scale decoder stacks, plus a helper to run one stack."""

    def __init__(self, n_scales, d_model, nhead, ffn_dim, dropout, n_layers_per_scale=2):
        super().__init__()
        self.n_scales = n_scales
        # blocks[i] is the stack of decoder blocks used at scale i.
        self.blocks = nn.ModuleList()
        for _ in range(n_scales):
            stack = nn.ModuleList()
            for _ in range(n_layers_per_scale):
                stack.append(DecoderBlockTwoStage(d_model, nhead, ffn_dim, dropout))
            self.blocks.append(stack)

    def forward(self, dec_in, enc_kv, blocks_for_scale):
        """Run ``blocks_for_scale`` in sequence on ``dec_in`` with a causal temporal mask.

        Args:
            dec_in: Decoder input of shape ``(B, S, T, d)``.
            enc_kv: Encoder memory passed to every block.
            blocks_for_scale: Iterable of blocks to apply, in order.

        Returns:
            Output of the last block (``dec_in`` itself if the iterable is empty).
        """
        _, _, n_steps, _ = dec_in.shape
        # True above the diagonal: position t may not attend to positions after t.
        future_mask = torch.ones((n_steps, n_steps), dtype=torch.bool, device=dec_in.device).triu(diagonal=1)
        hidden = dec_in
        for block in blocks_for_scale:
            hidden = block(hidden, enc_kv, temporal_causal_mask=future_mask)
        return hidden

# ---------------- Prediction head ----------------
class PredictionHead(nn.Module):
    """Regression head: pool each scale, refine it with its own MLP, then fuse all scales."""

    def __init__(self, d_model, ffn_dim, n_scales, dropout):
        super().__init__()
        self.n_scales = n_scales

        def two_layer_mlp(in_features, out_features):
            # Linear -> GELU -> Dropout -> Linear with hidden width ffn_dim.
            return nn.Sequential(
                nn.Linear(in_features, ffn_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(ffn_dim, out_features)
            )

        # One MLP per scale, then a fusion MLP over the concatenated scale vectors.
        self.scale_mlps = nn.ModuleList([two_layer_mlp(d_model, d_model) for _ in range(n_scales)])
        self.final_mlp = two_layer_mlp(d_model * n_scales, 1)

    def forward(self, dec_outputs):
        """Map a list of ``(B, S, T_i, d)`` decoder outputs to a ``(B,)`` prediction vector."""
        # Mean over the sensor and time axes gives one d-vector per sample and scale;
        # scale i is then passed through its own MLP.
        pooled = [
            self.scale_mlps[i](feat.mean(dim=(1, 2)))
            for i, feat in enumerate(dec_outputs)
        ]
        fused = self.final_mlp(torch.cat(pooled, dim=-1))
        return fused.view(-1)

# ---------------- Full STAR model ----------------
class STARModelFull(nn.Module):
    """Full STAR model: patch embedding, multi-scale encoder, per-scale decoder, prediction head.

    The decoder is run from the coarsest encoder scale to the finest. At the coarsest
    scale its input is a sinusoidal positional table; at every finer scale its input is
    the previous decoder output upsampled along the time axis.
    """

    def __init__(self, window, n_sensors, d_model, nhead, num_scales,
                 ffn_dim=256, patch_size=4, dropout=0.25,
                 encoder_layers_per_scale=4, decoder_layers_per_scale=2,
                 pos_learnable=True):
        super().__init__()
        self.patch_embed = PatchEmbedDimWise(window=window, n_sensors=n_sensors,
                                             patch_size=patch_size, d_model=d_model,
                                             pos_learnable=pos_learnable)
        self.encoder = STAREncoder(n_scales=num_scales, d_model=d_model, nhead=nhead,
                                   ffn_dim=ffn_dim, dropout=dropout,
                                   n_layers_per_scale=encoder_layers_per_scale)
        self.decoder = STARDecoder(n_scales=num_scales, d_model=d_model, nhead=nhead,
                                   ffn_dim=ffn_dim, dropout=dropout,
                                   n_layers_per_scale=decoder_layers_per_scale)
        self.pred_head = PredictionHead(d_model=d_model, ffn_dim=ffn_dim, n_scales=num_scales, dropout=dropout)

    @staticmethod
    def _upsample_time(dec_input, target_len):
        """Stretch the time axis (dim 2) of a ``(B, S, T, d)`` tensor to ``target_len``.

        Each step is first repeated ``target_len // T`` times. If that does not reach
        ``target_len`` exactly, the remainder is closed by linear interpolation. The
        input is returned unchanged when it is already at least ``target_len`` long.
        """
        B, S, T_prev, d = dec_input.shape
        if target_len <= T_prev:
            return dec_input
        dec_input = dec_input.repeat_interleave(target_len // T_prev, dim=2)
        T_rep = dec_input.shape[2]
        if T_rep != target_len:
            # F.interpolate(mode='linear') only accepts 3-D (N, C, L) input, so the batch
            # and sensor axes are folded together for the call and unfolded afterwards.
            flat = dec_input.permute(0, 1, 3, 2).reshape(B * S, d, T_rep)
            flat = F.interpolate(flat, size=target_len, mode='linear', align_corners=False)
            dec_input = flat.reshape(B, S, d, target_len).permute(0, 1, 3, 2)
        return dec_input

    def forward(self, x):
        """Predict one value per sample for ``x`` of shape ``(B, window, n_sensors)``."""
        emb = self.patch_embed(x)
        enc_feats = self.encoder(emb)

        dec_outs = []
        dec_input = None

        # Coarsest scale first, finest scale last.
        for i in reversed(range(len(enc_feats))):
            enc_feat = enc_feats[i]
            B, S, T, d = enc_feat.shape

            if dec_input is None:
                # Coarsest scale: the decoder starts from a positional table shared by
                # every sample and sensor.
                pe = positional_encoding(T, d, device=x.device)
                dec_input = pe.unsqueeze(0).unsqueeze(0).repeat(B, S, 1, 1)
            else:
                dec_input = self._upsample_time(dec_input, T)

            # Flatten (sensor, time) into one token axis for cross-attention.
            enc_kv_current = enc_feat.reshape(B, S * T, d)
            # The causal mask depends only on T, so build it once per scale.
            causal = torch.triu(torch.ones((T, T), dtype=torch.bool, device=x.device), diagonal=1)

            cur = dec_input
            for blk in self.decoder.blocks[i]:
                cur = blk(cur, enc_kv_current, temporal_causal_mask=causal)

            dec_outs.append(cur)
            dec_input = cur

        # Restore finest-to-coarsest order so index i matches encoder scale i.
        dec_outs = dec_outs[::-1]
        out = self.pred_head(dec_outs)
        return out




In [None]:
# ---------------- Training / Evaluation ----------------
def train_one_epoch(model, loader, optimizer, device, scaler, criterion, max_grad_norm=0.8):
    """Run one optimisation pass over ``loader`` with optional CUDA mixed precision.

    Args:
        model: Module to train; switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        optimizer: Optimiser stepped once per batch through ``scaler``.
        device: Device (string or ``torch.device``) the batches are moved to.
        scaler: Gradient scaler used for backward, unscaling and stepping.
        criterion: Loss callable ``criterion(preds, targets)``.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        Mean of the per-batch losses, or NaN if the loader yielded no batch.
    """
    model.train()
    # Autocast follows the device the batches actually live on, not merely whether
    # some GPU exists on the machine; computed once instead of once per batch.
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0.0
    n_batches = 0
    for xb, yb in tqdm(loader, desc="train"):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        with torch.amp.autocast('cuda', enabled=use_amp):
            preds = model(xb)
            loss = criterion(preds, yb)
        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so the threshold is in true units.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
        n_batches += 1
    if n_batches == 0:
        # Nothing was trained; avoid a ZeroDivisionError and make that visible.
        return float('nan')
    return total_loss / n_batches

def evaluate(model, loader, device, max_rul=125):
    """Compute RMSE and the asymmetric score of ``model`` on ``loader``.

    Predictions are clamped to ``[0, max_rul]`` before scoring.

    Args:
        model: Module to evaluate; switched to eval mode (and left in it).
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        device: Device the inputs are moved to.
        max_rul: Upper clamp applied to the predictions.

    Returns:
        Tuple ``(rmse, score)``.

    Raises:
        ValueError: If the loader yields no batch.
    """
    model.eval()
    ys, ps = [], []
    with torch.no_grad():
        for xb, yb in loader:
            preds = model(xb.to(device))
            # Clamp predictions to [0, max_rul] for stable scoring (the article truncates RUL).
            preds = preds.clamp(0.0, float(max_rul))
            # Targets are only needed on the host, so they are not sent to the device.
            ys.append(yb.detach().cpu().numpy())
            ps.append(preds.cpu().numpy())
    if not ys:
        raise ValueError("evaluate() received an empty loader; there is nothing to score")
    y_true, y_pred = np.concatenate(ys), np.concatenate(ps)
    return rmse(y_true, y_pred), nasa_score(y_true, y_pred)



In [None]:
# ---------------- Config ----------------
def get_fd_config(fd):
    """Assemble the hyper-parameter dictionary for C-MAPSS subset ``fd`` (1-4).

    Shared defaults are overlaid first with the subset's regularisation settings, then
    with its architecture/optimisation settings; remaining optional keys get defaults.

    Args:
        fd: Subset number, one of 1, 2, 3, 4.

    Returns:
        A new configuration dict.

    Raises:
        KeyError: If ``fd`` is not one of the known subsets.
    """
    # FD001/FD003 share one regularisation profile, FD002/FD004 the other.
    single_condition = {"dropout": 0.1, "weight_decay": 1e-4, "ffn_dim": 512}
    multi_condition = {"dropout": 0.1, "weight_decay": 1e-5, "ffn_dim": 1024}
    regularisation = {1: single_condition, 3: single_condition,
                      2: multi_condition, 4: multi_condition}

    fd_params = {
        1: {"window": 32, "batch_size": 32, "d_model": 128, "nhead": 1, "num_scales": 3,
            "lr": 0.0002, "epochs": 80, "patience": 30},
        2: {"window": 64, "batch_size": 64, "d_model": 64,  "nhead": 4, "num_scales": 4,
            "lr": 0.0002, "epochs": 100, "patience": 40},
        3: {"window": 48, "batch_size": 32, "d_model": 128, "nhead": 1, "num_scales": 1,
            "lr": 0.0002, "epochs": 80, "patience": 30},
        4: {"window": 64, "batch_size": 64, "d_model": 256, "nhead": 4, "num_scales": 4,
            "lr": 0.0002, "epochs": 100, "patience": 40},
    }

    cfg = {
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "data_dir": "/content",
        "ffn_dim": 256,
        "dropout": 0.25,
        "weight_decay": 1e-4,
        "save_dir": "/content/checkpoints_star_full",
        "max_rul": 125,
        "seed": 42,
    }
    cfg.update(regularisation.get(fd, {}))
    cfg.update(fd_params[fd])  # unknown subsets fail here with KeyError

    optional_defaults = (
        ("patch_size", 4),
        ("pos_learnable", True),
        ("optim_betas", (0.9, 0.999)),
        ("optim_eps", 1e-8),
        ("encoder_layers_per_scale", 4),
        ("decoder_layers_per_scale", 2),
    )
    for key, default in optional_defaults:
        cfg.setdefault(key, default)
    return cfg



In [None]:
# ---------------- Main ----------------
def main():
    """Train and evaluate one STAR model per C-MAPSS subset (FD001 to FD004).

    For each subset this loads the train/test/RUL files from ``cfg["data_dir"]``, builds
    sliding training windows and one last-window sample per test engine, min-max scales
    both with training statistics, trains for ``cfg["epochs"]`` epochs and writes the
    best checkpoint to ``cfg["save_dir"]``.

    NOTE(review): the validation split is currently disabled (all engines train), so the
    "best" checkpoint is chosen by test RMSE. The reported best test RMSE is therefore
    selected on the test set itself and is optimistically biased.
    """
    torch.backends.cudnn.benchmark = True
    for fd in range(1,5):
        cfg = get_fd_config(fd)
        set_seed(cfg.get("seed",42))
        device = cfg["device"]
        print("CUDA available:", torch.cuda.is_available())
        if torch.cuda.is_available():
            try:
                print("GPU device name:", torch.cuda.get_device_name(0))
            except Exception as exc:
                # Purely informational; report the problem instead of hiding it.
                print(f"Could not query GPU device name: {exc}")
        print(f"\n=== TRAIN FD00{fd} cfg: {cfg} ===")
        os.makedirs(cfg["save_dir"], exist_ok=True)
        scaler = torch.amp.GradScaler('cuda', enabled=(device != 'cpu'))

        # Load files (assumes they were uploaded to cfg["data_dir"]).
        data_dir = cfg.get("data_dir", "/content")
        train_path = os.path.join(data_dir, f"train_FD00{fd}.txt")
        test_path  = os.path.join(data_dir, f"test_FD00{fd}.txt")
        rul_path   = os.path.join(data_dir, f"RUL_FD00{fd}.txt")
        df_train = load_cmapss(train_path)
        X_tr, y_tr = create_train_windows(df_train, cfg["window"], cfg["max_rul"], stride=1)
        print("Train windows:", X_tr.shape, y_tr.shape)
        df_test = load_cmapss(test_path)
        r_test = pd.read_csv(rul_path, sep=r'\s+', header=None).values.flatten()

        # One sample per test engine: its last `window` cycles, front-padded with the
        # first row when the engine has fewer cycles than that.
        X_test_list, y_test_list = [], []
        for i, eid in enumerate(df_test['id'].unique()):
            sub = df_test[df_test['id']==eid].sort_values('cycle')
            T = len(sub)
            sensors = sub[[c for c in sub.columns if c.startswith('s')]].values
            if T >= cfg["window"]:
                x = sensors[-cfg["window"]:, :]
            else:
                pad = np.repeat(sensors[0:1,:], cfg["window"]-T, axis=0)
                x = np.vstack([pad, sensors])
            X_test_list.append(x)
            y_test_list.append(min(r_test[i], cfg["max_rul"]))
        X_test = np.stack(X_test_list)
        y_test = np.array(y_test_list, dtype=np.float32)

        # Normalization: per-sensor min-max computed on the training windows only.
        mins = X_tr.reshape(-1, X_tr.shape[-1]).min(axis=0)
        maxs = X_tr.reshape(-1, X_tr.shape[-1]).max(axis=0)
        eps = 1e-8
        X_tr = (X_tr - mins) / (maxs - mins + eps)
        X_test = (X_test - mins) / (maxs - mins + eps)

        # Datasets / loaders.
        train_ds = CMapssWindowDataset(X_tr, y_tr)
        test_ds  = CMapssWindowDataset(X_test, y_test)

        # --- Split BY ENGINE ID ---
        engine_ids = df_train['id'].unique()
        np.random.seed(cfg["seed"])  # for reproducibility
        np.random.shuffle(engine_ids)

        # Validation is disabled: every engine goes to training. To hold out 10% of the
        # engines, use engine_ids[:cut] / engine_ids[cut:] with cut = int(len(engine_ids) * 0.9).
        train_engines = set(engine_ids)
        val_engines = set()

        # Collect window indices per engine. The iteration order must match the order in
        # which create_train_windows emitted the windows (engines by first appearance).
        train_idx, val_idx = [], []
        current_idx = 0
        for eid, sub in df_train.groupby('id', sort=False):
            n_windows = max(0, len(sub) - cfg["window"] + 1)
            if n_windows <= 0:
                continue
            if eid in train_engines:
                train_idx.extend(range(current_idx, current_idx + n_windows))
            else:
                val_idx.extend(range(current_idx, current_idx + n_windows))
            current_idx += n_windows
        assert current_idx == len(train_ds), "window index bookkeeping is out of sync with the dataset"

        pin_mem = torch.cuda.is_available()
        train_loader = DataLoader(Subset(train_ds, train_idx), batch_size=cfg["batch_size"], shuffle=True, pin_memory=pin_mem, num_workers=2)
        # To enable validation, build a non-shuffled DataLoader over Subset(train_ds, val_idx).
        val_loader = None
        test_loader  = DataLoader(test_ds, batch_size=cfg["batch_size"], shuffle=False, pin_memory=pin_mem, num_workers=2)

        # Model & optimizer.
        n_sensors = X_tr.shape[-1]
        model = STARModelFull(window=cfg["window"], n_sensors=n_sensors, d_model=cfg["d_model"],
                              nhead=cfg["nhead"], num_scales=cfg["num_scales"], ffn_dim=cfg["ffn_dim"],
                              patch_size=cfg["patch_size"], dropout=cfg["dropout"],
                              encoder_layers_per_scale=cfg.get("encoder_layers_per_scale", 4),
                              decoder_layers_per_scale=cfg.get("decoder_layers_per_scale", 2),
                              pos_learnable=cfg.get("pos_learnable", True)
                             ).to(device)
        optimizer = optim.Adam(model.parameters(), lr=cfg["lr"],
                               betas=cfg.get("optim_betas",(0.9,0.999)),
                               eps=cfg.get("optim_eps",1e-8),
                               weight_decay=cfg.get("weight_decay",1e-4))
        criterion = nn.MSELoss()
        best_val = 1e9
        best_test_rmse = 1e9
        patience_counter = 0
        checkpoint_path = os.path.join(cfg["save_dir"], f"best_star_full_fd{fd}.pth")
        has_val = val_loader is not None and len(val_loader) > 0

        for epoch in range(1, cfg["epochs"]+1):
            print(f"Epoch {epoch}/{cfg['epochs']}")
            train_loss = train_one_epoch(model, train_loader, optimizer, device, scaler, criterion, max_grad_norm=1.0)

            if has_val:
                val_rmse, val_score = evaluate(model, val_loader, device, max_rul=cfg["max_rul"])
            else:
                val_rmse, val_score = 0, 0

            test_rmse, test_score = evaluate(model, test_loader, device, max_rul=cfg["max_rul"])
            print(f"Train loss {train_loss:.4f} | Val RMSE {val_rmse:.4f} | Test RMSE {test_rmse:.4f} | Score {test_score:.4f}")
            if has_val:
                # Early stopping on the validation set (only when a validation split exists).
                if val_rmse < best_val:
                    best_val = val_rmse
                    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict(), "cfg": cfg},
                               checkpoint_path)
                    patience_counter = 0
                else:
                    patience_counter += 1
                if patience_counter >= cfg["patience"]:
                    print("Early stopping triggered by Val RMSE")
                    break
            else:
                # No validation (100% of the data is used for training): train for all
                # epochs and keep the checkpoint with the lowest test RMSE. Early stopping
                # is not used here.
                if test_rmse < best_test_rmse:
                    best_test_rmse = test_rmse
                    print(f"New best Test RMSE {best_test_rmse:.4f} found. Saving model.")
                    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict(), "cfg": cfg},
                               checkpoint_path)
        print(f"=== FD00{fd} finished. Best Test RMSE observed: {best_test_rmse:.4f} ===\n")

In [None]:
# Entry guard: calls main(), which loops over subsets FD001..FD004 and trains a model on each.
if __name__=="__main__":
    main()


CUDA available: True
GPU device name: Tesla T4

=== TRAIN FD001 cfg: {'device': 'cuda', 'data_dir': '/content', 'ffn_dim': 512, 'dropout': 0.1, 'weight_decay': 0.0001, 'save_dir': '/content/checkpoints_star_full', 'max_rul': 125, 'seed': 42, 'window': 32, 'batch_size': 32, 'd_model': 128, 'nhead': 1, 'num_scales': 3, 'lr': 0.0002, 'epochs': 80, 'patience': 30, 'patch_size': 4, 'pos_learnable': True, 'optim_betas': (0.9, 0.999), 'optim_eps': 1e-08, 'encoder_layers_per_scale': 4, 'decoder_layers_per_scale': 2} ===
Train windows: (17531, 32, 14) (17531,)
Epoch 1/80


train: 100%|██████████| 548/548 [00:59<00:00,  9.19it/s]


Train loss 1218.1859 | Val RMSE 0.0000 | Test RMSE 18.1772 | Score 825.4312
New best Test RMSE 18.1772 found. Saving model.
Epoch 2/80


train: 100%|██████████| 548/548 [01:02<00:00,  8.81it/s]


Train loss 347.1893 | Val RMSE 0.0000 | Test RMSE 16.8360 | Score 749.9302
New best Test RMSE 16.8360 found. Saving model.
Epoch 3/80


train: 100%|██████████| 548/548 [01:00<00:00,  8.99it/s]


Train loss 264.8989 | Val RMSE 0.0000 | Test RMSE 18.9601 | Score 1043.2340
Epoch 4/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.01it/s]


Train loss 229.5023 | Val RMSE 0.0000 | Test RMSE 14.0306 | Score 286.2723
New best Test RMSE 14.0306 found. Saving model.
Epoch 5/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.05it/s]


Train loss 215.7692 | Val RMSE 0.0000 | Test RMSE 13.1437 | Score 265.4363
New best Test RMSE 13.1437 found. Saving model.
Epoch 6/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.93it/s]


Train loss 181.1311 | Val RMSE 0.0000 | Test RMSE 13.7172 | Score 364.6893
Epoch 7/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.11it/s]


Train loss 184.3422 | Val RMSE 0.0000 | Test RMSE 12.4465 | Score 227.3519
New best Test RMSE 12.4465 found. Saving model.
Epoch 8/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.07it/s]


Train loss 175.7590 | Val RMSE 0.0000 | Test RMSE 12.1273 | Score 206.4517
New best Test RMSE 12.1273 found. Saving model.
Epoch 9/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.03it/s]


Train loss 167.1484 | Val RMSE 0.0000 | Test RMSE 12.3305 | Score 223.3631
Epoch 10/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.93it/s]


Train loss 169.7615 | Val RMSE 0.0000 | Test RMSE 12.3294 | Score 242.4004
Epoch 11/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.02it/s]


Train loss 157.6523 | Val RMSE 0.0000 | Test RMSE 14.5988 | Score 424.4799
Epoch 12/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.04it/s]


Train loss 156.1715 | Val RMSE 0.0000 | Test RMSE 13.8294 | Score 366.7112
Epoch 13/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.87it/s]


Train loss 156.5555 | Val RMSE 0.0000 | Test RMSE 13.7927 | Score 328.4862
Epoch 14/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.04it/s]


Train loss 152.5798 | Val RMSE 0.0000 | Test RMSE 14.6084 | Score 265.6481
Epoch 15/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.98it/s]


Train loss 157.5649 | Val RMSE 0.0000 | Test RMSE 11.9619 | Score 188.8730
New best Test RMSE 11.9619 found. Saving model.
Epoch 16/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.03it/s]


Train loss 150.4185 | Val RMSE 0.0000 | Test RMSE 12.5955 | Score 251.1775
Epoch 17/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.96it/s]


Train loss 143.8847 | Val RMSE 0.0000 | Test RMSE 11.7216 | Score 190.9177
New best Test RMSE 11.7216 found. Saving model.
Epoch 18/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.04it/s]


Train loss 147.3918 | Val RMSE 0.0000 | Test RMSE 10.9529 | Score 165.4118
New best Test RMSE 10.9529 found. Saving model.
Epoch 19/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.12it/s]


Train loss 143.0836 | Val RMSE 0.0000 | Test RMSE 14.5359 | Score 267.1362
Epoch 20/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.02it/s]


Train loss 146.5092 | Val RMSE 0.0000 | Test RMSE 11.2077 | Score 183.8170
Epoch 21/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.05it/s]


Train loss 147.1171 | Val RMSE 0.0000 | Test RMSE 12.5743 | Score 241.5420
Epoch 22/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.07it/s]


Train loss 141.3297 | Val RMSE 0.0000 | Test RMSE 12.1160 | Score 220.7030
Epoch 23/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.08it/s]


Train loss 144.3185 | Val RMSE 0.0000 | Test RMSE 14.5904 | Score 382.6547
Epoch 24/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.11it/s]


Train loss 141.8507 | Val RMSE 0.0000 | Test RMSE 11.7993 | Score 198.9515
Epoch 25/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.98it/s]


Train loss 141.0287 | Val RMSE 0.0000 | Test RMSE 12.6078 | Score 217.7601
Epoch 26/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.10it/s]


Train loss 136.7066 | Val RMSE 0.0000 | Test RMSE 11.4847 | Score 204.7754
Epoch 27/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.08it/s]


Train loss 136.7712 | Val RMSE 0.0000 | Test RMSE 11.1457 | Score 172.4452
Epoch 28/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.08it/s]


Train loss 133.5488 | Val RMSE 0.0000 | Test RMSE 11.2280 | Score 188.6922
Epoch 29/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.95it/s]


Train loss 136.7117 | Val RMSE 0.0000 | Test RMSE 11.4332 | Score 191.1221
Epoch 30/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.98it/s]


Train loss 132.8161 | Val RMSE 0.0000 | Test RMSE 11.9206 | Score 220.8606
Epoch 31/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.11it/s]


Train loss 131.8982 | Val RMSE 0.0000 | Test RMSE 12.9328 | Score 237.3775
Epoch 32/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.96it/s]


Train loss 136.2645 | Val RMSE 0.0000 | Test RMSE 12.3205 | Score 243.4358
Epoch 33/80


train: 100%|██████████| 548/548 [00:59<00:00,  9.16it/s]


Train loss 132.9795 | Val RMSE 0.0000 | Test RMSE 12.3229 | Score 224.1430
Epoch 34/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.08it/s]


Train loss 131.4611 | Val RMSE 0.0000 | Test RMSE 11.5502 | Score 191.5890
Epoch 35/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.03it/s]


Train loss 128.3015 | Val RMSE 0.0000 | Test RMSE 11.0595 | Score 180.3090
Epoch 36/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.92it/s]


Train loss 129.2989 | Val RMSE 0.0000 | Test RMSE 11.6785 | Score 208.0228
Epoch 37/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.98it/s]


Train loss 125.3559 | Val RMSE 0.0000 | Test RMSE 12.5881 | Score 265.9653
Epoch 38/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.01it/s]


Train loss 127.6650 | Val RMSE 0.0000 | Test RMSE 12.8710 | Score 256.5134
Epoch 39/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.95it/s]


Train loss 126.4805 | Val RMSE 0.0000 | Test RMSE 10.9892 | Score 166.5118
Epoch 40/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.95it/s]


Train loss 121.9751 | Val RMSE 0.0000 | Test RMSE 11.1393 | Score 164.2804
Epoch 41/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.01it/s]


Train loss 124.8784 | Val RMSE 0.0000 | Test RMSE 11.5504 | Score 199.5166
Epoch 42/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.01it/s]


Train loss 124.2250 | Val RMSE 0.0000 | Test RMSE 11.8323 | Score 190.0679
Epoch 43/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.01it/s]


Train loss 120.9995 | Val RMSE 0.0000 | Test RMSE 11.0866 | Score 176.9250
Epoch 44/80


train: 100%|██████████| 548/548 [01:01<00:00,  8.93it/s]


Train loss 120.1709 | Val RMSE 0.0000 | Test RMSE 12.2837 | Score 249.4709
Epoch 45/80


train: 100%|██████████| 548/548 [01:00<00:00,  9.05it/s]


Train loss 119.8237 | Val RMSE 0.0000 | Test RMSE 12.3151 | Score 256.5555
Epoch 46/80


train: 100%|██████████| 548/548 [01:00<00:00,  8.99it/s]


Train loss 119.1085 | Val RMSE 0.0000 | Test RMSE 11.9211 | Score 207.3954
Epoch 47/80


train:  76%|███████▋  | 419/548 [00:47<00:13,  9.61it/s]