In [None]:
# Fetch the four data files from Google Drive by file id (per the recorded
# output: X_train_mosc_wembs.pickle, X_test_mosc_wembs.pickle,
# y_train_msk_merged_2.pkl, y_test_msk_merged_2.pkl), then install PyTorch Geometric.
# NOTE(review): the torch_geometric install is unpinned; the recorded run used 2.6.1.
!gdown 1q4UCHs2JvZZltJ1QzKydWogMBNUQhftm
!gdown 188hwfGDnLjFCQsQiJXKBSAm1cOJeprtF
!gdown 1tbxE-SZO5R37TiBscVZ2zbTGeTAIsUYk
!gdown 1ZI-JsZ-6QxtHJKnxj1Mggscxumodxm64
!pip install torch_geometric

Downloading...
From: https://drive.google.com/uc?id=1q4UCHs2JvZZltJ1QzKydWogMBNUQhftm
To: /content/X_train_mosc_wembs.pickle
100% 34.1M/34.1M [00:00<00:00, 96.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=188hwfGDnLjFCQsQiJXKBSAm1cOJeprtF
To: /content/X_test_mosc_wembs.pickle
100% 6.03M/6.03M [00:00<00:00, 60.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1tbxE-SZO5R37TiBscVZ2zbTGeTAIsUYk
To: /content/y_train_msk_merged_2.pkl
100% 788k/788k [00:00<00:00, 47.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZI-JsZ-6QxtHJKnxj1Mggscxumodxm64
To: /content/y_test_msk_merged_2.pkl
100% 140k/140k [00:00<00:00, 38.1MB/s]
Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1

In [None]:
import pandas as pd
# Load the pickled train/test features and targets downloaded above.
# NOTE(review): read_pickle can execute arbitrary code; only use with trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph

# === 1. Data loading and preparation ===

# Load the pickled feature frames and targets.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the per-column mean of each split.
# The filled column is assigned back: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas warns about and which no longer modifies
# the parent frame in pandas 3.0.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so a single graph covers both splits.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only.
X_all = df_all.values.astype(np.float32)

# Build the kNN graph (10 nearest neighbours, no self-loops).
# NOTE(review): kneighbors_graph output is not symmetric in general, so the
# resulting edges are directed - confirm this is intended.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and scaled targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class SimpleGCN(torch.nn.Module):
    """Two-layer graph network: GCNConv -> ReLU -> GCNConv.

    Args:
        in_channels: size of the input node features.
        hidden_channels: output size of the first convolution.
        out_channels: output size of the second convolution.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second convolution's output for every node."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# Model, optimiser and loss. The model emits one value per node (out_channels=1).
# NOTE(review): no random seed is set, so results differ between runs.
model = SimpleGCN(in_channels=X_all.shape[1], hidden_channels=64, out_channels=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: every epoch runs the whole graph (train and test nodes)
# through the model, but the loss is computed on the train-masked nodes only.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    # Predict for all nodes and map back to the original target scale.
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    # Same node order as the graph: train targets first, then test targets.
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 (these come from the start of the train split).
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE, computed on the original (unscaled) targets.
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 5.3641
Epoch 20, Loss: 0.3516
Epoch 40, Loss: 0.2118
Epoch 60, Loss: 0.1910
Epoch 80, Loss: 0.1839
Epoch 100, Loss: 0.1798
Epoch 120, Loss: 0.1764
Epoch 140, Loss: 0.1734
Epoch 160, Loss: 0.1708
Epoch 180, Loss: 0.1683
Epoch 200, Loss: 0.1660
Epoch 220, Loss: 0.1640
Epoch 240, Loss: 0.1621
Epoch 260, Loss: 0.1610
Epoch 280, Loss: 0.1590
Epoch 300, Loss: 0.1583
Epoch 320, Loss: 0.1553
Epoch 340, Loss: 0.1582
Epoch 360, Loss: 0.1530
Epoch 380, Loss: 0.1527
Epoch 400, Loss: 0.1519
Epoch 420, Loss: 0.1529
Epoch 440, Loss: 0.1512
Epoch 460, Loss: 0.1510
Epoch 480, Loss: 0.1489
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [19321666. 10567110. 11671585. 31074312. 32347326. 24625944. 16482421.
 33327422. 10402218. 15796223.]
Test RMSE: 7617069.39


# Simple GNN 500 epochs LR=0.001

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph

# === 1. Data loading and preparation ===

# Load the pickled feature frames and targets.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the per-column mean of each split.
# The filled column is assigned back: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas warns about and which no longer modifies
# the parent frame in pandas 3.0.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so a single graph covers both splits.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only.
X_all = df_all.values.astype(np.float32)

# Build the kNN graph (10 nearest neighbours, no self-loops).
# NOTE(review): kneighbors_graph output is not symmetric in general, so the
# resulting edges are directed - confirm this is intended.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and scaled targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class SimpleGCN(torch.nn.Module):
    """Minimal GCN regressor: one hidden GCNConv layer with ReLU, one output GCNConv.

    Args:
        in_channels: size of the input node features.
        hidden_channels: output size of the first convolution.
        out_channels: output size of the second convolution.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, ReLU, then conv2 and return the result."""
        activated = F.relu(self.conv1(x, edge_index))
        return self.conv2(activated, edge_index)

# Model, optimiser and loss. Same setup as the previous run except lr=0.001.
# NOTE(review): no random seed is set, so results differ between runs.
model = SimpleGCN(in_channels=X_all.shape[1], hidden_channels=64, out_channels=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: the whole graph is passed through the model each epoch,
# while the loss only uses the train-masked nodes.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    # Predict for all nodes and map back to the original target scale.
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    # Same node order as the graph: train targets first, then test targets.
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 (these come from the start of the train split).
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE, computed on the original (unscaled) targets.
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 0.9369
Epoch 20, Loss: 0.2764
Epoch 40, Loss: 0.2209
Epoch 60, Loss: 0.2038
Epoch 80, Loss: 0.1945
Epoch 100, Loss: 0.1883
Epoch 120, Loss: 0.1834
Epoch 140, Loss: 0.1797
Epoch 160, Loss: 0.1764
Epoch 180, Loss: 0.1734
Epoch 200, Loss: 0.1705
Epoch 220, Loss: 0.1680
Epoch 240, Loss: 0.1659
Epoch 260, Loss: 0.1642
Epoch 280, Loss: 0.1626
Epoch 300, Loss: 0.1611
Epoch 320, Loss: 0.1598
Epoch 340, Loss: 0.1586
Epoch 360, Loss: 0.1574
Epoch 380, Loss: 0.1563
Epoch 400, Loss: 0.1552
Epoch 420, Loss: 0.1543
Epoch 440, Loss: 0.1534
Epoch 460, Loss: 0.1525
Epoch 480, Loss: 0.1517
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [20517988. 10573108. 10621115. 29240722. 32322828. 26622184. 16462741.
 34351924. 10978047. 15486948.]
Test RMSE: 7548791.52


# GNN advanced

Несколько GNN-слоёв (GCN, GraphSAGE или GAT).

Нормализацию (BatchNorm).

Dropout.

Residual connection.

MLP в конце.

Меняем архитектуру модели: комбинация GraphSAGE + BatchNorm + Dropout + MLP:

## 500 EPOCHS LR=0.001 DROPOUT=0.4

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import SAGEConv, BatchNorm

# === 1. Data loading and preparation ===

# Load the pickled feature frames and targets.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the per-column mean of each split.
# The filled column is assigned back: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas warns about and which no longer modifies
# the parent frame in pandas 3.0.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so a single graph covers both splits.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only.
X_all = df_all.values.astype(np.float32)

# Build the kNN graph (10 nearest neighbours, no self-loops).
# NOTE(review): kneighbors_graph output is not symmetric in general, so the
# resulting edges are directed - confirm this is intended.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and scaled targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class DeepGNN(torch.nn.Module):
    """Three SAGEConv -> BatchNorm -> ReLU stages followed by a two-layer MLP head.

    Dropout with probability ``dropout`` is applied after the first two stages
    and inside the MLP head.

    Args:
        in_channels: size of the input node features.
        hidden_channels: width of every graph stage.
        out_channels: size of the final output per node.
        dropout: dropout probability (default 0.3).
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Graph stages, registered in the order conv1, bn1, conv2, bn2, conv3, bn3.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        # Regression head: hidden -> hidden // 2 -> out.
        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index):
        """Run the three graph stages and the MLP head; return one row per node."""
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # The last stage has no functional dropout before the head.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))
        return self.mlp(h)

# Model, optimiser and loss. dropout=0.4 is passed explicitly and overrides the
# class default of 0.3.
# NOTE(review): no random seed is set, so results differ between runs.
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: the whole graph is passed through the model each epoch,
# while the loss only uses the train-masked nodes.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    # Predict for all nodes and map back to the original target scale.
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    # Same node order as the graph: train targets first, then test targets.
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 (these come from the start of the train split).
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE, computed on the original (unscaled) targets.
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 0.9985
Epoch 20, Loss: 0.2003
Epoch 40, Loss: 0.1539
Epoch 60, Loss: 0.1385
Epoch 80, Loss: 0.1274
Epoch 100, Loss: 0.1175
Epoch 120, Loss: 0.1101
Epoch 140, Loss: 0.1072
Epoch 160, Loss: 0.1022
Epoch 180, Loss: 0.0992
Epoch 200, Loss: 0.0964
Epoch 220, Loss: 0.0920
Epoch 240, Loss: 0.0902
Epoch 260, Loss: 0.0912
Epoch 280, Loss: 0.0881
Epoch 300, Loss: 0.0850
Epoch 320, Loss: 0.0834
Epoch 340, Loss: 0.0828
Epoch 360, Loss: 0.0802
Epoch 380, Loss: 0.0788
Epoch 400, Loss: 0.0786
Epoch 420, Loss: 0.0742
Epoch 440, Loss: 0.0742
Epoch 460, Loss: 0.0738
Epoch 480, Loss: 0.0731
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [17263888. 10534946. 14193814. 30599792. 29395228. 25365090. 13391490.
 52319568. 10345269. 13846885.]
Test RMSE: 5082855.62


## 700 EPOCHS LR=0.001 DROPOUT=0.4 ** BEST **

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import SAGEConv, BatchNorm

# === 1. Data loading and preparation ===

# Load the pickled feature frames and targets.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the per-column mean of each split.
# The filled column is assigned back: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas warns about and which no longer modifies
# the parent frame in pandas 3.0.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so a single graph covers both splits.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only.
X_all = df_all.values.astype(np.float32)

# Build the kNN graph (10 nearest neighbours, no self-loops).
# NOTE(review): kneighbors_graph output is not symmetric in general, so the
# resulting edges are directed - confirm this is intended.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and scaled targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class DeepGNN(torch.nn.Module):
    """GraphSAGE regressor: three conv/BatchNorm/ReLU stages plus an MLP head.

    Args:
        in_channels: size of the input node features.
        hidden_channels: width of every graph stage.
        out_channels: size of the final output per node.
        dropout: dropout probability used after the first two stages and in
            the head (default 0.3).
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)

        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)

        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        half = hidden_channels // 2
        head_layers = [
            torch.nn.Linear(hidden_channels, half),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(half, out_channels),
        ]
        self.mlp = torch.nn.Sequential(*head_layers)

    def forward(self, x, edge_index):
        """Apply the graph stages in order, then the MLP head."""
        # (convolution, normalisation, whether dropout follows the stage)
        stages = (
            (self.conv1, self.bn1, True),
            (self.conv2, self.bn2, True),
            (self.conv3, self.bn3, False),
        )
        for conv, norm, drop_after in stages:
            x = F.relu(norm(conv(x, edge_index)))
            if drop_after:
                x = F.dropout(x, p=self.dropout, training=self.training)
        return self.mlp(x)

# Model, optimiser and loss. dropout=0.4 is passed explicitly and overrides the
# class default of 0.3. This run differs from the previous one only in epochs (700).
# NOTE(review): no random seed is set, so results differ between runs.
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: the whole graph is passed through the model each epoch,
# while the loss only uses the train-masked nodes.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    # Predict for all nodes and map back to the original target scale.
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    # Same node order as the graph: train targets first, then test targets.
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 (these come from the start of the train split).
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE, computed on the original (unscaled) targets.
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0327
Epoch 20, Loss: 0.2144
Epoch 40, Loss: 0.1694
Epoch 60, Loss: 0.1452
Epoch 80, Loss: 0.1297
Epoch 100, Loss: 0.1210
Epoch 120, Loss: 0.1137
Epoch 140, Loss: 0.1097
Epoch 160, Loss: 0.1048
Epoch 180, Loss: 0.1028
Epoch 200, Loss: 0.1003
Epoch 220, Loss: 0.0978
Epoch 240, Loss: 0.0930
Epoch 260, Loss: 0.0925
Epoch 280, Loss: 0.0896
Epoch 300, Loss: 0.0859
Epoch 320, Loss: 0.0857
Epoch 340, Loss: 0.0826
Epoch 360, Loss: 0.0805
Epoch 380, Loss: 0.0799
Epoch 400, Loss: 0.0797
Epoch 420, Loss: 0.0758
Epoch 440, Loss: 0.0756
Epoch 460, Loss: 0.0747
Epoch 480, Loss: 0.0753
Epoch 500, Loss: 0.0723
Epoch 520, Loss: 0.0721
Epoch 540, Loss: 0.0705
Epoch 560, Loss: 0.0703
Epoch 580, Loss: 0.0681
Epoch 600, Loss: 0.0676
Epoch 620, Loss: 0.0666
Epoch 640, Loss: 0.0656
Epoch 660, Loss: 0.0654
Epoch 680, Loss: 0.0661
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [16983026. 10677627. 14862286. 310843

# папам

✅ 1	Построение графа на объединённых данных
✅ 2	Тренировка DeepGNN
✅ 3	Извлечение эмбеддингов
✅ 4	CatBoost на исходных признаках
✅ 5	CatBoost на признаки + GNN-эмбеддинги

In [None]:
# Install CatBoost for the gradient-boosting comparison below.
# NOTE(review): the install is unpinned; the recorded run used catboost 1.2.8.
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import kneighbors_graph
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from catboost import CatBoostRegressor

# === 1. Data loading and preparation ===

df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Fill missing values with the per-column mean of each split.
# The filled column is assigned back: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas warns about and which no longer modifies
# the parent frame in pandas 3.0.
for col in df_train.columns:
    df_train[col] = df_train[col].fillna(df_train[col].mean())
    df_test[col] = df_test[col].fillna(df_test[col].mean())

# Standardise the target; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Concatenate train + test (train rows first).
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Graph construction and PyG Data ===

# kNN graph with 10 neighbours and no self-loops.
# NOTE(review): kneighbors_graph output is not symmetric in general, so the
# resulting edges are directed - confirm this is intended.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN модель ===

class DeepGNN(torch.nn.Module):
    """GraphSAGE regressor that can also expose its last hidden representation.

    Three SAGEConv -> BatchNorm -> ReLU stages feed a two-layer MLP head.
    ``forward`` returns the head output; ``get_embeddings`` returns the
    activations that enter the head.

    Args:
        in_channels: size of the input node features.
        hidden_channels: width of every graph stage (and of the embeddings).
        out_channels: size of the final output per node.
        dropout: dropout probability (default 0.4).
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.4):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)
        half = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, half),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(half, out_channels),
        )

    def _encode(self, x, edge_index, use_dropout):
        """Run the three graph stages; optionally apply dropout after stages 1 and 2."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        final_stage = len(stages) - 1
        for position, (conv, norm) in enumerate(stages):
            x = F.relu(norm(conv(x, edge_index)))
            # The final stage is never followed by functional dropout.
            if use_dropout and position < final_stage:
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x

    def forward(self, x, edge_index):
        """Return the MLP head output for every node."""
        return self.mlp(self._encode(x, edge_index, use_dropout=True))

    def get_embeddings(self, x, edge_index):
        """Return the post-ReLU output of the third stage, without any dropout."""
        return self._encode(x, edge_index, use_dropout=False)

# Model, optimiser and loss. No dropout argument is passed, so the class
# default (0.4) is used.
# NOTE(review): no random seed is set, so results differ between runs.
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 4. GNN training ===

# Full-batch training: the whole graph is passed through the model each epoch,
# while the loss only uses the train-masked nodes.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Embedding extraction ===

# Compute embeddings for all nodes in eval mode, then split them by the masks.
model.eval()
with torch.no_grad():
    embeddings = model.get_embeddings(data.x, data.edge_index).cpu().numpy()
emb_train = embeddings[data.train_mask]
emb_test = embeddings[data.test_mask]




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(df_test[col].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0554
Epoch 50, Loss: 0.1504
Epoch 100, Loss: 0.1204
Epoch 150, Loss: 0.1030
Epoch 200, Loss: 0.0958
Epoch 250, Loss: 0.0914
Epoch 300, Loss: 0.0861
Epoch 350, Loss: 0.0826
Epoch 400, Loss: 0.0779
Epoch 450, Loss: 0.0785

=== CatBoost RMSE по различным признаковым пространствам ===


TypeError: got an unexpected keyword argument 'squared'

In [None]:
def train_and_evaluate_catboost(X_train, X_test, name):
    """Fit a CatBoost regressor on the scaled targets and print its test RMSE.

    Reads the notebook-level ``y_train_scaled``, ``y_test`` and
    ``target_scaler``.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.
        name: label printed in front of the RMSE value.

    Returns:
        Test predictions mapped back to the original target scale, flattened.
    """
    regressor = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='RMSE',
        verbose=0,
    )
    regressor.fit(X_train, y_train_scaled.ravel())

    # Predictions are in scaled-target space; undo the scaling before scoring.
    scaled_column = regressor.predict(X_test).reshape(-1, 1)
    preds = target_scaler.inverse_transform(scaled_column).ravel()

    rmse = mean_squared_error(y_test, preds) ** 0.5
    print(f'{name} RMSE: {rmse:.2f}')
    return preds

# Compare CatBoost on three feature spaces: raw features, GNN embeddings only,
# and raw features concatenated column-wise with the embeddings.
# NOTE: depends on df_train/df_test, emb_train/emb_test, y_test and the target
# scaler left in the kernel by the previous cell.
print('\n=== CatBoost RMSE по различным признаковым пространствам ===')
pred1 = train_and_evaluate_catboost(df_train.values, df_test.values, '1. Исходные признаки')
pred2 = train_and_evaluate_catboost(emb_train, emb_test, '2. Только GNN-эмбеддинги')
pred3 = train_and_evaluate_catboost(
    np.hstack([df_train.values, emb_train]),
    np.hstack([df_test.values, emb_test]),
    '3. Признаки + GNN-эмбеддинги'
)

# === 7. Prediction examples ===

# First 10 test targets next to the predictions of variant 3.
print('\n🔍 Примеры предсказаний для CatBoost (вариант 3):')
print('True: ', y_test.values[:10].ravel())
print('Pred: ', pred3[:10])


=== CatBoost RMSE по различным признаковым пространствам ===
1. Исходные признаки RMSE: 5319891.68
2. Только GNN-эмбеддинги RMSE: 5225126.92
3. Признаки + GNN-эмбеддинги RMSE: 5201237.01

🔍 Примеры предсказаний для CatBoost (вариант 3):
True:  [29000000. 60000000. 82000000. 18500000. 14000000. 21900000. 90000000.
 21500000. 17950000. 71000000.]
Pred:  [27749979.73009477 51173209.27768907 77033958.63143659 18230162.35398448
 12455446.44858989 22174369.30963769 95585119.95531502 19483789.04615035
 19714213.25770666 79151156.27625135]


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.neighbors import kneighbors_graph
from catboost import CatBoostRegressor

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Build the KNN graph (10 neighbours) ===
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN-модель ===
class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a small MLP head.

    Each SAGEConv is followed by BatchNorm and ReLU; dropout is applied after
    the first two graph layers only. The head maps ``hidden_channels`` ->
    ``hidden_channels // 2`` -> ``out_channels``.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every graph layer.
        out_channels: Width of the head output.
        dropout: Drop probability used between graph layers and inside the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Sub-module attribute names are kept as-is: they prefix the
        # parameter names in the model's state_dict.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the graph layers and, unless embeddings are requested, the head.

        Args:
            x: Node feature tensor.
            edge_index: Graph connectivity passed to every SAGEConv.
            return_embeddings: When true, return the activations of the third
                graph layer instead of the head output.
        """
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # No dropout after the last graph layer.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))

        if return_embeddings:
            return h  # embeddings taken before the MLP head
        return self.mlp(h)

# === 4. Train the GNN ===

# Seed torch before the model is built so that weight initialisation and
# dropout masks repeat from run to run; 42 matches the CatBoost seed below.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every step runs the whole graph through the model, but
# only the training nodes contribute to the loss.
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    # squeeze() drops the size-1 output dimension so predictions and targets
    # have matching 1-D shapes inside MSELoss.
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Extract embeddings ===

# eval() turns dropout off (and puts BatchNorm into inference mode).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeddings=True).cpu().numpy()
    # Train rows were stacked first, so they occupy the first len(df_train) rows.
    emb_train = embeddings[:len(df_train)]
    emb_test = embeddings[len(df_train):]

# === 6. Train CatBoost on three feature sets ===

# NOTE(review): the `_std` suffix notwithstanding, nothing in this cell
# standardises these features; they are the frame values as they are.
X_train_std = df_train.values
X_test_std = df_test.values

X_train_concat = np.hstack([X_train_std, emb_train])
X_test_concat = np.hstack([X_test_std, emb_test])

# CatBoost options shared by all three runs
params = dict(verbose=0, iterations=300, random_seed=42)

def run_catboost(X_tr, X_te, name=''):
    """Train CatBoost on one feature set and return its test RMSE.

    Reads the module-level ``params`` (CatBoost options) and the unscaled
    targets ``y_train`` / ``y_test``.

    Args:
        X_tr: Training feature matrix.
        X_te: Test feature matrix.
        name: Label inserted into the printed report line.

    Returns:
        The RMSE on the test set.
    """
    regressor = CatBoostRegressor(**params)
    regressor.fit(X_tr, y_train)

    # RMSE is taken as the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(X_te))
    rmse = test_mse ** 0.5
    print(f'CatBoost {name} RMSE: {rmse:.2f}')
    return rmse

# Same CatBoost settings on three feature sets: [A] tabular features only,
# [B] features plus GNN embeddings, [C] GNN embeddings only.
rmse_std = run_catboost(X_train_std, X_test_std, '[A] только признаки')
rmse_concat = run_catboost(X_train_concat, X_test_concat, '[B] признаки + эмбеддинги')
rmse_emb = run_catboost(emb_train, emb_test, '[C] только эмбеддинги')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 0.9894
Epoch 20, Loss: 0.2015
Epoch 40, Loss: 0.1593
Epoch 60, Loss: 0.1399
Epoch 80, Loss: 0.1264
Epoch 100, Loss: 0.1173
Epoch 120, Loss: 0.1088
Epoch 140, Loss: 0.1039
Epoch 160, Loss: 0.1009
Epoch 180, Loss: 0.0989
Epoch 200, Loss: 0.0926
Epoch 220, Loss: 0.0903
Epoch 240, Loss: 0.0906
Epoch 260, Loss: 0.0876
Epoch 280, Loss: 0.0861
Epoch 300, Loss: 0.0816
Epoch 320, Loss: 0.0802
Epoch 340, Loss: 0.0817
Epoch 360, Loss: 0.0795
Epoch 380, Loss: 0.0780
CatBoost [A] только признаки RMSE: 5188808.43
CatBoost [B] признаки + эмбеддинги RMSE: 5194884.08
CatBoost [C] только эмбеддинги RMSE: 5309247.49


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.neighbors import kneighbors_graph
from catboost import CatBoostRegressor

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Build the KNN graph (10 neighbours) ===
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN-модель ===
class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a small MLP head.

    Each SAGEConv is followed by BatchNorm and ReLU; dropout is applied after
    the first two graph layers only. The head maps ``hidden_channels`` ->
    ``hidden_channels // 2`` -> ``out_channels``.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every graph layer.
        out_channels: Width of the head output.
        dropout: Drop probability used between graph layers and inside the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Sub-module attribute names are kept as-is: they prefix the
        # parameter names in the model's state_dict.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the graph layers and, unless embeddings are requested, the head.

        Args:
            x: Node feature tensor.
            edge_index: Graph connectivity passed to every SAGEConv.
            return_embeddings: When true, return the activations of the third
                graph layer instead of the head output.
        """
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # No dropout after the last graph layer.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))

        if return_embeddings:
            return h  # embeddings taken before the MLP head
        return self.mlp(h)

# === 4. Train the GNN ===

# Seed torch before the model is built so that weight initialisation and
# dropout masks repeat from run to run; 42 matches the CatBoost seed below.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every step runs the whole graph through the model, but
# only the training nodes contribute to the loss.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    # squeeze() drops the size-1 output dimension so predictions and targets
    # have matching 1-D shapes inside MSELoss.
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Extract embeddings ===

# eval() turns dropout off (and puts BatchNorm into inference mode).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeddings=True).cpu().numpy()
    # Train rows were stacked first, so they occupy the first len(df_train) rows.
    emb_train = embeddings[:len(df_train)]
    emb_test = embeddings[len(df_train):]

# === 6. Train CatBoost on three feature sets ===

# NOTE(review): the `_std` suffix notwithstanding, nothing in this cell
# standardises these features; they are the frame values as they are.
X_train_std = df_train.values
X_test_std = df_test.values

X_train_concat = np.hstack([X_train_std, emb_train])
X_test_concat = np.hstack([X_test_std, emb_test])

# CatBoost options shared by all three runs
params = dict(verbose=0, iterations=300, random_seed=42)

def run_catboost(X_tr, X_te, name=''):
    """Train CatBoost on one feature set and return its test RMSE.

    Reads the module-level ``params`` (CatBoost options) and the unscaled
    targets ``y_train`` / ``y_test``.

    Args:
        X_tr: Training feature matrix.
        X_te: Test feature matrix.
        name: Label inserted into the printed report line.

    Returns:
        The RMSE on the test set.
    """
    regressor = CatBoostRegressor(**params)
    regressor.fit(X_tr, y_train)

    # RMSE is taken as the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(X_te))
    rmse = test_mse ** 0.5
    print(f'CatBoost {name} RMSE: {rmse:.2f}')
    return rmse

# Same CatBoost settings on three feature sets: [A] tabular features only,
# [B] features plus GNN embeddings, [C] GNN embeddings only.
rmse_std = run_catboost(X_train_std, X_test_std, '[A] только признаки')
rmse_concat = run_catboost(X_train_concat, X_test_concat, '[B] признаки + эмбеддинги')
rmse_emb = run_catboost(emb_train, emb_test, '[C] только эмбеддинги')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0057
Epoch 20, Loss: 0.5634
Epoch 40, Loss: 0.3233
Epoch 60, Loss: 0.2522
Epoch 80, Loss: 0.2278
Epoch 100, Loss: 0.2200
Epoch 120, Loss: 0.2050
Epoch 140, Loss: 0.1925
Epoch 160, Loss: 0.1824
Epoch 180, Loss: 0.1794
Epoch 200, Loss: 0.1755
Epoch 220, Loss: 0.1690
Epoch 240, Loss: 0.1627
Epoch 260, Loss: 0.1548
Epoch 280, Loss: 0.1554
Epoch 300, Loss: 0.1499
Epoch 320, Loss: 0.1458
Epoch 340, Loss: 0.1410
Epoch 360, Loss: 0.1394
Epoch 380, Loss: 0.1350
Epoch 400, Loss: 0.1313
Epoch 420, Loss: 0.1318
Epoch 440, Loss: 0.1283
Epoch 460, Loss: 0.1288
Epoch 480, Loss: 0.1263
Epoch 500, Loss: 0.1218
Epoch 520, Loss: 0.1213
Epoch 540, Loss: 0.1221
Epoch 560, Loss: 0.1178
Epoch 580, Loss: 0.1196
Epoch 600, Loss: 0.1167
Epoch 620, Loss: 0.1125
Epoch 640, Loss: 0.1115
Epoch 660, Loss: 0.1099
Epoch 680, Loss: 0.1108
CatBoost [A] только признаки RMSE: 5188808.43
CatBoost [B] признаки + эмбеддинги RMSE: 5147010.55
CatBoost [C] только эмбеддинги RMSE: 5459178.06


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.neighbors import kneighbors_graph
from catboost import CatBoostRegressor

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Build the KNN graph (10 neighbours) ===
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN-модель ===
class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a small MLP head.

    Each SAGEConv is followed by BatchNorm and ReLU; dropout is applied after
    the first two graph layers only. The head maps ``hidden_channels`` ->
    ``hidden_channels // 2`` -> ``out_channels``.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every graph layer.
        out_channels: Width of the head output.
        dropout: Drop probability used between graph layers and inside the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Sub-module attribute names are kept as-is: they prefix the
        # parameter names in the model's state_dict.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the graph layers and, unless embeddings are requested, the head.

        Args:
            x: Node feature tensor.
            edge_index: Graph connectivity passed to every SAGEConv.
            return_embeddings: When true, return the activations of the third
                graph layer instead of the head output.
        """
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # No dropout after the last graph layer.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))

        if return_embeddings:
            return h  # embeddings taken before the MLP head
        return self.mlp(h)

# === 4. Train the GNN ===

# Seed torch before the model is built so that weight initialisation and
# dropout masks repeat from run to run; 42 matches the CatBoost seed below.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every step runs the whole graph through the model, but
# only the training nodes contribute to the loss.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    # squeeze() drops the size-1 output dimension so predictions and targets
    # have matching 1-D shapes inside MSELoss.
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Extract embeddings ===

# eval() turns dropout off (and puts BatchNorm into inference mode).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeddings=True).cpu().numpy()
    # Train rows were stacked first, so they occupy the first len(df_train) rows.
    emb_train = embeddings[:len(df_train)]
    emb_test = embeddings[len(df_train):]

# === 6. Train CatBoost on three feature sets ===

# NOTE(review): the `_std` suffix notwithstanding, nothing in this cell
# standardises these features; they are the frame values as they are.
X_train_std = df_train.values
X_test_std = df_test.values

X_train_concat = np.hstack([X_train_std, emb_train])
X_test_concat = np.hstack([X_test_std, emb_test])

# CatBoost options shared by all three runs
params = dict(verbose=0, iterations=300, random_seed=42)

def run_catboost(X_tr, X_te, name=''):
    """Train CatBoost on one feature set and return its test RMSE.

    Reads the module-level ``params`` (CatBoost options) and the unscaled
    targets ``y_train`` / ``y_test``.

    Args:
        X_tr: Training feature matrix.
        X_te: Test feature matrix.
        name: Label inserted into the printed report line.

    Returns:
        The RMSE on the test set.
    """
    regressor = CatBoostRegressor(**params)
    regressor.fit(X_tr, y_train)

    # RMSE is taken as the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(X_te))
    rmse = test_mse ** 0.5
    print(f'CatBoost {name} RMSE: {rmse:.2f}')
    return rmse

# Same CatBoost settings on three feature sets: [A] tabular features only,
# [B] features plus GNN embeddings, [C] GNN embeddings only.
rmse_std = run_catboost(X_train_std, X_test_std, '[A] только признаки')
rmse_concat = run_catboost(X_train_concat, X_test_concat, '[B] признаки + эмбеддинги')
rmse_emb = run_catboost(emb_train, emb_test, '[C] только эмбеддинги')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 0.9787
Epoch 20, Loss: 0.5554
Epoch 40, Loss: 0.2886
Epoch 60, Loss: 0.2349
Epoch 80, Loss: 0.2116
Epoch 100, Loss: 0.1965
Epoch 120, Loss: 0.1813
Epoch 140, Loss: 0.1712
Epoch 160, Loss: 0.1702
Epoch 180, Loss: 0.1613
Epoch 200, Loss: 0.1543
Epoch 220, Loss: 0.1506
Epoch 240, Loss: 0.1449
Epoch 260, Loss: 0.1408
Epoch 280, Loss: 0.1384
Epoch 300, Loss: 0.1318
Epoch 320, Loss: 0.1300
Epoch 340, Loss: 0.1275
Epoch 360, Loss: 0.1227
Epoch 380, Loss: 0.1226
Epoch 400, Loss: 0.1203
Epoch 420, Loss: 0.1174
Epoch 440, Loss: 0.1156
Epoch 460, Loss: 0.1136
Epoch 480, Loss: 0.1133
Epoch 500, Loss: 0.1078
Epoch 520, Loss: 0.1070
Epoch 540, Loss: 0.1050
Epoch 560, Loss: 0.1064
Epoch 580, Loss: 0.1037
Epoch 600, Loss: 0.1009
Epoch 620, Loss: 0.1012
Epoch 640, Loss: 0.1002
Epoch 660, Loss: 0.0996
Epoch 680, Loss: 0.0973
CatBoost [A] только признаки RMSE: 5188808.43
CatBoost [B] признаки + эмбеддинги RMSE: 5134052.80
CatBoost [C] только эмбеддинги RMSE: 5428254.37


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.neighbors import kneighbors_graph
from catboost import CatBoostRegressor

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Build the KNN graph (10 neighbours) ===
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN-модель ===
class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a small MLP head.

    Each SAGEConv is followed by BatchNorm and ReLU; dropout is applied after
    the first two graph layers only. The head maps ``hidden_channels`` ->
    ``hidden_channels // 2`` -> ``out_channels``.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every graph layer.
        out_channels: Width of the head output.
        dropout: Drop probability used between graph layers and inside the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Sub-module attribute names are kept as-is: they prefix the
        # parameter names in the model's state_dict.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the graph layers and, unless embeddings are requested, the head.

        Args:
            x: Node feature tensor.
            edge_index: Graph connectivity passed to every SAGEConv.
            return_embeddings: When true, return the activations of the third
                graph layer instead of the head output.
        """
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # No dropout after the last graph layer.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))

        if return_embeddings:
            return h  # embeddings taken before the MLP head
        return self.mlp(h)

# === 4. Train the GNN ===

# Seed torch before the model is built so that weight initialisation and
# dropout masks repeat from run to run; 42 matches the CatBoost seed below.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=256, out_channels=1, dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every step runs the whole graph through the model, but
# only the training nodes contribute to the loss.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    # squeeze() drops the size-1 output dimension so predictions and targets
    # have matching 1-D shapes inside MSELoss.
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Extract embeddings ===

# eval() turns dropout off (and puts BatchNorm into inference mode).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeddings=True).cpu().numpy()
    # Train rows were stacked first, so they occupy the first len(df_train) rows.
    emb_train = embeddings[:len(df_train)]
    emb_test = embeddings[len(df_train):]

# === 6. Train CatBoost on three feature sets ===

# NOTE(review): the `_std` suffix notwithstanding, nothing in this cell
# standardises these features; they are the frame values as they are.
X_train_std = df_train.values
X_test_std = df_test.values

X_train_concat = np.hstack([X_train_std, emb_train])
X_test_concat = np.hstack([X_test_std, emb_test])

# CatBoost options shared by all three runs
params = dict(verbose=0, iterations=300, random_seed=42)

def run_catboost(X_tr, X_te, name=''):
    """Train CatBoost on one feature set and return its test RMSE.

    Reads the module-level ``params`` (CatBoost options) and the unscaled
    targets ``y_train`` / ``y_test``.

    Args:
        X_tr: Training feature matrix.
        X_te: Test feature matrix.
        name: Label inserted into the printed report line.

    Returns:
        The RMSE on the test set.
    """
    regressor = CatBoostRegressor(**params)
    regressor.fit(X_tr, y_train)

    # RMSE is taken as the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(X_te))
    rmse = test_mse ** 0.5
    print(f'CatBoost {name} RMSE: {rmse:.2f}')
    return rmse

# Same CatBoost settings on three feature sets: [A] tabular features only,
# [B] features plus GNN embeddings, [C] GNN embeddings only.
rmse_std = run_catboost(X_train_std, X_test_std, '[A] только признаки')
rmse_concat = run_catboost(X_train_concat, X_test_concat, '[B] признаки + эмбеддинги')
rmse_emb = run_catboost(emb_train, emb_test, '[C] только эмбеддинги')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0975
Epoch 20, Loss: 0.3672
Epoch 40, Loss: 0.2288
Epoch 60, Loss: 0.1918
Epoch 80, Loss: 0.1697
Epoch 100, Loss: 0.1595
Epoch 120, Loss: 0.1509
Epoch 140, Loss: 0.1431
Epoch 160, Loss: 0.1354
Epoch 180, Loss: 0.1305
Epoch 200, Loss: 0.1228
Epoch 220, Loss: 0.1214
Epoch 240, Loss: 0.1155
Epoch 260, Loss: 0.1138
Epoch 280, Loss: 0.1098
Epoch 300, Loss: 0.1069
Epoch 320, Loss: 0.1044
Epoch 340, Loss: 0.1016
Epoch 360, Loss: 0.0981
Epoch 380, Loss: 0.0972
Epoch 400, Loss: 0.0955
Epoch 420, Loss: 0.0927
Epoch 440, Loss: 0.0916
Epoch 460, Loss: 0.0908
Epoch 480, Loss: 0.0881
Epoch 500, Loss: 0.0856
Epoch 520, Loss: 0.0845
Epoch 540, Loss: 0.0851
Epoch 560, Loss: 0.0837
Epoch 580, Loss: 0.0800
Epoch 600, Loss: 0.0811
Epoch 620, Loss: 0.0787
Epoch 640, Loss: 0.0789
Epoch 660, Loss: 0.0790
Epoch 680, Loss: 0.0769
CatBoost [A] только признаки RMSE: 5188808.43
CatBoost [B] признаки + эмбеддинги RMSE: 5170425.79
CatBoost [C] только эмбеддинги RMSE: 5333307.35


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.neighbors import kneighbors_graph
from catboost import CatBoostRegressor

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Build the KNN graph (10 neighbours) ===
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 3. GNN-модель ===
class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a small MLP head.

    Each SAGEConv is followed by BatchNorm and ReLU; dropout is applied after
    the first two graph layers only. The head maps ``hidden_channels`` ->
    ``hidden_channels // 2`` -> ``out_channels``.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every graph layer.
        out_channels: Width of the head output.
        dropout: Drop probability used between graph layers and inside the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Sub-module attribute names are kept as-is: they prefix the
        # parameter names in the model's state_dict.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the graph layers and, unless embeddings are requested, the head.

        Args:
            x: Node feature tensor.
            edge_index: Graph connectivity passed to every SAGEConv.
            return_embeddings: When true, return the activations of the third
                graph layer instead of the head output.
        """
        h = F.relu(self.bn1(self.conv1(x, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = F.relu(self.bn2(self.conv2(h, edge_index)))
        h = F.dropout(h, p=self.dropout, training=self.training)

        # No dropout after the last graph layer.
        h = F.relu(self.bn3(self.conv3(h, edge_index)))

        if return_embeddings:
            return h  # embeddings taken before the MLP head
        return self.mlp(h)

# === 4. Train the GNN ===

# Seed torch before the model is built so that weight initialisation and
# dropout masks repeat from run to run; 42 matches the CatBoost seed below.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=256, out_channels=1, dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every step runs the whole graph through the model, but
# only the training nodes contribute to the loss.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    # squeeze() drops the size-1 output dimension so predictions and targets
    # have matching 1-D shapes inside MSELoss.
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Extract embeddings ===

# eval() turns dropout off (and puts BatchNorm into inference mode).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeddings=True).cpu().numpy()
    # Train rows were stacked first, so they occupy the first len(df_train) rows.
    emb_train = embeddings[:len(df_train)]
    emb_test = embeddings[len(df_train):]

# === 6. Train CatBoost on three feature sets ===

# NOTE(review): the `_std` suffix notwithstanding, nothing in this cell
# standardises these features; they are the frame values as they are.
X_train_std = df_train.values
X_test_std = df_test.values

X_train_concat = np.hstack([X_train_std, emb_train])
X_test_concat = np.hstack([X_test_std, emb_test])

# CatBoost options shared by all three runs
params = dict(verbose=0, iterations=1000, random_seed=42)

def run_catboost(X_tr, X_te, name=''):
    """Train CatBoost on one feature set and return its test RMSE.

    Reads the module-level ``params`` (CatBoost options) and the unscaled
    targets ``y_train`` / ``y_test``.

    Args:
        X_tr: Training feature matrix.
        X_te: Test feature matrix.
        name: Label inserted into the printed report line.

    Returns:
        The RMSE on the test set.
    """
    regressor = CatBoostRegressor(**params)
    regressor.fit(X_tr, y_train)

    # RMSE is taken as the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(X_te))
    rmse = test_mse ** 0.5
    print(f'CatBoost {name} RMSE: {rmse:.2f}')
    return rmse

# Same CatBoost settings on three feature sets: [A] tabular features only,
# [B] features plus GNN embeddings, [C] GNN embeddings only.
rmse_std = run_catboost(X_train_std, X_test_std, '[A] только признаки')
rmse_concat = run_catboost(X_train_concat, X_test_concat, '[B] признаки + эмбеддинги')
rmse_emb = run_catboost(emb_train, emb_test, '[C] только эмбеддинги')

## 700 EPOCHS LR=0.0001 DROPOUT=0.4

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import SAGEConv, BatchNorm

# === 1. Load and prepare the data ===

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with the column mean of the frame being filled.
# `df[col].fillna(..., inplace=True)` is chained assignment: pandas warns about
# it and it stops modifying the frame under copy-on-write (pandas 3.0), so the
# filled column is assigned back explicitly instead.
for column in df_train.columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so both live in one shared graph.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only
X_all = df_all.values.astype(np.float32)

# Build the KNN graph (10 nearest neighbours).
# NOTE(review): neighbours are computed on the feature values as they are;
# nothing in this cell scales the features before this call.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
# A.nonzero() gives (row, col) index arrays; stacked they form a 2 x E
# edge_index with one directed edge per (node, neighbour) pair.
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# PyG Data object holding features, edges and (scaled) targets for all nodes
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a two-layer MLP head.

    Every graph layer is ``SAGEConv -> BatchNorm -> ReLU``. Dropout is applied
    after the first two graph layers and once more inside the MLP head.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of each graph layer; the head narrows it to
            ``hidden_channels // 2`` before the output projection.
        out_channels: Width of the final linear layer's output.
        dropout: Drop probability used between graph layers and in the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Registered as conv1/bn1 ... conv3/bn3, in that order, so attribute
        # names and construction order are the same as when written out by hand.
        layer_inputs = (in_channels, hidden_channels, hidden_channels)
        for depth, width_in in enumerate(layer_inputs, start=1):
            setattr(self, f'conv{depth}', SAGEConv(width_in, hidden_channels))
            setattr(self, f'bn{depth}', BatchNorm(hidden_channels))

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index):
        """Run ``x`` through the three graph layers and the MLP head."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        final = len(stages) - 1
        hidden = x
        for position, (conv, norm) in enumerate(stages):
            hidden = F.relu(norm(conv(hidden, edge_index)))
            # No dropout after the last graph layer; the head has its own.
            if position != final:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.mlp(hidden)

# Seed PyTorch's RNG so weight initialisation and dropout masks are repeatable
# across "Restart & Run All"; the run was previously unseeded.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=128, out_channels=1, dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# === 3. Train the model ===

# Full-batch training: each epoch runs the whole graph through the model,
# but the loss is computed on the train nodes only.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    # Undo the target standardisation so predictions are in the original units.
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 ground-truth values next to their predictions.
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE: the test nodes are the rows after the first len(df_train).
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0099
Epoch 20, Loss: 0.6135
Epoch 40, Loss: 0.3343
Epoch 60, Loss: 0.2696
Epoch 80, Loss: 0.2303
Epoch 100, Loss: 0.2159
Epoch 120, Loss: 0.2025
Epoch 140, Loss: 0.1917
Epoch 160, Loss: 0.1768
Epoch 180, Loss: 0.1785
Epoch 200, Loss: 0.1708
Epoch 220, Loss: 0.1685
Epoch 240, Loss: 0.1597
Epoch 260, Loss: 0.1548
Epoch 280, Loss: 0.1498
Epoch 300, Loss: 0.1474
Epoch 320, Loss: 0.1411
Epoch 340, Loss: 0.1402
Epoch 360, Loss: 0.1371
Epoch 380, Loss: 0.1324
Epoch 400, Loss: 0.1333
Epoch 420, Loss: 0.1307
Epoch 440, Loss: 0.1285
Epoch 460, Loss: 0.1263
Epoch 480, Loss: 0.1241
Epoch 500, Loss: 0.1233
Epoch 520, Loss: 0.1208
Epoch 540, Loss: 0.1173
Epoch 560, Loss: 0.1216
Epoch 580, Loss: 0.1156
Epoch 600, Loss: 0.1166
Epoch 620, Loss: 0.1145
Epoch 640, Loss: 0.1140
Epoch 660, Loss: 0.1113
Epoch 680, Loss: 0.1103
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [15185498. 10829550. 14711090. 290822

## 700 EPOCHS LR=0.0001 DROPOUT=0.3 HIDDEN=256

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import SAGEConv, BatchNorm

# === 1. Load and prepare the data ===

# Load the feature frames and targets downloaded at the top of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with each frame's own column means. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: pandas warns
# that it stops working in 3.0 (the NaNs would silently remain), so the filled
# frame is assigned back instead.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

# Standardise the targets. The scaler is fitted on train only and reused for
# test and for mapping predictions back to the original units.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so that one graph covers both.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only; the targets live separately in y_all.
X_all = df_all.values.astype(np.float32)

# Build a k-NN graph (10 nearest neighbours, no self loops) on the features.
# NOTE(review): nonzero() yields (row, col) pairs, i.e. edges run from each
# node to its neighbours; confirm this is the intended message-flow direction.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean masks recovering the split: the first len(df_train) nodes are train,
# the remaining ones are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class DeepGNN(torch.nn.Module):
    """Three SAGEConv layers followed by a two-layer MLP head.

    Every graph layer is ``SAGEConv -> BatchNorm -> ReLU``. Dropout is applied
    after the first two graph layers and once more inside the MLP head.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of each graph layer; the head narrows it to
            ``hidden_channels // 2`` before the output projection.
        out_channels: Width of the final linear layer's output.
        dropout: Drop probability used between graph layers and in the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Registered as conv1/bn1 ... conv3/bn3, in that order, so attribute
        # names and construction order are the same as when written out by hand.
        layer_inputs = (in_channels, hidden_channels, hidden_channels)
        for depth, width_in in enumerate(layer_inputs, start=1):
            setattr(self, f'conv{depth}', SAGEConv(width_in, hidden_channels))
            setattr(self, f'bn{depth}', BatchNorm(hidden_channels))

        head_width = hidden_channels // 2
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, head_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_width, out_channels),
        )

    def forward(self, x, edge_index):
        """Run ``x`` through the three graph layers and the MLP head."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        final = len(stages) - 1
        hidden = x
        for position, (conv, norm) in enumerate(stages):
            hidden = F.relu(norm(conv(hidden, edge_index)))
            # No dropout after the last graph layer; the head has its own.
            if position != final:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.mlp(hidden)

# Seed PyTorch's RNG so weight initialisation and dropout masks are repeatable
# across "Restart & Run All"; the run was previously unseeded.
torch.manual_seed(42)

model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=256, out_channels=1, dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# === 3. Train the model ===

# Full-batch training: each epoch runs the whole graph through the model,
# but the loss is computed on the train nodes only.
model.train()
for epoch in range(700):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    # Undo the target standardisation so predictions are in the original units.
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 ground-truth values next to their predictions.
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE: the test nodes are the rows after the first len(df_train).
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 0.9841
Epoch 20, Loss: 0.3106
Epoch 40, Loss: 0.2196
Epoch 60, Loss: 0.1810
Epoch 80, Loss: 0.1648
Epoch 100, Loss: 0.1558
Epoch 120, Loss: 0.1460
Epoch 140, Loss: 0.1377
Epoch 160, Loss: 0.1312
Epoch 180, Loss: 0.1264
Epoch 200, Loss: 0.1185
Epoch 220, Loss: 0.1159
Epoch 240, Loss: 0.1114
Epoch 260, Loss: 0.1085
Epoch 280, Loss: 0.1050
Epoch 300, Loss: 0.1036
Epoch 320, Loss: 0.1009
Epoch 340, Loss: 0.1008
Epoch 360, Loss: 0.0958
Epoch 380, Loss: 0.0935
Epoch 400, Loss: 0.0932
Epoch 420, Loss: 0.0906
Epoch 440, Loss: 0.0898
Epoch 460, Loss: 0.0861
Epoch 480, Loss: 0.0860
Epoch 500, Loss: 0.0836
Epoch 520, Loss: 0.0817
Epoch 540, Loss: 0.0812
Epoch 560, Loss: 0.0804
Epoch 580, Loss: 0.0796
Epoch 600, Loss: 0.0783
Epoch 620, Loss: 0.0785
Epoch 640, Loss: 0.0776
Epoch 660, Loss: 0.0765
Epoch 680, Loss: 0.0756
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [17863600. 10838048. 14815050. 299862

# GAT + JK 500 epochs LR=0.001

In [None]:
Усилим архитектуру с помощью Graph Attention Networks (GAT) и Jumping Knowledge (JK). Это даст модели следующие возможности:

GAT — фокусироваться на важности соседей с помощью механизма внимания (attention).

Jumping Knowledge — агрегировать представления с разных глубин графа (со всех GNN-слоёв), что помогает при затухании градиентов и переусложнении.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import GATConv, BatchNorm, JumpingKnowledge

# === 1. Load and prepare the data ===

# Load the feature frames and targets downloaded at the top of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with each frame's own column means. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: pandas warns
# that it stops working in 3.0 (the NaNs would silently remain), so the filled
# frame is assigned back instead.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

# Standardise the targets. The scaler is fitted on train only and reused for
# test and for mapping predictions back to the original units.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so that one graph covers both.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only; the targets live separately in y_all.
X_all = df_all.values.astype(np.float32)

# Build a k-NN graph (10 nearest neighbours, no self loops) on the features.
# NOTE(review): nonzero() yields (row, col) pairs, i.e. edges run from each
# node to its neighbours; confirm this is the intended message-flow direction.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean masks recovering the split: the first len(df_train) nodes are train,
# the remaining ones are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class GATWithJK(torch.nn.Module):
    """Three GATConv layers whose outputs are concatenated and fed to an MLP.

    Each graph layer is ``GATConv -> BatchNorm -> ELU``; dropout follows the
    first two layers. The three layer outputs are combined by
    ``JumpingKnowledge(mode='cat')`` and passed to a two-layer MLP head.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Per-head width of every attention layer and width of
            the MLP's hidden layer.
        out_channels: Width of the final linear layer's output.
        heads: Number of attention heads in the first two layers (the third
            layer always uses a single head).
        dropout: Drop probability passed to the attention layers and used
            between layers and in the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=4, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Width used for the batch norms after the two multi-head layers.
        multi_head_width = hidden_channels * heads

        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(multi_head_width)

        self.gat2 = GATConv(multi_head_width, hidden_channels, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(multi_head_width)

        self.gat3 = GATConv(multi_head_width, hidden_channels, heads=1, concat=True, dropout=dropout)
        self.bn3 = BatchNorm(hidden_channels)

        # Concatenates the outputs of all three layers.
        self.jk = JumpingKnowledge(mode='cat')

        # Two multi-head outputs plus one single-head output.
        jk_width = 2 * multi_head_width + hidden_channels
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(jk_width, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x, edge_index):
        """Run ``x`` through the attention stack, combine all layers, apply the head."""
        plan = (
            (self.gat1, self.bn1, True),
            (self.gat2, self.bn2, True),
            (self.gat3, self.bn3, False),  # last layer: no dropout
        )
        per_layer = []
        hidden = x
        for attention, norm, use_dropout in plan:
            hidden = F.elu(norm(attention(hidden, edge_index)))
            if use_dropout:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
            per_layer.append(hidden)
        return self.mlp(self.jk(per_layer))
# Seed PyTorch's RNG so weight initialisation and dropout masks are repeatable
# across "Restart & Run All"; the run was previously unseeded.
torch.manual_seed(42)

model = GATWithJK(
    in_channels=X_all.shape[1],  # number of feature columns, including the emb_* ones
    hidden_channels=64,
    out_channels=1,
    heads=4,
    dropout=0.4
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Train the model ===

# Full-batch training: each epoch runs the whole graph through the model,
# but the loss is computed on the train nodes only.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    # Undo the target standardisation so predictions are in the original units.
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 ground-truth values next to their predictions.
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE: the test nodes are the rows after the first len(df_train).
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0740
Epoch 20, Loss: 0.2354
Epoch 40, Loss: 0.1928
Epoch 60, Loss: 0.1738
Epoch 80, Loss: 0.1633
Epoch 100, Loss: 0.1619
Epoch 120, Loss: 0.1564
Epoch 140, Loss: 0.1501
Epoch 160, Loss: 0.1455
Epoch 180, Loss: 0.1446
Epoch 200, Loss: 0.1423
Epoch 220, Loss: 0.1398
Epoch 240, Loss: 0.1369
Epoch 260, Loss: 0.1347
Epoch 280, Loss: 0.1327
Epoch 300, Loss: 0.1311
Epoch 320, Loss: 0.1292
Epoch 340, Loss: 0.1332
Epoch 360, Loss: 0.1281
Epoch 380, Loss: 0.1249
Epoch 400, Loss: 0.1267
Epoch 420, Loss: 0.1229
Epoch 440, Loss: 0.1228
Epoch 460, Loss: 0.1234
Epoch 480, Loss: 0.1219
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [15928250. 10457849. 14608371. 29919222. 29281144. 25930926. 14516802.
 36612548. 10647432. 13875816.]
Test RMSE: 5966135.04


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import GATConv, BatchNorm, JumpingKnowledge

# === 1. Load and prepare the data ===

# Load the feature frames and targets downloaded at the top of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with each frame's own column means. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: pandas warns
# that it stops working in 3.0 (the NaNs would silently remain), so the filled
# frame is assigned back instead.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

# Standardise the targets. The scaler is fitted on train only and reused for
# test and for mapping predictions back to the original units.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so that one graph covers both.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only; the targets live separately in y_all.
X_all = df_all.values.astype(np.float32)

# Build a k-NN graph (5 nearest neighbours, no self loops) on the features.
# NOTE(review): nonzero() yields (row, col) pairs, i.e. edges run from each
# node to its neighbours; confirm this is the intended message-flow direction.
A = kneighbors_graph(X_all, n_neighbors=5, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean masks recovering the split: the first len(df_train) nodes are train,
# the remaining ones are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class GATWithJK(torch.nn.Module):
    """Three GATConv layers whose outputs are concatenated and fed to an MLP.

    Each graph layer is ``GATConv -> BatchNorm -> ELU``; dropout follows the
    first two layers. The three layer outputs are combined by
    ``JumpingKnowledge(mode='cat')`` and passed to a two-layer MLP head.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Per-head width of every attention layer and width of
            the MLP's hidden layer.
        out_channels: Width of the final linear layer's output.
        heads: Number of attention heads in the first two layers (the third
            layer always uses a single head).
        dropout: Drop probability passed to the attention layers and used
            between layers and in the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=3, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Width used for the batch norms after the two multi-head layers.
        multi_head_width = hidden_channels * heads

        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(multi_head_width)

        self.gat2 = GATConv(multi_head_width, hidden_channels, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(multi_head_width)

        self.gat3 = GATConv(multi_head_width, hidden_channels, heads=1, concat=True, dropout=dropout)
        self.bn3 = BatchNorm(hidden_channels)

        # Concatenates the outputs of all three layers.
        self.jk = JumpingKnowledge(mode='cat')

        # Two multi-head outputs plus one single-head output.
        jk_width = 2 * multi_head_width + hidden_channels
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(jk_width, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x, edge_index):
        """Run ``x`` through the attention stack, combine all layers, apply the head."""
        plan = (
            (self.gat1, self.bn1, True),
            (self.gat2, self.bn2, True),
            (self.gat3, self.bn3, False),  # last layer: no dropout
        )
        per_layer = []
        hidden = x
        for attention, norm, use_dropout in plan:
            hidden = F.elu(norm(attention(hidden, edge_index)))
            if use_dropout:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
            per_layer.append(hidden)
        return self.mlp(self.jk(per_layer))
# Seed PyTorch's RNG so weight initialisation and dropout masks are repeatable
# across "Restart & Run All"; the run was previously unseeded.
torch.manual_seed(42)

model = GATWithJK(
    in_channels=X_all.shape[1],  # number of feature columns, including the emb_* ones
    hidden_channels=64,
    out_channels=1,
    heads=4,
    dropout=0.4
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Train the model ===

# Full-batch training: each epoch runs the whole graph through the model,
# but the loss is computed on the train nodes only.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    # Undo the target standardisation so predictions are in the original units.
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 ground-truth values next to their predictions.
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE: the test nodes are the rows after the first len(df_train).
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.2233
Epoch 20, Loss: 0.2535
Epoch 40, Loss: 0.1997
Epoch 60, Loss: 0.1785
Epoch 80, Loss: 0.1688
Epoch 100, Loss: 0.1640
Epoch 120, Loss: 0.1556
Epoch 140, Loss: 0.1533
Epoch 160, Loss: 0.1497
Epoch 180, Loss: 0.1487
Epoch 200, Loss: 0.1477
Epoch 220, Loss: 0.1411
Epoch 240, Loss: 0.1445
Epoch 260, Loss: 0.1397
Epoch 280, Loss: 0.1384
Epoch 300, Loss: 0.1385
Epoch 320, Loss: 0.1345
Epoch 340, Loss: 0.1354
Epoch 360, Loss: 0.1343
Epoch 380, Loss: 0.1300
Epoch 400, Loss: 0.1308
Epoch 420, Loss: 0.1296
Epoch 440, Loss: 0.1268
Epoch 460, Loss: 0.1265
Epoch 480, Loss: 0.1275
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [16060455. 11874409. 14304381. 28776264. 28888258. 26710058. 13920800.
 44423140. 10306080. 13847341.]
Test RMSE: 5884138.44


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import GATConv, BatchNorm, JumpingKnowledge

# === 1. Load and prepare the data ===

# Load the feature frames and targets downloaded at the top of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with each frame's own column means. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: pandas warns
# that it stops working in 3.0 (the NaNs would silently remain), so the filled
# frame is assigned back instead.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

# Standardise the targets. The scaler is fitted on train only and reused for
# test and for mapping predictions back to the original units.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so that one graph covers both.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only; the targets live separately in y_all.
X_all = df_all.values.astype(np.float32)

# Build a k-NN graph (10 nearest neighbours, no self loops) on the features.
# NOTE(review): nonzero() yields (row, col) pairs, i.e. edges run from each
# node to its neighbours; confirm this is the intended message-flow direction.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean masks recovering the split: the first len(df_train) nodes are train,
# the remaining ones are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class GATWithJK(torch.nn.Module):
    """Three GATConv layers whose outputs are concatenated and fed to an MLP.

    Each graph layer is ``GATConv -> BatchNorm -> ELU``; dropout follows the
    first two layers. The three layer outputs are combined by
    ``JumpingKnowledge(mode='cat')`` and passed to a two-layer MLP head.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Per-head width of every attention layer and width of
            the MLP's hidden layer.
        out_channels: Width of the final linear layer's output.
        heads: Number of attention heads in the first two layers (the third
            layer always uses a single head).
        dropout: Drop probability passed to the attention layers and used
            between layers and in the head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=4, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Width used for the batch norms after the two multi-head layers.
        multi_head_width = hidden_channels * heads

        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(multi_head_width)

        self.gat2 = GATConv(multi_head_width, hidden_channels, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(multi_head_width)

        self.gat3 = GATConv(multi_head_width, hidden_channels, heads=1, concat=True, dropout=dropout)
        self.bn3 = BatchNorm(hidden_channels)

        # Concatenates the outputs of all three layers.
        self.jk = JumpingKnowledge(mode='cat')

        # Two multi-head outputs plus one single-head output.
        jk_width = 2 * multi_head_width + hidden_channels
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(jk_width, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x, edge_index):
        """Run ``x`` through the attention stack, combine all layers, apply the head."""
        plan = (
            (self.gat1, self.bn1, True),
            (self.gat2, self.bn2, True),
            (self.gat3, self.bn3, False),  # last layer: no dropout
        )
        per_layer = []
        hidden = x
        for attention, norm, use_dropout in plan:
            hidden = F.elu(norm(attention(hidden, edge_index)))
            if use_dropout:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
            per_layer.append(hidden)
        return self.mlp(self.jk(per_layer))
# Seed PyTorch's RNG so weight initialisation and dropout masks are repeatable
# across "Restart & Run All"; the run was previously unseeded.
torch.manual_seed(42)

model = GATWithJK(
    in_channels=X_all.shape[1],  # number of feature columns, including the emb_* ones
    hidden_channels=64,
    out_channels=1,
    heads=4,
    dropout=0.4
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# === 3. Train the model ===

# Full-batch training: each epoch runs the whole graph through the model,
# but the loss is computed on the train nodes only.
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze().cpu().numpy()
    # Undo the target standardisation so predictions are in the original units.
    preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
    true = y_train.tolist() + y_test.tolist()

    # Show the first 10 ground-truth values next to their predictions.
    print('True values:', np.array(true[:10]))
    print('Predictions:', preds[:10])

    # Test RMSE: the test nodes are the rows after the first len(df_train).
    rmse = mean_squared_error(
        y_test,
        preds[len(df_train):]
    ) ** 0.5
    print(f'Test RMSE: {rmse:.2f}')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.1089
Epoch 20, Loss: 0.5017
Epoch 40, Loss: 0.3643
Epoch 60, Loss: 0.3114
Epoch 80, Loss: 0.2810
Epoch 100, Loss: 0.2685
Epoch 120, Loss: 0.2545
Epoch 140, Loss: 0.2429
Epoch 160, Loss: 0.2271
Epoch 180, Loss: 0.2238
Epoch 200, Loss: 0.2122
Epoch 220, Loss: 0.2118
Epoch 240, Loss: 0.2035
Epoch 260, Loss: 0.2004
Epoch 280, Loss: 0.1981
Epoch 300, Loss: 0.1925
Epoch 320, Loss: 0.1882
Epoch 340, Loss: 0.1873
Epoch 360, Loss: 0.1844
Epoch 380, Loss: 0.1839
Epoch 400, Loss: 0.1833
Epoch 420, Loss: 0.1796
Epoch 440, Loss: 0.1768
Epoch 460, Loss: 0.1781
Epoch 480, Loss: 0.1765
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [16552404. 11033623. 15374494. 27908692. 29027126. 24963698. 17212064.
 34643104. 11724708. 13720763.]
Test RMSE: 6741784.12


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import GATConv, BatchNorm, JumpingKnowledge

# === 1. Load and prepare the data ===

# Load the feature frames and targets downloaded at the top of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with each frame's own column means. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: pandas warns
# that it stops working in 3.0 (the NaNs would silently remain), so the filled
# frame is assigned back instead.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

# Standardise the targets. The scaler is fitted on train only and reused for
# the test targets.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test (train rows first) so that one graph covers both.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only; the targets live separately in y_all.
X_all = df_all.values.astype(np.float32)

# Build a k-NN graph (10 nearest neighbours, no self loops) on the features.
# NOTE(review): nonzero() yields (row, col) pairs, i.e. edges run from each
# node to its neighbours; confirm this is the intended message-flow direction.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap features, edges and targets in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean masks recovering the split: the first len(df_train) nodes are train,
# the remaining ones are test.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train)] = True
test_mask[len(df_train):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Определение модели ===

class GATWithJK(torch.nn.Module):
    """Three-layer GAT with a Jumping Knowledge ('cat') readout and an MLP head.

    Every GATConv layer is followed by BatchNorm and ELU; dropout is applied after
    the first two layers. The three layer outputs are concatenated and mapped to
    the prediction by a two-layer MLP.

    Args:
        in_channels: width of the input node features.
        hidden_channels: per-head output width of each GAT layer.
        out_channels: width of the final prediction.
        heads: number of attention heads in the first two GAT layers
            (the third layer always uses a single head).
        dropout: probability used for attention dropout, the dropout between
            layers and the dropout inside the MLP head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=4, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        multi_head_width = hidden_channels * heads
        # Two multi-head layer outputs plus one single-head output are concatenated.
        jk_width = 2 * multi_head_width + hidden_channels

        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(multi_head_width)

        self.gat2 = GATConv(multi_head_width, hidden_channels, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(multi_head_width)

        self.gat3 = GATConv(multi_head_width, hidden_channels, heads=1, concat=True, dropout=dropout)
        self.bn3 = BatchNorm(hidden_channels)

        self.jk = JumpingKnowledge(mode='cat')

        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(jk_width, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels, out_channels)
        )

    def forward(self, x, edge_index):
        """Run the three GAT stages and return the MLP output for every node."""
        # (convolution, normalisation, apply dropout afterwards?)
        stages = (
            (self.gat1, self.bn1, True),
            (self.gat2, self.bn2, True),
            (self.gat3, self.bn3, False),
        )

        layer_outputs = []
        hidden = x
        for conv, norm, use_dropout in stages:
            hidden = F.elu(norm(conv(hidden, edge_index)))
            if use_dropout:
                hidden = F.dropout(hidden, p=self.dropout, training=self.training)
            layer_outputs.append(hidden)

        return self.mlp(self.jk(layer_outputs))
# Seed PyTorch so that weight initialisation and dropout masks repeat between runs.
torch.manual_seed(42)

model = GATWithJK(
    in_channels=X_all.shape[1],  # number of features, including the emb_* columns
    hidden_channels=64,
    out_channels=1,
    heads=4,
    dropout=0.4
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: the forward pass runs over all nodes, while the loss is
# computed on the training nodes only.
train_targets = data.y[data.train_mask].squeeze(-1)  # loop-invariant, computed once

model.train()
for epoch in range(500):
    optimizer.zero_grad()
    # squeeze(-1) drops only the output-channel axis, never the node axis.
    out = model(data.x, data.edge_index).squeeze(-1)
    loss = loss_fn(out[data.train_mask], train_targets)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze(-1).cpu().numpy()

# Map predictions back to the original target scale.
preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
true = y_train.tolist() + y_test.tolist()

# Show the first 10 values (these are training nodes, since train rows come first).
print('True values:', np.array(true[:10]))
print('Predictions:', preds[:10])

# RMSE on the test nodes only (everything after the training rows).
rmse = mean_squared_error(
    y_test,
    preds[len(df_train):]
) ** 0.5
print(f'Test RMSE: {rmse:.2f}')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[column].fillna(df_train[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(df_test[column].mean(), inplace=True)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0, Loss: 1.0559
Epoch 20, Loss: 0.2757
Epoch 40, Loss: 0.2151
Epoch 60, Loss: 0.1900
Epoch 80, Loss: 0.1806
Epoch 100, Loss: 0.1684
Epoch 120, Loss: 0.1614
Epoch 140, Loss: 0.1572
Epoch 160, Loss: 0.1531
Epoch 180, Loss: 0.1486
Epoch 200, Loss: 0.1519
Epoch 220, Loss: 0.1453
Epoch 240, Loss: 0.1423
Epoch 260, Loss: 0.1407
Epoch 280, Loss: 0.1391
Epoch 300, Loss: 0.1373
Epoch 320, Loss: 0.1365
Epoch 340, Loss: 0.1369
Epoch 360, Loss: 0.1314
Epoch 380, Loss: 0.1317
Epoch 400, Loss: 0.1323
Epoch 420, Loss: 0.1269
Epoch 440, Loss: 0.1290
Epoch 460, Loss: 0.1268
Epoch 480, Loss: 0.1314
True values: [19750000. 10100000. 16900000. 31900000. 31300000. 29000000. 13150000.
 37000000.  9771580. 14900000.]
Predictions: [15943336. 10905781. 14320043. 29991792. 29472586. 25686128. 14095543.
 36890552.  9952932. 14128744.]
Test RMSE: 6026126.23


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import GATConv, BatchNorm, JumpingKnowledge

# === 1. Data loading and preparation ===

# Load the feature frames and the targets.
# NOTE: unpickling can execute arbitrary code; only load these files from a trusted source.
df_train = pd.read_pickle('X_train_mosc_wembs.pickle')
df_test = pd.read_pickle('X_test_mosc_wembs.pickle')
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# Impute missing values with per-column means computed on the training split only,
# so no test-set statistics leak into preprocessing.
# The result is assigned back instead of calling `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated and stops working in pandas 3.0.
train_means = df_train.mean()
df_train = df_train.fillna(train_means)
df_test = df_test.fillna(train_means)

# Standardise the targets; the scaler is fitted on the training targets only.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Stack train and test rows (train first) so that one graph covers every sample.
df_all = pd.concat([df_train, df_test], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])

# Feature matrix only.
X_all = df_all.values.astype(np.float32)

# Build a kNN graph (10 nearest neighbours, the node itself excluded).
# NOTE(review): A[i, j] marks j as a neighbour of i, so the edges run i -> j and the
# graph is directed; confirm this is the intended message direction or symmetrise it.
A = kneighbors_graph(X_all, n_neighbors=10, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

# Wrap everything in a PyG Data object.
data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Boolean node masks: the first len(df_train) nodes are train, the rest are test.
n_train = len(df_train)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:n_train] = True
test_mask[n_train:] = True
data.train_mask = train_mask
data.test_mask = test_mask

# === 2. Model definition ===

class GATWithJK(torch.nn.Module):
    """GAT stack (three layers) whose layer outputs are concatenated by Jumping Knowledge.

    Layer pattern: GATConv -> BatchNorm -> ELU, with dropout after the first and
    second layer. The concatenated representation is fed to a two-layer MLP head.

    Args:
        in_channels: width of the input node features.
        hidden_channels: per-head output width of each GAT layer.
        out_channels: width of the final prediction.
        heads: attention heads used by the first two GAT layers; the last GAT
            layer uses one head.
        dropout: dropout probability shared by the GAT layers, the inter-layer
            dropout and the MLP head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=7, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        wide = hidden_channels * heads  # width of a concatenated multi-head output

        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(wide)

        self.gat2 = GATConv(wide, hidden_channels, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(wide)

        self.gat3 = GATConv(wide, hidden_channels, heads=1, concat=True, dropout=dropout)
        self.bn3 = BatchNorm(hidden_channels)

        # 'cat' mode concatenates the outputs of all layers along the feature axis.
        self.jk = JumpingKnowledge(mode='cat')

        head_layers = [
            torch.nn.Linear(wide + wide + hidden_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        ]
        self.mlp = torch.nn.Sequential(*head_layers)

    def forward(self, x, edge_index):
        """Return the MLP output computed from the concatenated layer representations."""
        first = F.elu(self.bn1(self.gat1(x, edge_index)))
        first = F.dropout(first, p=self.dropout, training=self.training)

        second = F.elu(self.bn2(self.gat2(first, edge_index)))
        second = F.dropout(second, p=self.dropout, training=self.training)

        # No dropout after the last GAT layer.
        third = F.elu(self.bn3(self.gat3(second, edge_index)))

        combined = self.jk([first, second, third])
        return self.mlp(combined)
# Seed PyTorch so that weight initialisation and dropout masks repeat between runs.
torch.manual_seed(42)

model = GATWithJK(
    in_channels=X_all.shape[1],  # number of features, including the emb_* columns
    hidden_channels=64,
    out_channels=1,
    heads=4,
    dropout=0.4
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# === 3. Model training ===

# Full-batch training: the forward pass runs over all nodes, while the loss is
# computed on the training nodes only.
train_targets = data.y[data.train_mask].squeeze(-1)  # loop-invariant, computed once

model.train()
for epoch in range(500):
    optimizer.zero_grad()
    # squeeze(-1) drops only the output-channel axis, never the node axis.
    out = model(data.x, data.edge_index).squeeze(-1)
    loss = loss_fn(out[data.train_mask], train_targets)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 4. Evaluation ===

model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze(-1).cpu().numpy()

# Map predictions back to the original target scale.
preds = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1)).flatten()
true = y_train.tolist() + y_test.tolist()

# Show the first 10 values (these are training nodes, since train rows come first).
print('True values:', np.array(true[:10]))
print('Predictions:', preds[:10])

# RMSE on the test nodes only (everything after the training rows).
rmse = mean_squared_error(
    y_test,
    preds[len(df_train):]
) ** 0.5
print(f'Test RMSE: {rmse:.2f}')

KeyboardInterrupt: 