# Загрузка данных

In [1]:
!gdown 19WJRvdFZgV1hxmVl52HCKq0gieTQk_uY #flats_checks_raster.csv
!gdown 1tbxE-SZO5R37TiBscVZ2zbTGeTAIsUYk #таргет price трейн
!gdown 1ZI-JsZ-6QxtHJKnxj1Mggscxumodxm64 # таргет price тест

Downloading...
From (original): https://drive.google.com/uc?id=19WJRvdFZgV1hxmVl52HCKq0gieTQk_uY
From (redirected): https://drive.google.com/uc?id=19WJRvdFZgV1hxmVl52HCKq0gieTQk_uY&confirm=t&uuid=4da11b1c-5b06-4d80-9173-064b626dc548
To: /content/flats_checks_raster.csv
100% 1.05G/1.05G [00:05<00:00, 202MB/s]
Downloading...
From: https://drive.google.com/uc?id=1tbxE-SZO5R37TiBscVZ2zbTGeTAIsUYk
To: /content/y_train_msk_merged_2.pkl
100% 788k/788k [00:00<00:00, 24.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZI-JsZ-6QxtHJKnxj1Mggscxumodxm64
To: /content/y_test_msk_merged_2.pkl
100% 140k/140k [00:00<00:00, 83.6MB/s]


In [2]:
!pip install torch_geometric
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.neighbors import kneighbors_graph
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.model_selection import train_test_split

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


# Предобработка датасетов. Выделение датасетов из единого

In [25]:
import pandas as pd
# 1. Загружаем и ставим индекс. Это требуется потому, что индексы слетели из-за сокращения датасета после добавления данных ФНС
df_mega_base = pd.read_csv('flats_checks_raster.csv', index_col='Unnamed: 0')

# 1.1. Загружаем y_train и y_test. Таргеты были отдельно
y_train = pd.read_pickle('y_train_msk_merged_2.pkl')
y_test = pd.read_pickle('y_test_msk_merged_2.pkl')

# 2. Объединяем их в один DataFrame
y_full = pd.concat([y_train, y_test])

# 3. Присоединяем таргеты к df_mega_base по индексу
df_mega_base = df_mega_base.join(y_full, how='left')
df_mega_base = df_mega_base.reset_index(drop=True)

# 2. Удаляем все столбцы с 'emb' в названии. Удаляем эмбеддинги Славы, т.к. датасет содержал эмбеддинги tabfpn, мне они не трубуются
df_mega_base = df_mega_base.loc[:, ~df_mega_base.columns.str.contains('emb')]

# 3. Удаляем столбцы
df_mega_base = df_mega_base.drop([
    'TruncatedAverageBill', 'MedianBill',
    'lat', 'lng', 'geometry', 'index_right',
    'coordinates', 'polygon', 'district_id'
], axis=1, errors='ignore')

In [26]:
# заполняем пропуски средним
for column in df_mega_base.columns:
    df_mega_base[column].fillna(df_mega_base[column].mean(), inplace=True)

# 4. Создаём df_check из столбцов ФНС. на нем будем валидироваться
df_check = df_mega_base[[
    'KktCount',
    'AverageBill',
    'CachePayPercent',
    'IntensityOfNumberBills',
    'RevenueIntensity',
    'ReceiptTotalCount',
    'price'
]].copy()

# 5. df_flats_rasters — это оставшиеся столбцы. Получаем датасет только из квартир + POI + растры
df_flats_rasters = df_mega_base.drop(columns=df_check.columns, errors='ignore')


# 6. Удаляем все столбцы, содержащие 'rast' в названии
df_flats = df_flats_rasters.loc[:, ~df_flats_rasters.columns.str.contains('rast')]

# 7. Делаем датасет, в котором есть только исходные данные по квартирам (ЦИАН)
POI = [
    'distance_to_center',
    'highways_count',
    'undergrounds_count',
    'railways_count',
    'time_to_metro',
    'pop_dense',
    'pop_work_dense',
    'pop_child_dense',
    'avg_age',
    'trade_per_pers',
    'pers_per_bed',
    'self_goods_sold',
    'other_goods_sold',
    'org_num',
    'entrep_num',
    'buildings_apartments',
    'buildings_service',
    'buildings_retail',
    'buildings_kindergarten',
    'buildings_school',
    'buildings_office',
    'buildings_construction',
    'buildings_commercial',
    'buildings_hospital',
    'buildings_university',
    'buildings_public',
    'buildings_industrial',
    'buildings_church',
    'education',
    'food_buy',
    'food_out',
    'health',
    'leisure',
    'religion',
    'services',
    'shopping',
    'transport',
    'mun_district',
]

df_flats_without_POI = df_flats.drop(columns=POI, errors='ignore')
df_flats_rasters_only = df_flats_rasters.drop(columns=POI, errors='ignore')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mega_base[column].fillna(df_mega_base[column].mean(), inplace=True)


# GNN предсказание на датасете квартир + растровой гео-информации

# GNN
Датасет квартиры МСК

In [27]:
# === 1. Загрузка и подготовка данных ===
y = df_mega_base['price']
X = df_flats_without_POI.copy()

# Делим на train и test
df_train_flats, df_test_flats, y_train_price, y_test_price = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# Масштабируем таргеты
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train_price.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test_price.values.reshape(-1, 1))

# Объединение
df_all = pd.concat([df_train_flats, df_test_flats], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Построение графа и PyG Data ===
A = kneighbors_graph(X_all, n_neighbors=8, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

In [39]:
# === NEW: определим устройство ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Маски
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train_flats)] = True
test_mask[len(df_train_flats):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# Переносим data на GPU
data = data.to(device)

# === 3. GNN модель ===
class DeepGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super(DeepGNN, self).__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels // 2, out_channels)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        x = F.relu(x)

        return self.mlp(x)

    def get_embeddings(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        return x


# Модель и оптимизатор
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=192, out_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

In [41]:
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Оценка модели (RMSE на тесте) ===
model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze()
    preds_test_scaled = preds_scaled[data.test_mask].cpu().numpy()
    y_test_scaled_true = data.y[data.test_mask].squeeze().cpu().numpy()

    # Обратное масштабирование
    preds_test = target_scaler.inverse_transform(preds_test_scaled.reshape(-1, 1))
    y_test_true = target_scaler.inverse_transform(y_test_scaled_true.reshape(-1, 1))

    # Вычисляем RMSE
    rmse = mean_squared_error(y_test_true, preds_test) ** 0.5
    print(f'\nTest RMSE: {rmse:.2f}')

Epoch 0, Loss: 0.5569
Epoch 10, Loss: 0.5178
Epoch 20, Loss: 0.5617
Epoch 30, Loss: 0.5633
Epoch 40, Loss: 0.5214
Epoch 50, Loss: 0.4895
Epoch 60, Loss: 0.4563
Epoch 70, Loss: 0.4493
Epoch 80, Loss: 0.5116
Epoch 90, Loss: 0.5189
Epoch 100, Loss: 0.4592
Epoch 110, Loss: 0.4173
Epoch 120, Loss: 0.3977
Epoch 130, Loss: 0.4055
Epoch 140, Loss: 0.3918
Epoch 150, Loss: 0.3973
Epoch 160, Loss: 0.3637
Epoch 170, Loss: 0.3534
Epoch 180, Loss: 0.4313
Epoch 190, Loss: 0.4267
Epoch 200, Loss: 0.3880
Epoch 210, Loss: 0.3577
Epoch 220, Loss: 0.3429
Epoch 230, Loss: 0.3372
Epoch 240, Loss: 0.3416
Epoch 250, Loss: 0.4106
Epoch 260, Loss: 0.4183
Epoch 270, Loss: 0.3659
Epoch 280, Loss: 0.3566
Epoch 290, Loss: 0.3389
Epoch 300, Loss: 0.3269
Epoch 310, Loss: 0.3818
Epoch 320, Loss: 0.3912
Epoch 330, Loss: 0.3431
Epoch 340, Loss: 0.3455
Epoch 350, Loss: 0.3266
Epoch 360, Loss: 0.4009
Epoch 370, Loss: 0.4360
Epoch 380, Loss: 0.3946
Epoch 390, Loss: 0.3636
Epoch 400, Loss: 0.3460
Epoch 410, Loss: 0.3393
Epo

# GNN
Датасет квартиры МСК + POI

In [42]:
# === 1. Загрузка и подготовка данных ===
y = df_mega_base['price']
X = df_flats.copy()

# Делим на train и test
df_train_flats, df_test_flats, y_train_price, y_test_price = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# Масштабируем таргеты
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train_price.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test_price.values.reshape(-1, 1))

# Объединение
df_all = pd.concat([df_train_flats, df_test_flats], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Построение графа и PyG Data ===
A = kneighbors_graph(X_all, n_neighbors=8, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

In [43]:
# === NEW: определим устройство ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Маски
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train_flats)] = True
test_mask[len(df_train_flats):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# Переносим data на GPU
data = data.to(device)

# === 3. GNN модель ===
class DeepGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super(DeepGNN, self).__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels // 2, out_channels)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        x = F.relu(x)

        return self.mlp(x)

    def get_embeddings(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        return x


# Модель и оптимизатор
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=192, out_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

In [44]:
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Оценка модели (RMSE на тесте) ===
model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze()
    preds_test_scaled = preds_scaled[data.test_mask].cpu().numpy()
    y_test_scaled_true = data.y[data.test_mask].squeeze().cpu().numpy()

    # Обратное масштабирование
    preds_test = target_scaler.inverse_transform(preds_test_scaled.reshape(-1, 1))
    y_test_true = target_scaler.inverse_transform(y_test_scaled_true.reshape(-1, 1))

    # Вычисляем RMSE
    rmse = mean_squared_error(y_test_true, preds_test) ** 0.5
    print(f'\nTest RMSE: {rmse:.2f}')

Epoch 0, Loss: 1.1371
Epoch 10, Loss: 1.0406
Epoch 20, Loss: 1.0032
Epoch 30, Loss: 1.0014
Epoch 40, Loss: 1.0005
Epoch 50, Loss: 1.0003
Epoch 60, Loss: 1.0005
Epoch 70, Loss: 0.9999
Epoch 80, Loss: 1.0000
Epoch 90, Loss: 1.0000
Epoch 100, Loss: 1.0004
Epoch 110, Loss: 1.0001
Epoch 120, Loss: 0.9998
Epoch 130, Loss: 0.9998
Epoch 140, Loss: 0.9997
Epoch 150, Loss: 0.9993
Epoch 160, Loss: 0.9993
Epoch 170, Loss: 0.9988
Epoch 180, Loss: 0.9986
Epoch 190, Loss: 0.9981
Epoch 200, Loss: 0.9972
Epoch 210, Loss: 0.9964
Epoch 220, Loss: 0.9955
Epoch 230, Loss: 0.9956
Epoch 240, Loss: 0.9945
Epoch 250, Loss: 0.9938
Epoch 260, Loss: 0.9933
Epoch 270, Loss: 0.9917
Epoch 280, Loss: 0.9899
Epoch 290, Loss: 0.9883
Epoch 300, Loss: 0.9881
Epoch 310, Loss: 0.9841
Epoch 320, Loss: 0.9849
Epoch 330, Loss: 0.9827
Epoch 340, Loss: 0.9811
Epoch 350, Loss: 0.9785
Epoch 360, Loss: 0.9769
Epoch 370, Loss: 0.9753
Epoch 380, Loss: 0.9749
Epoch 390, Loss: 0.9715
Epoch 400, Loss: 0.9733
Epoch 410, Loss: 0.9673
Epo

# GNN
Датасет квартиры МСК + растровые данные

In [45]:
# === 1. Загрузка и подготовка данных ===
y = df_mega_base['price']
X = df_flats_rasters

# Делим на train и test
df_train_flats, df_test_flats, y_train_price, y_test_price = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# Масштабируем таргеты
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train_price.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test_price.values.reshape(-1, 1))

# Объединение
df_all = pd.concat([df_train_flats, df_test_flats], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Построение графа и PyG Data ===
A = kneighbors_graph(X_all, n_neighbors=8, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

In [48]:
# === NEW: определим устройство ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Маски
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train_flats)] = True
test_mask[len(df_train_flats):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# Переносим data на GPU
data = data.to(device)

# === 3. GNN модель ===
class DeepGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.3):
        super(DeepGNN, self).__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels // 2, out_channels)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        x = F.relu(x)

        return self.mlp(x)

    def get_embeddings(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        return x


# Модель и оптимизатор
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=192, out_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

In [49]:
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Оценка модели (RMSE на тесте) ===
model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze()
    preds_test_scaled = preds_scaled[data.test_mask].cpu().numpy()
    y_test_scaled_true = data.y[data.test_mask].squeeze().cpu().numpy()

    # Обратное масштабирование
    preds_test = target_scaler.inverse_transform(preds_test_scaled.reshape(-1, 1))
    y_test_true = target_scaler.inverse_transform(y_test_scaled_true.reshape(-1, 1))

    # Вычисляем RMSE
    rmse = mean_squared_error(y_test_true, preds_test) ** 0.5
    print(f'\nTest RMSE: {rmse:.2f}')

Epoch 0, Loss: 1.0050
Epoch 10, Loss: 1.0010
Epoch 20, Loss: 1.0026
Epoch 30, Loss: 1.0009
Epoch 40, Loss: 0.9957
Epoch 50, Loss: 0.9993
Epoch 60, Loss: 0.9637


KeyboardInterrupt: 

# GNN
Датасет квартиры МСК + POI + растровые данные

In [51]:
# === 1. Загрузка и подготовка данных ===
y = df_mega_base['price']
X = df_flats_rasters.copy()

# Делим на train и test
df_train_flats, df_test_flats, y_train_price, y_test_price = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# Масштабируем таргеты
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train_price.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test_price.values.reshape(-1, 1))

# Объединение
df_all = pd.concat([df_train_flats, df_test_flats], ignore_index=True)
y_all = np.vstack([y_train_scaled, y_test_scaled])
X_all = df_all.values.astype(np.float32)

# === 2. Построение графа и PyG Data ===
A = kneighbors_graph(X_all, n_neighbors=8, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(A.nonzero()), dtype=torch.long)

In [60]:
# === NEW: определим устройство ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = Data(
    x=torch.tensor(X_all, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y_all, dtype=torch.float)
)

# Маски
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(df_train_flats)] = True
test_mask[len(df_train_flats):] = True
data.train_mask = train_mask
data.test_mask = test_mask

# Переносим data на GPU
data = data.to(device)

# === 3. GNN модель ===
class DeepGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.4):
        super(DeepGNN, self).__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_channels // 2, out_channels)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        x = self.bn3(x)
        x = F.relu(x)
        return self.mlp(x)

    def get_embeddings(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)

        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)

        x = self.conv3(x, edge_index)
        x = self.bn3(x)
        x = F.relu(x)
        return x

# Модель и оптимизатор
model = DeepGNN(in_channels=X_all.shape[1], hidden_channels=192, out_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

In [61]:
model.train()
for epoch in range(800):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index).squeeze()
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask].squeeze())
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# === 5. Оценка модели (RMSE на тесте) ===
model.eval()
with torch.no_grad():
    preds_scaled = model(data.x, data.edge_index).squeeze()
    preds_test_scaled = preds_scaled[data.test_mask].cpu().numpy()
    y_test_scaled_true = data.y[data.test_mask].squeeze().cpu().numpy()

    # Обратное масштабирование
    preds_test = target_scaler.inverse_transform(preds_test_scaled.reshape(-1, 1))
    y_test_true = target_scaler.inverse_transform(y_test_scaled_true.reshape(-1, 1))

    # Вычисляем RMSE
    rmse = mean_squared_error(y_test_true, preds_test) ** 0.5
    print(f'\nTest RMSE: {rmse:.2f}')

Epoch 0, Loss: 1.0262
Epoch 10, Loss: 0.9005
Epoch 20, Loss: 0.8428
Epoch 30, Loss: 0.8027
Epoch 40, Loss: 0.7708
Epoch 50, Loss: 0.7325
Epoch 60, Loss: 0.6915
Epoch 70, Loss: 0.6575
Epoch 80, Loss: 0.6297
Epoch 90, Loss: 0.6060
Epoch 100, Loss: 0.5724
Epoch 110, Loss: 0.5532
Epoch 120, Loss: 0.5297
Epoch 130, Loss: 0.5051
Epoch 140, Loss: 0.4854
Epoch 150, Loss: 0.4680
Epoch 160, Loss: 0.4516
Epoch 170, Loss: 0.4381
Epoch 180, Loss: 0.4154
Epoch 190, Loss: 0.4052
Epoch 200, Loss: 0.3859
Epoch 210, Loss: 0.3743
Epoch 220, Loss: 0.3610
Epoch 230, Loss: 0.3519
Epoch 240, Loss: 0.3380
Epoch 250, Loss: 0.3301
Epoch 260, Loss: 0.3209
Epoch 270, Loss: 0.3205
Epoch 280, Loss: 0.3066
Epoch 290, Loss: 0.2979
Epoch 300, Loss: 0.2936
Epoch 310, Loss: 0.2903
Epoch 320, Loss: 0.2775
Epoch 330, Loss: 0.2737
Epoch 340, Loss: 0.2661
Epoch 350, Loss: 0.2639
Epoch 360, Loss: 0.2542
Epoch 370, Loss: 0.2485
Epoch 380, Loss: 0.2500
Epoch 390, Loss: 0.2444
Epoch 400, Loss: 0.2415
Epoch 410, Loss: 0.2392
Epo

Это лучший результат, извлечем его эмбеддинги для дальнейшего использования

In [54]:
# === 5. Извлечение эмбеддингов ===
model.eval()
with torch.no_grad():
    embeddings = model.get_embeddings(data.x, data.edge_index).cpu().numpy()

In [55]:
# 1. Создаем DataFrame с эмбеддингами и правильными индексами из X_all
embeddings = pd.DataFrame(embeddings, index=df_mega_base.index)
embeddings.to_csv('embeddings.csv')

In [56]:
#подгружаем эмбеддинги, чтобы не пересчитывать
#!gdown 1pZIcstb6kL90U8NfTpDKd7TOHr2FtEFC
#embeddings = pd.read_csv('embeddings.csv')

# Валидация. Датасет с фискальными данными

## Предсказание catboost на сыром датасете

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# === 1. Берем df_check и определяем X и y ===
target_col = 'AverageBill'
X = df_check.drop(columns=[target_col])
y = df_check[target_col]

# === 2. Делим на train/test ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# === 3. Масштабируем признаки по train ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 4. Обучение CatBoost ===
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=0,
    random_state=999
)
model.fit(X_train, y_train)

# === 5. Предсказания и оценка RMSE ===
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {rmse:.2f}')


RMSE: 120.35


## Предсказание catboost на датасете с добавленными эмбеддингами из GNN

In [58]:
# 1. Создаем DataFrame с эмбеддингами и правильными индексами из X_all
emb_df = pd.DataFrame(embeddings, index=df_mega_base.index)
emb_df.columns = [f'emb_{i}' for i in range(embeddings.shape[1])]

# 2. Присоединяем эмбеддинги по индексу
existing_cols = set(df_check.columns)

# 3. Отбираем только те столбцы из emb_df, которых нет в df_check
emb_to_add = emb_df[[col for col in emb_df.columns if col not in existing_cols]]

# 4. Присоединяем по индексу
df_check_emb = df_check.join(emb_to_add, how='left')

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# === 1. Берем df_check и определяем X и y ===
target_col = 'AverageBill'
X = df_check_emb.drop(columns=[target_col])
y = df_check_emb[target_col]

# === 2. Делим на train/test ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=999
)

# === 3. Масштабируем признаки по train ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 4. Обучение CatBoost ===
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=0,
    random_state=999
)
model.fit(X_train, y_train)

# === 5. Предсказания и оценка RMSE ===
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {rmse:.2f}')


RMSE: 132.22


На датасете с фискальной информацией полученные GNN на квартирах эмбеддинги не дали улучшения метрики, а наоборот - ухудшили результаты. Видимо, они внесли шум, либо датасет квартир был недостаточно большим