In [3]:
%%capture
!pip install osmnx contextily

In [4]:
import osmnx as ox
import matplotlib.pyplot as plt
import contextily as ctx
from tqdm import tqdm
from shapely.geometry import box
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import numpy as np
import os

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## 1. Парсер карт

In [None]:
log_file_path = '/content/drive/MyDrive/masters/geo-embeddings/raster_osm_parser/moscow_maps_log.txt'

def write_to_log(index):
    with open(log_file_path, 'a') as f:  # 'a' - режим добавления в конец файла
        f.write(f"{index}\n")

In [None]:
!mkdir moscow_maps

mkdir: cannot create directory ‘moscow_maps’: File exists


In [6]:
df = pd.read_csv("merged_df_final.csv")

coordinates = list(zip(df['level_0'], df['lat'], df['lng']))

In [7]:
df[["lat", "lng"]]

Unnamed: 0,lat,lng
0,55.670696,37.750891
1,55.673778,37.764671
2,55.699097,37.640911
3,55.699097,37.640911
4,55.783635,37.458337
...,...,...
38593,55.790346,37.469469
38594,55.857725,37.615614
38595,55.710159,37.469952
38596,55.716554,37.457520


In [None]:
def get_fast_map(lat, lon, dist=400, save_path=None):
    # 1. Создаем bounding box
    bbox = ox.utils_geo.bbox_from_point((lat, lon), dist=dist)
    polygon = box(*bbox)

    # 2. Настройка тегов для загрузки
    ox.settings.useful_tags_way =  []

    # 3. Параллельная загрузка зданий и дорог одним запросом (новые версии OSMnx)
    gdf = ox.features_from_polygon(polygon, tags={'building': True,
                                                    'highway': True})

    # Фильтруем здания
    buildings = gdf[gdf['geometry'].notnull() & gdf['building'].notnull()].copy()

    # Фильтруем дороги (исключаем пешеходные дорожки и тропинки для упрощения)
    roads = gdf[gdf['geometry'].notnull() &
            gdf['highway'].notnull() &
            ~gdf['highway'].isin(['footway', 'path', 'pedestrian',
                                  'steps', 'cycleway', 'service',
                                  'unclassified', ])].copy()

    roads = roads[roads.geom_type.isin(['LineString', 'MultiLineString'])]

    # 4. Визуализация
    fig, ax = plt.subplots(figsize=(10, 10))

    # Сначала рисуем дороги (чтобы они были под зданиями)
    if not roads.empty:
        roads.plot(
            ax=ax,
            color='black',
            linewidth=0.7,
            alpha=0.8
        )

    # Затем рисуем здания поверх дорог
    if not buildings.empty:
        buildings.plot(
            ax=ax,
            color='lightgray',
            edgecolor='dimgray',
            linewidth=0.5,
            alpha=0.7
        )

    # 5. Настройка границ
    buffer = dist * 0.000015
    ax.set_xlim([lon - buffer, lon + buffer])
    ax.set_ylim([lat - buffer, lat + buffer])
    ax.axis('off')

    if save_path:
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=50)
    plt.close()

    return save_path

In [None]:
# Пример использования
i = 0
for index, lat, lon in tqdm(coordinates):
    try:
        get_fast_map(lat, lon, save_path=f"moscow_maps/map_{i}.png")
        write_to_log(index)  # Записываем index в лог
        i += 1
    except Exception as e:
        with open(log_file_path, 'a') as f:
            f.write(f"Error processing index {index}: {str(e)}\n")

## 2. Получение эмбеддингов

In [None]:
%%capture
!unzip drive/MyDrive/masters/geo-embeddings/raster_osm_parser/ekb/e_m_res.zip -d /content/maps_2
os.makedirs("maps_emb_ekb", exist_ok=True)

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image

In [None]:
model = models.efficientnet_b7(pretrained=True)

model.classifier = torch.nn.Identity()

model.eval()

preprocess = transforms.Compose([
    transforms.Resize((200)),
    transforms.CenterCrop(224),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])

def get_emd(path):
    img = Image.open(path)
    img_tensor = preprocess(img).unsqueeze(0)

    with torch.no_grad():
        embedding = model(img_tensor).squeeze().numpy()
    return embedding

# get_emd("map_6.png")



In [None]:
for map_name in tqdm(os.listdir("maps_2"), desc="Эмбеддинги изображений"):
    try:
        temp_array = get_emd(f"maps_2/{map_name}")

        parts = map_name.split("_")
        n = parts[1].split(".")[0]

        folder_path = "maps_emb_ekb"
        file_name = f"{n}.npy"
        file_path = os.path.join(folder_path, file_name)

        np.save(file_path, temp_array)

    except Exception as e:
        print(f"Произошла ошибка: {e}")

Эмбеддинги изображений: 100%|██████████| 9117/9117 [51:10<00:00,  2.97it/s]


In [None]:
!zip -r ekb_emb.zip /content/maps_emb_ekb

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  adding: content/maps_emb_ekb/1927.npy (deflated 8%)
  adding: content/maps_emb_ekb/8872.npy (deflated 8%)
  adding: content/maps_emb_ekb/1210.npy (deflated 8%)
  adding: content/maps_emb_ekb/6635.npy (deflated 8%)
  adding: content/maps_emb_ekb/5822.npy (deflated 8%)
  adding: content/maps_emb_ekb/2694.npy (deflated 8%)
  adding: content/maps_emb_ekb/4342.npy (deflated 8%)
  adding: content/maps_emb_ekb/568.npy (deflated 8%)
  adding: content/maps_emb_ekb/7421.npy (deflated 8%)
  adding: content/maps_emb_ekb/1958.npy (deflated 8%)
  adding: content/maps_emb_ekb/8405.npy (deflated 8%)
  adding: content/maps_emb_ekb/2670.npy (deflated 8%)
  adding: content/maps_emb_ekb/3932.npy (deflated 8%)
  adding: content/maps_emb_ekb/7116.npy (deflated 8%)
  adding: content/maps_emb_ekb/3677.npy (deflated 8%)
  adding: content/maps_emb_ekb/803.npy (deflated 8%)
  adding: content/maps_emb_ekb/5305.npy (deflated 8%)
  

In [None]:
!cp ekb_emb.zip /content/drive/MyDrive/masters/geo-embeddings/raster_osm_parser/ekb/

In [None]:
file_path = "maps_emb_ekb/0.npy"

# Load the .npy file
data = np.load(file_path)

# Print the loaded data
print(data)

[ 0.25682458 -0.17701226  0.11119191 ... -0.15363914 -0.16109379
 -0.12165738]


## 3. Эмбеддинги + табличные данные

In [None]:
!unzip -q "/content/drive/MyDrive/masters/geo-embeddings/raster_osm_parser/spb/spb_emb.zip" -d "/content/maps_emb_spb"

In [None]:
spb_data = pd.read_csv("merged_df_final_spb.csv")

In [None]:
spb_data.head()

Unnamed: 0,level_0,dealType,roomsCount,repairType,hasFurniture,isApartments,floorNumber,flatType,livingArea,windowsViewType,...,buildings_university,education,food_buy,food_out,health,leisure,religion,services,shopping,transport
0,0,sale,1,cosmetic,unknown,False,2,rooms,18.6,street,...,1.0,43,100,379,82,9,6,78,251,0
1,1,sale,3,no,unknown,False,5,rooms,30.0,yard,...,0.0,8,32,19,15,1,1,13,33,0
2,2,sale,2,euro,true,False,22,rooms,18.7,unknown,...,0.0,19,43,40,32,1,0,22,52,0
3,3,sale,2,cosmetic,unknown,False,13,rooms,29.0,street,...,0.0,6,15,13,13,0,3,7,11,0
4,4,sale,1,cosmetic,unknown,False,5,rooms,19.0,yard,...,0.0,11,18,17,15,0,1,5,12,0


In [None]:
# Путь к папке с .npy файлами
folder_path = 'maps_emb_spb/content/maps_emb_spb'

# Шаг 1: Загрузка всех .npy файлов в словарь
embeddings_dict = {}
for file_name in os.listdir(folder_path):
    if file_name.endswith('.npy'):
        # Извлекаем имя файла без расширения
        file_key = os.path.splitext(file_name)[0]
        # Загружаем данные из .npy файла
        embeddings_dict[file_key] = np.load(os.path.join(folder_path, file_name))

# Шаг 2: Сопоставление с датафреймом
spb_data['level_0'] = spb_data['level_0'].astype(str)

# Создаем новую колонку для эмбеддингов
spb_data['embedding'] = spb_data['level_0'].apply(lambda x: embeddings_dict.get(x, None))

# Шаг 3: Проверка результата
missing_embeddings = spb_data[spb_data['embedding'].isnull()]
missing_embeddings

Unnamed: 0,level_0,dealType,roomsCount,repairType,hasFurniture,isApartments,floorNumber,flatType,livingArea,windowsViewType,...,education,food_buy,food_out,health,leisure,religion,services,shopping,transport,embedding
24378,24378,sale,2,cosmetic,unknown,False,1,rooms,31.8,yard,...,11,16,7,5,0,0,5,8,0,
24379,24379,sale,2,euro,true,False,3,rooms,30.2,yard,...,12,49,44,47,2,0,48,155,0,
24380,24380,sale,2,no,unknown,False,4,rooms,28.3,street,...,41,77,171,86,2,1,60,173,0,
24381,24381,sale,2,no,unknown,False,4,rooms,28.3,street,...,41,77,171,86,2,1,60,173,0,
24382,24382,sale,2,no,unknown,False,1,rooms,39.0,yard,...,43,78,237,58,13,8,48,198,0,
24383,24383,sale,2,no,unknown,False,1,rooms,39.0,yard,...,43,78,237,58,13,8,48,198,0,
24384,24384,sale,2,euro,false,False,5,rooms,27.6,yardandstreet,...,7,43,43,18,2,0,11,74,0,
24385,24385,sale,1,euro,unknown,False,16,rooms,12.4,street,...,12,41,21,23,2,2,22,52,0,
24386,24386,sale,4,cosmetic,unknown,False,4,rooms,71.0,yardandstreet,...,49,65,186,75,3,3,56,177,0,
24387,24387,sale,4,cosmetic,unknown,False,4,rooms,71.0,yardandstreet,...,49,65,186,75,3,3,56,177,0,


In [None]:
spb_data = spb_data[spb_data['embedding'].isnull() != True]
spb_data[spb_data['embedding'].isnull()]

Unnamed: 0,level_0,dealType,roomsCount,repairType,hasFurniture,isApartments,floorNumber,flatType,livingArea,windowsViewType,...,education,food_buy,food_out,health,leisure,religion,services,shopping,transport,embedding


In [None]:
spb_data.to_csv("merged_df_emb_spb.csv")