In [1]:
!pip install catboost osmnx scipy geopy

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting osmnx
  Downloading osmnx-1.9.3-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.2/107.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost, osmnx
Successfully installed catboost-1.2.5 osmnx-1.9.3


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.spatial import cKDTree
from geopy.distance import geodesic
from concurrent.futures import ThreadPoolExecutor
from catboost import CatBoostRegressor, Pool
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point, MultiPoint
import requests
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore', message="The indices of the two GeoSeries are different.")

In [3]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # радиус Земли в километрах

    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    distance = R * c
    return distance

In [6]:
df=pd.read_json('train_data.json')
df=pd.concat([df,pd.json_normalize(df['targetAudience'])], axis=1)
df=df.drop(['targetAudience','id'], axis=1)

In [7]:
df['geometry'] = df['points'].apply(lambda x: Point(float(x[0]['lon']), float(x[0]['lat'])))
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.crs = 'EPSG:4326'

In [8]:
moscow = ox.geocode_to_gdf('Moscow, Russia')

population_density = { # wiki
    'Центральный': 11702.67,
    'Северный': 10709.15,
    'Северо-Восточный': 14289.05,
    'Восточный': 9743.75,
    'Юго-Восточный': 12893.76,
    'Южный': 13422.73,
    'Юго-Западный': 12890.82,
    'Западный': 9312.38,
    'Северо-Западный': 11144.78,
    'Зеленоградский': 7272.25,
    'Новомосковский': 1497.79,
    'Троицкий': 181.13
}

# Функция для определения административного округа
def get_district(point):
    for idx, row in moscow.iterrows():
        if row.geometry.contains(point):
            return row['name']
    return 'Unknown'

gdf['district'] = gdf.apply(lambda row: get_district(row.geometry), axis=1)
gdf['population_density'] = gdf['district'].map(population_density)


In [9]:
# Расстояние до центра (Красная площадь)
red_square = Point(37.620393, 55.753930)
gdf['distance_to_center'] = gdf.apply(lambda row: haversine_distance(row.geometry.y, row.geometry.x, red_square.y, red_square.x), axis=1)

# Ближайшая станция метро
metro_stations = ox.features_from_place('Moscow, Russia', tags={'railway': 'station', 'station': 'subway'})
gdf['distance_to_metro'] = gdf.apply(lambda row: metro_stations.distance(row.geometry).min() / 1000, axis=1)

shopping_centers = ox.features_from_place('Moscow, Russia', tags={'shop': 'mall'})
gdf['distance_to_shopping_center'] = gdf.apply(lambda row: shopping_centers.distance(row.geometry).min() / 1000, axis=1)


  gdf['distance_to_metro'] = gdf.apply(lambda row: metro_stations.distance(row.geometry).min() / 1000, axis=1)

  gdf['distance_to_shopping_center'] = gdf.apply(lambda row: shopping_centers.distance(row.geometry).min() / 1000, axis=1)


In [10]:
def split_on_intervals(min_val, max_val, n):
    step = (max_val - min_val) / n
    intervals = [min_val + (step * x) for x in range(n + 1)]
    return intervals

def create_groups(x_intervals, y_intervals):
    groups = {}
    x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
    y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])

    for x_i in range(len(x_intervals) - 1):
        for y_i in range(len(y_intervals) - 1):
            groups[f'x:{x_intervals[x_i]:.2f}-{x_intervals[x_i+1]:.2f}|y:{y_intervals[y_i]:.2f}-{y_intervals[y_i+1]:.2f}'] = 0

    return groups


def sort_on_groups(x_vals, y_vals, x_intervals, y_intervals, groups, only_vals=False):
    for x, y in zip(x_vals, y_vals):
        for x_i in range(len(x_intervals) - 1):
            for y_i in range(len(y_intervals) - 1):
                if (x_intervals[x_i] <= x < x_intervals[x_i + 1]) and (y_intervals[y_i] <= y < y_intervals[y_i + 1]):
                    groups[f'x:{x_intervals[x_i]:.2f}-{x_intervals[x_i+1]:.2f}|y:{y_intervals[y_i]:.2f}-{y_intervals[y_i+1]:.2f}'] += 1

    return list(groups.values()) if only_vals else groups

def create_dataset(config, gdf):
    x_intervals = split_on_intervals(config['min_xval'], config['max_xval'], config['x_ngroups'])
    y_intervals = split_on_intervals(config['min_yval'], config['max_yval'], config['y_ngroups'])

    groups = create_groups(x_intervals, y_intervals)

    groups_values = []
    for _, row in gdf.iterrows():
        points = np.array([[float(x['lat']), float(x['lon'])] for x in row['points']])
        group_values = sort_on_groups(points[:, 0], points[:, 1], x_intervals, y_intervals, groups.copy(), only_vals=True)
        groups_values.append(group_values)

    groups_values = np.array(groups_values)

    result = pd.DataFrame(groups_values, columns=list(groups.keys()))

    # Add additional features
    result['num_points'] = gdf['points'].apply(len)
    result['avg_lat'] = gdf['points'].apply(lambda x: np.mean([float(p['lat']) for p in x]))
    result['avg_lon'] = gdf['points'].apply(lambda x: np.mean([float(p['lon']) for p in x]))
    result['avg_azimuth'] = gdf['points'].apply(lambda x: np.mean([p['azimuth'] for p in x]))

    # # Add target audience features
    result['gender'] = gdf['gender'].map({'all': 0, 'male': 1, 'female': 2})
    result['ageFrom'] = gdf['ageFrom']
    result['ageTo'] = gdf['ageTo']
    result['age_range'] = gdf['ageTo'] - gdf['ageFrom']
    result['income'] = gdf['income'].map({'a': 1, 'b': 2, 'c': 3, 'ab': 4, 'bc': 5, 'ac': 6, 'abc': 7})

    #Add new features
    result['distance_to_center'] = gdf['distance_to_center']
    result['distance_to_metro'] = gdf['distance_to_metro']
    result['population_density'] = gdf['population_density']
    result['distance_to_shopping_center'] = gdf['distance_to_shopping_center']

    return result

  and should_run_async(code)


In [11]:
config = {'min_xval':55.55, 'max_xval':55.95, 'min_yval':37.3, 'max_yval':37.9, 'x_ngroups': 33, 'y_ngroups': 33}

In [12]:
X = create_dataset(config, gdf)
y = df['value']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  and should_run_async(code)


In [14]:
model = CatBoostRegressor(iterations=1700,
                          depth=6,
                          learning_rate=0.04,
                          grow_policy='SymmetricTree',
                          random_state=42,
                          loss_function='RMSE')

model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)

0:	learn: 23.7090362	test: 21.7998817	best: 21.7998817 (0)	total: 60.7ms	remaining: 1m 43s
100:	learn: 9.3417619	test: 9.4355152	best: 9.4355152 (100)	total: 1.26s	remaining: 19.9s
200:	learn: 7.4205334	test: 8.1602172	best: 8.1602172 (200)	total: 2.44s	remaining: 18.2s
300:	learn: 6.2832707	test: 7.5366365	best: 7.5366365 (300)	total: 3.63s	remaining: 16.9s
400:	learn: 5.6673500	test: 7.1649117	best: 7.1649117 (400)	total: 4.79s	remaining: 15.5s
500:	learn: 5.1983372	test: 6.9555953	best: 6.9551489 (499)	total: 5.97s	remaining: 14.3s
600:	learn: 4.8404025	test: 6.8178635	best: 6.8170571 (598)	total: 7.14s	remaining: 13.1s
700:	learn: 4.5429632	test: 6.7352540	best: 6.7352540 (700)	total: 8.29s	remaining: 11.8s
800:	learn: 4.3231677	test: 6.6710322	best: 6.6705001 (799)	total: 9.76s	remaining: 10.9s
900:	learn: 4.1306358	test: 6.6388674	best: 6.6384745 (899)	total: 12s	remaining: 10.6s
1000:	learn: 3.9818831	test: 6.6174801	best: 6.6171812 (996)	total: 14s	remaining: 9.76s
1100:	learn:

<catboost.core.CatBoostRegressor at 0x7a7d187b3ca0>

In [15]:
y_pred = model.predict(X_test)

In [16]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
custom = max(1 - rmse/30, 0) ** 4

print(f'RMSE: {rmse:.4f}')
print(f'R²: {r2:.4f}')
print(f'MAE: {mae:.4f}')
print(f'Custom: {custom:.4f}')

feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
print(feature_importance.sort_values('importance', ascending=False))

RMSE: 6.5837
R²: 0.9115
MAE: 4.2347
Custom: 0.3712
                          feature  importance
1225                   num_points   19.117246
128   x:55.57-55.59|y:37.70-37.72    3.305264
824   x:55.82-55.83|y:37.63-37.65    3.031304
1026  x:55.89-55.90|y:37.48-37.50    2.858889
536   x:55.72-55.73|y:37.48-37.50    2.750096
...                           ...         ...
456   x:55.70-55.71|y:37.30-37.32    0.000000
459   x:55.70-55.71|y:37.35-37.37    0.000000
460   x:55.70-55.71|y:37.37-37.39    0.000000
462   x:55.70-55.71|y:37.41-37.43    0.000000
619   x:55.74-55.76|y:37.72-37.74    0.000000

[1238 rows x 2 columns]




In [17]:
model.save_model('mediawise-model.cbm')