### Начиная отсюда, можно все запускать

Здесь происходит финальная предобработка фич.

В следующем ноутбуке будет старая предобработка, тоже используемая для финальной модели.

Результаты обеих предобработок на всякий случай уже сохранены и лежат в папке ./data/created_files

В целом один ноутбук предобработки занимает не более 20-30 минут

In [1]:
import numpy as np, pandas as pd
import re, pickle, logging, json, gc, utils

from math import sin, cos, sqrt, atan2, radians
from tqdm import tqdm_notebook
from collections import Counter, defaultdict

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

R = 6373.0  # Earth radius in kilometres


def distance(x, y):
    """Return the great-circle (haversine) distance between two points, in km.

    Parameters
    ----------
    x, y : sequence of two numbers
        ``(latitude, longitude)`` of each point, in degrees.

    Returns
    -------
    float
        Distance along the sphere of radius ``R``.
    """
    lat_a, long_a, lat_b, long_b = map(radians, [*x, *y])
    dlon = long_b - long_a
    dlat = lat_b - lat_a
    a = sin(0.5 * dlat)**2 + cos(lat_a) * cos(lat_b) * sin(0.5 * dlon)**2
    # Floating-point rounding can leave `a` a few ulps above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c


# Switch read by the holdout cell: when True, a fixed slice of rows is flagged as holdout.
HOLDOUT_MODE = False

In [2]:
# --- Settings for features built from the ATMs of the sample itself ---------
atm_k_values = []             # k values for the std of nearest-ATM distances (none enabled)
atm_rad_values = [1]          # radii used when counting ATMs around a point
atm_group_rad_values = [1]    # radii used for per-group ATM shares
metro_rad_values = [1.0]      # radii used when counting metro stations around a point

# --- Window sizes k for the "gap between consecutive neighbour distances" ---
# --- statistics (mean / std / max over the first k gaps), one list per source.
metro_diff_k_values = [10, 50]
sber_diff_k_values = [10, 100]
atm_diff_k_values = [15, 100]
city_diff_k_values = [10, 100]
rosbank_diff_k_values = [10, 100]
sravni_diff_k_values = [10, 100]
rshb_diff_k_values = [10, 50]
raif_diff_k_values = [10, 50]
gazprom_diff_k_values = [10, 50]
partners_diff_k_values = [10, 50]

In [3]:
# Send INFO-level progress messages to the notebook's log file.
log_settings = {
    'filename': './data/logs/logfile.log',
    'level': logging.INFO,
    'format': '%(asctime)s %(message)s',
    'datefmt': '%I:%M:%S. ',
}
logging.basicConfig(**log_settings)
logging.info('Reading data.csv')

# Main table that every later cell extends with feature columns.
data = pd.read_csv("./data/prep/data.csv")
# Distinct values of the `group` column; reused for per-group features.
groups = data['group'].unique()

In [4]:
# Holdout mode: flag a fixed block of rows as holdout and take them out of training.
if HOLDOUT_MODE:
    holdout = range(5009, 6261)

    n_rows = data.shape[0]
    # Positional membership test: row number `position` is holdout iff it lies in the range.
    data['isHoldout'] = [position in holdout for position in range(n_rows)]
    data.loc[holdout, 'isTrain'] = False

### Избирательные участки

+ Расстояние до ближайшего участка
+ Широта и долгота ближайшего участка
+ Для типа, региона, города, типа объекта, и адреса ближайшего участка считаются признаки:
  + Количество объектов в выборке с такой же категорией (с таким же городом, регионом и т.п.)
  + Количество объектов с такой же категорией и такой же группой, как и у объекта, для которого считается признак
  + отношение второй величины к первой, то есть доля объектов с такой же группой среди всех объектов данной категории

In [5]:
%%time

# Polling-station features: attach the nearest station from votes.csv to every row.
votes = pd.read_csv("./data/prep/votes.csv")
coords = votes[['lat', 'lon']].values

neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)

query_points = data[['lat', 'long']].values
distances, indexes = neigh.kneighbors(query_points, 2, return_distance=True)
nearest = indexes[:, 0]  # row position of the closest station (the second neighbour is not used)

PREFIX = 'izbir'

data[PREFIX + '_dist'] = distances[:, 0]

# Copy attributes of the closest station.
copied_fields = ['lat', 'lon', 'type', 'region', 'location_type', 'town',
                 'voters_in', 'voters_out', 'idx', 'address']
for name in copied_fields:
    data[PREFIX + '_' + name] = votes.loc[nearest, name].values

# For each categorical attribute: how many rows share the value, how many of those
# also share the row's group, and the ratio of the two.
for name in ['type', 'region', 'location_type', 'town', 'address']:
    col = PREFIX + '_' + name
    value_sizes = data.groupby(col).size()
    pair_sizes = data.groupby([col, 'group']).size()

    data[col + '_amount'] = data[col].map(value_sizes)
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values
    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

# Per-town number of rows belonging to each individual group.
for name in ['town']:
    col = PREFIX + '_' + name
    for group in groups:
        hits_per_value = data.groupby(col)['group'].apply(lambda s: (s == group).sum())
        data[col + '_group_' + str(group) + '_amount'] = data[col].map(hits_per_value)

CPU times: user 51.4 s, sys: 525 ms, total: 51.9 s
Wall time: 51.7 s


### Признаки, связанные с наиболее близким городом по таблице cities.csv

+ Расстояние до ближайшего города (фактически до центра города)
+ Широта и долгота данного города
+ Население города
+ Широта и долгота ближайшего участка
+ Для города, региона, типа региона, района и часового пояса ближайшего города считаются признаки:
  + Количество объектов в выборке с такой же категорией (с таким же городом, регионом и т.п.)
  + Количество объектов с такой же категорией и такой же группой, как и у объекта, для которого считается признак
  + отношение второй величины к первой, то есть доля объектов с такой же группой среди всех объектов данной категории
+ Расстояние до Москвы и Питера
+ Количество объектов каждой из групп в данном городе

In [6]:
%%time

# Nearest-city features from cities.csv.
city_renames = {
    'Широта': 'lat',
    'Долгота': 'lon',
    'Тип региона': 'region_type',
    'Район': 'district',
    'Часовой пояс': 'time'
}

logging.info('creating cities.csv features')

df = pd.read_csv("./data/prep/cities.csv").rename(city_renames, axis=1)

coords = df[['lat', 'lon']].values
# NOTE(review): no custom metric is passed here (the other sources use `distance`),
# so `city_dist` and the gap statistics below are computed on raw degrees, not km.
# Left unchanged to keep the existing feature values.
neigh = NearestNeighbors()
neigh.fit(coords)

distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, n_neighbors=100, return_distance=True)

PREFIX = 'city'

data[PREFIX + '_dist'] = distances[:, 0]
# The population cell is text; keep the first run of digits found in it.
data['city_population'] = df.loc[indexes[:, 0], 'population']\
                            .apply(lambda x: re.findall(r'\d+', x)[0]).astype(int).values

for name in ['lat', 'lon', 'region', 'town', 'region_type', 'district', 'time']:
    data[PREFIX + '_' + name] = df.loc[indexes[:, 0], name].values

# `x == x` is False only for NaN, so missing towns are passed through untouched.
data['city_town'] = data['city_town'].map(lambda x: x.lower() if x == x else x)

# Federal cities: give their rows an explicit town label and use the long region name.
# The original statements `data.loc[mask] == '...'` were comparisons whose result was
# discarded (no-ops); assigning the lower-cased town name is the presumed intent.
# Column assignment replaces `Series.replace(..., inplace=True)`, which does not
# update `data` under pandas Copy-on-Write.
federal_cities = [
    ('Москва', 'москва', 'город Москва'),
    ('Санкт-Петербург', 'санкт-петербург', 'город Санкт-Петербург'),
]
for short_region, town_label, full_region in federal_cities:
    data.loc[data['city_region'] == short_region, 'city_town'] = town_label
    data['city_region'] = data['city_region'].replace({short_region: full_region})

# For each categorical attribute: rows sharing the value, rows sharing value and group,
# and the ratio of the two.
for name in ['town', 'region', 'region_type', 'district', 'time']:

    data[PREFIX + '_' + name + '_amount'] = data[PREFIX + '_' + name].map(data.groupby(PREFIX + '_' + name).size())

    tmp = data.groupby([PREFIX + '_' + name, 'group']).size()
    data[PREFIX + '_' + name + '_group_amount'] = \
        data.apply(lambda x: tmp.get((x[PREFIX + '_' + name], x['group']), 0.), axis=1).values

    data[PREFIX + '_' + name + '_rel'] = data[PREFIX + '_' + name + '_group_amount'] / \
            data[PREFIX + '_' + name + '_amount']

# Per-town number of rows belonging to each individual group.
for name in ['town']:
    for group in groups:
        data[PREFIX + '_' + name + '_group_' + str(group) + '_amount'] = data[PREFIX + '_' + name] \
                            .map(data.groupby(PREFIX + '_' + name)['group'] \
                            .apply(lambda x: (x == group).sum()))

# Gaps between consecutive neighbour distances: the first gap, then mean/std/max
# over the first k gaps for every configured k.
res = []
for dists in distances:
    diffs = np.diff(dists)
    info = [diffs[0]]
    for k in city_diff_k_values:
        info += [diffs[:k].mean(), diffs[:k].std(), diffs[:k].max()]
    res.append(info)

city_diff_features = [PREFIX + '_min_diff']
for k in city_diff_k_values:
    city_diff_features += [PREFIX + '_dist_' + str(k) + '_' + word for word in ['mean', 'std', 'max']]
data[city_diff_features] = pd.DataFrame(res)

# (lat, long) reference points for the two distance-to-city features.
important_coords = {
    'moscow': [55.753879, 37.620373],
    'peter': [59.939125, 30.315822]
}

for city in important_coords:
    data['dist2' + city] = data[['lat', 'long']]\
                    .apply(lambda x: distance(important_coords[city], [x['lat'], x['long']]), axis=1)

CPU times: user 4.89 s, sys: 52 ms, total: 4.95 s
Wall time: 5.01 s


### Признаки, связанные с метро

+ Расстояние до ближайшей станции метро
+ Количество станций в радиусе одного км

In [7]:
%%time

logging.info('creating metro features')

# Station tables to process and, for each, the categorical columns to turn into counts.
metro_fields = {
    'metro': ['town', 'line', 'station'],
    'metro_moscow': ['station'],
    'metro_peter': ['station']
}

# Suffixes of the gap-statistic columns produced for every station table.
metro_new_features = ['min_diff']
for k in metro_diff_k_values:
    metro_new_features.extend('diff_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

for filename in metro_fields:
    df = pd.read_csv("./data/prep/" + filename + ".csv")
    coords = df[['lat', 'lon']].values
    neigh = NearestNeighbors(metric=distance)
    neigh.fit(coords)

    distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 50, return_distance=True)
    nearest = indexes[:, 0]  # row position of the closest station

    data[filename + '_dist'] = distances[:, 0]
    for axis_name in ['lat', 'lon']:
        data[filename + '_' + axis_name] = df.loc[nearest, axis_name].values

    PREFIX = filename
    for name in metro_fields[filename]:
        col = PREFIX + '_' + name

        data[col] = df.loc[nearest, name].values

        # Rows sharing the value, rows sharing value and group, and their ratio.
        data[col + '_amount'] = data[col].map(data.groupby(col).size())

        pair_sizes = data.groupby([col, 'group']).size()
        data[col + '_group_amount'] = \
            data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values

        data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

    # First gap between consecutive neighbour distances, then mean/std/max of the
    # first k gaps for every configured k.
    stats = []
    for _, (dists, _ids) in tqdm_notebook(enumerate(zip(distances, indexes)), leave=False):
        gaps = np.diff(dists)
        row_stats = [gaps[0]]
        for k in metro_diff_k_values:
            window = gaps[:k]
            row_stats += [window.mean(), window.std(), window.max()]
        stats.append(row_stats)
    data[[filename + '_' + word for word in metro_new_features]] = pd.DataFrame(stats)

    # Number of the 50 retrieved stations that lie closer than each radius.
    for rad in metro_rad_values:
        data[filename + '_dist_count_' + str(rad)] = (distances < rad).sum(axis=1)





CPU times: user 14.7 s, sys: 334 ms, total: 15.1 s
Wall time: 15 s


In [8]:
# Name of the `data` column used as the town key when mapping town-level lookup tables.
TOWN = 'izbir_town'

### Признаки по населению из вики

In [9]:
logging.info('creating wiki.csv features')

# One row per town (first occurrence wins).
df = pd.read_csv("./data/prep/wiki_cities.csv").drop_duplicates('town')

town_population = df.set_index('town')['population']
region_population = df.groupby('region')['population'].sum()

data['wiki_town_population'] = data[TOWN].map(town_population)
# Region names are lower-cased before the lookup into the per-region totals.
lowered_region = data['izbir_region'].map(lambda x: x.lower())
data['wiki_region_population'] = lowered_region.map(region_population)

### Признаки по населению из gks таблицы

+ Только, собственно, еще одно население региона

In [10]:
logging.info('creating population_gks features')

populus = pd.read_csv("./data/prep/population_gks.csv")
region_to_population = populus.set_index('region')['population']

# Region population looked up by the region of the nearest polling station.
data['gks_population'] = data['izbir_region'].map(region_to_population)

### Зарплаты

In [11]:
logging.info('creating fee-related features')

# fee min: log of each per-region `fee_*` column, mapped by the polling-station region
fee_min = pd.read_csv("./data/prep/fee_min.csv")
fee_min_by_region = fee_min.set_index('region')
for word in ['all', 'work', 'pension', 'child']:
    log_fee_min = fee_min_by_region['fee_' + word].map(np.log)
    data['fee_min_' + word] = data['izbir_region'].map(log_fee_min)
data['fee_min_work2all'] = data['fee_min_work'] / data['fee_min_all']

# fee work: log of every column after the first one; regions without a match get 0.
fee = pd.read_csv("./data/prep/fee_work.csv").replace({'г.Москва': 'город Москва'})
fee_names = fee.columns[1:]
fee_by_region = fee.set_index('region')
for field in fee_names:
    log_fee = fee_by_region[field].map(np.log)
    data['fee_' + field] = data['izbir_region'].map(log_fee).fillna(0.).astype(float)

### Генерация признаков

+ Количество банкоматов, имеющих такое сочетание группы + координат
+ Количество банкоматов, имеющих такие же координаты
+ Количество банкоматов, имеющих такой же ближайший <<избирательный город>>
+ Количество банкоматов с такой же группой
+ Количество банкоматов с таким же индексом избирательного адреса
+ Тип локации избирательного участка (школа, театр, и т.п.). OHE
+ Группа банкомата. OHE

In [12]:
PREFIX = 'initial'

# Counts keyed by the raw `coord` column: rows sharing the value, and rows sharing
# both the value and the group (no `_rel` ratio is produced for this key).
for name in ['coord']:
    out = PREFIX + '_' + name
    data[out + '_amount'] = data[name].map(data.groupby(name).size())

    pair_sizes = data.groupby([name, 'group']).size()
    data[out + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[name], row['group']), 0.), axis=1).values

# Number of rows in each group.
group_sizes = data.groupby('group').size()
data['group_amount'] = data['group'].map(group_sizes)

# For every group: number of rows of that group sharing the row's `coord`.
for name in ['coord']:
    out = PREFIX + '_' + name
    for group in groups:
        hits_per_value = data.groupby(name)['group'].apply(lambda s: (s == group).sum())
        data[out + '_group_' + str(group) + '_amount'] = data[name].map(hits_per_value)

### Признаки, связанные с банкоматами из выборки

+ Расстояние до ближайшего другого банкомата
+ Количество банкоматов в радиусах 0.05, 0.1, 0.3, 0.5, 1
+ Стандартное отклонение расстояний до самых близких 40 банкоматов
+ Доля банкоматов из той или иной группы в радиусах 0.1, 0.5, 1.

In [13]:
logging.info('creating atm related features')

# Index the sample's own coordinates and query it with the same points, so each
# row's neighbour list is drawn from the sample itself.
data_coords = data[['lat', 'long']].values
neigh = NearestNeighbors(metric=distance).fit(data_coords)

distances, indexes = neigh.kneighbors(data_coords, n_neighbors=100, return_distance=True)

In [14]:
# Build one positional feature row per ATM from its neighbour list (`distances`,
# `indexes`). The order of values appended to `info` must match the column names
# assembled in the following cell (`new_atm_features`).
res = []
for idx, (dists, ids) in tqdm_notebook(enumerate(zip(distances, indexes))):

    info = []

    curr_group = data.loc[idx, 'group']  # NOTE(review): assigned but not read in this cell
    # Drop the row's own entry from its neighbour list.
    curr_dists = dists[ids != idx]
    min_dist = curr_dists[0]
    curr_ids = ids[ids != idx]

    # All neighbours tied at the minimal distance.
    min_ids = curr_ids[curr_dists == min_dist]
    n = len(min_ids)
    info.append(n)  # -> atm_min_len
    cnt = data.loc[min_ids, 'group'].value_counts()
    # Share of each group among the tied nearest neighbours -> atm_min_freq_<group>
    group_counts = [cnt.get(group, 0) / n for group in groups]
    info += group_counts
    tmp = np.array(group_counts)
    # sum(p * log p) over non-zero shares, i.e. the entropy without the minus sign
    # -> group_entropy
    info += [(tmp[tmp != 0] * np.log(tmp[tmp != 0])).sum()]
    info.append(min_dist)  # -> atm_min_dist

    # Flag for another ATM at exactly zero distance -> atm_min_is_0
    if min_dist == 0:
        info.append(1)
    else:
        info.append(0)

    # Gaps between consecutive distances among the first k neighbours:
    # mean, max, std (in that order) -> atm_diff_<k>_{mean,max,std}
    for k in atm_diff_k_values:
        curr_k_dists = curr_dists[:k]
        diffs = [curr_k_dists[i + 1] - curr_k_dists[i] for i in range(len(curr_k_dists) - 1)]
        if len(diffs) > 0:
            info += [np.mean(diffs), np.max(diffs), np.std(diffs)]
        else:
            info += [0] * 3

    # Per group: number of neighbours of that group tied at the group's minimal
    # distance, and that distance; both 0 when the group is absent from the list.
    # -> atm_min_group_len_<group>, atm_min_group_dist_<group>
    for group in groups:
        # Boolean Series (labelled by curr_ids) used as a positional mask on the arrays.
        mask = data.loc[curr_ids, 'group'] == group

        if mask.sum() == 0:
            info += [0] * 2
        else:
            curr_group_ids = curr_ids[mask]
            curr_group_dists = curr_dists[mask]

            min_group_dist = curr_group_dists[0]
            min_group_ids = curr_group_ids[curr_group_dists == min_group_dist]
            group_n = len(min_group_ids)
            info.append(group_n)
            info.append(min_group_dist)

    res.append(info)




In [15]:
# Column names for the positional rows collected in `res`, in the order they were appended.
new_atm_features = ['atm_min_len']
new_atm_features.extend('atm_min_freq_' + str(group) for group in groups)
new_atm_features.extend(['group_entropy', 'atm_min_dist', 'atm_min_is_0'])
for k in atm_diff_k_values:
    new_atm_features.extend('atm_diff_' + str(k) + '_' + stat for stat in ('mean', 'max', 'std'))
for group in groups:
    new_atm_features.extend(['atm_min_group_len_' + str(group), 'atm_min_group_dist_' + str(group)])

data[new_atm_features] = pd.DataFrame(res)

# Std of the distances to neighbours 1..k-1 (column 0 is skipped).
for k in atm_k_values:
    data['atm_dist_std_' + str(k)] = distances[:, 1:k].std(axis=1)

# Number of retrieved neighbours closer than each radius.
for rad in atm_rad_values:
    data['atm_dist_count_' + str(rad)] = (distances < rad).sum(axis=1)

groups = data['group'].unique()

# Share of each group among the neighbours closer than `rad`.
for rad in tqdm_notebook(atm_group_rad_values):
    res = []
    for row_dists, row_ids in zip(distances, indexes):
        within = row_dists < rad
        cnt = Counter(data.loc[row_ids[within], 'group'].values)
        m = within.sum()
        res.append([cnt[el] / m for el in groups])

    data[[str(group) + str(rad) for group in groups]] = pd.DataFrame(res)
    




### OSM

In [16]:
logging.info('creating OSM related features')

# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open('./data/prep/osm_data_0.005.pickle', 'rb') as fin:
    osm_data = pickle.load(fin)

# Tag key -> tag values for which features are produced.
# The original literal listed 'amenity' twice; Python keeps the last value of a
# repeated key, so only ['bank', 'atm'] was ever used and the earlier list
# (pharmacy, waste_disposal, parking, fountain, bench, cafe, car_wash, library,
# fuel, bank, toilets, fast_food) was silently discarded. The effective value is
# kept here so the produced columns do not change; merge the two lists if those
# extra features are wanted.
osm_sections = {
    'highway': ['crossing', 'traffic_signals', 'bus_stop'],
    'crossing': ['uncontrolled', 'traffic_signals'],
    'barrier': ['gate', 'lift_gate', 'block', 'entrance'],
    'entrance': ['yes', 'main'],
    'power': ['tower'],
    'shop': ['convenience', 'supermarket', 'florist', 'hairdresser'],
    'amenity': ['bank', 'atm'],
    'traffic_calming': ['bump'],
    'railway': ['level_crossing'],
    'leisure': ['playground'],
    'access': ['private'],
    'natural': ['tree'],
    'historic': ['memorial'],
    'name': ['Росбанк', 'Сбербанк', 'Газпромбанк', 'ВТБ', 'Россельхозбанк', 'Магнит', 'Пятёрочка']
}

# For every coordinate key and every (section, subsection) pair append three values:
# number of matching nodes, node[2] of the first match, and the difference between
# node[2] of the second and first match. 1000 is the placeholder when a value is missing.
# NOTE(review): node[1] is used as the tag mapping and node[2] presumably is the
# distance to the node, with nodes ordered by it - confirm against the pickle producer.
feat = defaultdict(list)
for coord in tqdm_notebook(osm_data):
    for section in osm_sections:
        nodes = [node for node in osm_data[coord] if section in node[1]]

        for subsection in osm_sections[section]:
            curr_nodes = [node for node in nodes if node[1][section] == subsection]
            if len(curr_nodes) > 0:
                feat[coord] += [len(curr_nodes), curr_nodes[0][2]]
                if len(curr_nodes) > 1:
                    # Fixed: the original subtracted curr_nodes[1][2] from itself (always 0).
                    feat[coord] += [curr_nodes[1][2] - curr_nodes[0][2]]
                else:
                    feat[coord] += [1000]
            else:
                feat[coord] += [0, 1000, 1000]

# Column names in the same (section, subsection) order as the values above.
osm_features = []
for section in osm_sections:
    for subsection in osm_sections[section]:
        osm_features += [section + '_' + subsection + '_' + word for word in ['amount', 'min_dist', 'diff']]

data[osm_features] = pd.DataFrame(data['coord_idx'].map(feat).tolist())




### Сбербанк

In [17]:
logging.info('creating sberbank.csv features')

sberbank_renames = {
    'Субъект федерации': 'region',
    'Населенный пункт': 'type',
    'Улица': 'district',
    'Индекс': 'index'
}

df = pd.read_csv("./data/prep/bank_sberbank.csv").rename(sberbank_renames, axis=1)

# Keep only rows that have both coordinates.
missing_coords = df[['lat', 'long']].isnull().any(axis=1)
df = df[~missing_coords].copy().reset_index()
coords = df[['lat', 'long']].values

neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)
distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)
nearest = indexes[:, 0]

# Per row: number of zero-distance neighbours, the first gap between consecutive
# neighbour distances, then mean/std/max of the first k gaps for every configured k.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [(dists == 0.).sum(), gaps[0]]
    for k in sber_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

PREFIX = 'sberbank'

sberbank_new_features = [PREFIX + '_zerodist_amount', PREFIX + '_min_diff']
for k in sber_diff_k_values:
    sberbank_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

data[sberbank_new_features] = pd.DataFrame(res)

# Attributes of the closest record; `index` is stored as float.
data[PREFIX + '_dist'] = distances[:, 0]
for name in ['lat', 'long', 'index', 'town', 'region', 'type', 'district']:
    picked = df.loc[nearest, name]
    if name == 'index':
        picked = picked.astype(float)
    data[PREFIX + '_' + name] = picked.values

# Number of records per lower-cased town in the source table, looked up by TOWN.
tmp = df['town'].dropna().map(lambda x: x.lower()).value_counts()
data[PREFIX + '_town_amount_outer'] = data[TOWN].map(tmp)

# Rows sharing the value, rows sharing value and group, and their ratio.
for name in ['town', 'type', 'region', 'district']:
    col = PREFIX + '_' + name

    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1)

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

### rosbank p2p

In [18]:
logging.info('creating rosbank_p2p.csv features')

rosbank_p2p_renames = {
    'Город': 'town',
    'Регион': 'region',
    'Режим работы': 'regime',
}
df = pd.read_csv("./data/prep/bank_rosbank_p2p.csv").rename(rosbank_p2p_renames, axis=1)

# Keep only rows that have both coordinates; renumber so positions equal labels.
missing_coords = df[['lat', 'long']].isnull().any(axis=1)
df = df[~missing_coords].copy().reset_index(drop=True)
coords = df[['lat', 'long']].values

neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)
distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)
nearest = indexes[:, 0]

data['rosbank_p2p_dist'] = distances[:, 0]

PREFIX = 'rosbank_p2p'
# Attributes of the closest record.
for name in ['town', 'region', 'regime', 'address', 'lat', 'long']:
    data[PREFIX + '_' + name] = df.loc[nearest, name].values

# Rows sharing value and group, rows sharing the value, and their ratio.
for name in ['town', 'region', 'regime']:
    col = PREFIX + '_' + name
    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values
    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

rosbank_p2p_new_features = [PREFIX + '_min_diff']
for k in rosbank_diff_k_values:
    rosbank_p2p_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [gaps[0]]
    for k in rosbank_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

data[rosbank_p2p_new_features] = pd.DataFrame(res)

### sravni.ru

In [19]:
logging.info('creating sravni.ru features')

df = pd.read_csv("./data/prep/bank_sravni_2.csv")
# Keep only rows that have both coordinates; renumber so positions equal labels.
missing_coords = df[['lat', 'long']].isnull().any(axis=1)
df = df[~missing_coords].copy().reset_index(drop=True)

coords = df[['lat', 'long']].values

neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)

distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)
nearest = indexes[:, 0]

PREFIX = 'sravni'

# Attributes of the closest record.
data[PREFIX + '_dist'] = distances[:, 0]
for name in ['bank', 'town', 'lat', 'long', 'service', 'time']:
    data[PREFIX + '_' + name] = df.loc[nearest, name].values

# Rows sharing value and group, rows sharing the value, and their ratio.
for name in ['bank', 'town', 'service', 'time']:
    col = PREFIX + '_' + name
    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values

    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [gaps[0]]
    for k in sravni_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

sravni_new_features = [PREFIX + '_min_diff']
for k in sravni_diff_k_values:
    sravni_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

data[sravni_new_features] = pd.DataFrame(res)

### Россельхозбанк

In [20]:
logging.info('creating rosselhoz features')

PREFIX = 'rshb'

coord_renames = {'location_lat': 'lat', 'location_lng': 'long'}
df = pd.read_csv("./data/prep/bank_rshb.csv").rename(coord_renames, axis=1)
# Keep only rows that have both coordinates; renumber so positions equal labels.
missing_coords = df[['lat', 'long']].isnull().any(axis=1)
df = df[~missing_coords].copy().reset_index(drop=True)
coords = df[['lat', 'long']].values
neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)
distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)
nearest = indexes[:, 0]

data[PREFIX + '_dist'] = distances[:, 0]

# Attributes of the closest record.
for name in ['region', 'lat', 'long', 'access', 'currency', 'shedule']:
    data[PREFIX + '_' + name] = df.loc[nearest, name].values

# Rows sharing value and group, rows sharing the value, and their ratio.
for name in ['region', 'access', 'currency', 'shedule']:
    col = PREFIX + '_' + name
    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values

    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [gaps[0]]
    for k in rshb_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

rshb_new_features = [PREFIX + '_min_diff']
for k in rshb_diff_k_values:
    rshb_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

data[rshb_new_features] = pd.DataFrame(res)

### Райффайзенбанк

In [21]:
PREFIX = 'raif'
logging.info('creating raiffaizen features')

df = pd.read_csv("./data/prep/bank_raif.csv")
# `coords` holds a JSON pair per row; element 0 is stored as lat, element 1 as long.
parsed_coords = np.array([json.loads(raw) for raw in df['coords']])
df['lat'] = parsed_coords[:, 0]
df['long'] = parsed_coords[:, 1]
coords = df[['lat', 'long']].values
neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)
distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)
nearest = indexes[:, 0]

data[PREFIX + '_dist'] = distances[:, 0]

# Attributes of the closest record.
for name in ['time', 'lat', 'long']:
    data[PREFIX + '_' + name] = df.loc[nearest, name].values

# Rows sharing value and group, rows sharing the value, and their ratio.
for name in ['time']:
    col = PREFIX + '_' + name
    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values

    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [gaps[0]]
    for k in raif_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

raif_new_features = [PREFIX + '_min_diff']
for k in raif_diff_k_values:
    raif_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

data[raif_new_features] = pd.DataFrame(res)

### Газпромбанк

In [22]:
PREFIX = 'gazprom'
logging.info('creating gazprombank features')

# Spreadsheet headers -> short column names used below.
new_names = {
    'GPS-координаты: широта': 'lat',
    'GPS-координаты: долгота': 'long',
    'Регион - имя собств.': 'region',
    'Название населен. Пункта': 'town',
    'Территориальное образование (регион) - суффикс': 'region_type', 
    'Город, поселок, село, …': 'town_type',
    'Место расположения банкомата': 'loctype',
    'Тип  (ATM / PVN / INF)': 'atm_type',
    'Время работы / доступа': 'time', 
    'Visa / MC/ JCB / CUP': 'cards', 
    'Cash-IN': 'cash_in',
    'Обмен валюты': 'exchange',
    'Общий доступ': 'access',
    'Банк / Филиал': 'bank'
}

df = pd.read_excel('./data/prep/bank_gazprombank.xlsx', header=1).rename(new_names, axis=1)
# Keep only rows that have both coordinates. The index is renumbered because the
# neighbour search returns row POSITIONS in `coords`, while `df.loc` below looks up
# by LABEL; without the reset any dropped row would shift the lookup to wrong rows.
df = df[~df[['lat', 'long']].isnull().any(axis=1)].copy().reset_index(drop=True)


coords = df[['lat', 'long']].values
neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)
distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 100, return_distance=True)

data[PREFIX + '_dist'] = distances[:, 0]

# Attributes of the closest record.
for name in ['town', 'region', 'lat', 'long', 'region_type', 'town_type', 'loctype', 'atm_type',
            'cards', 'access']:
    data[ PREFIX + '_' + name] = df.loc[indexes[:, 0], name].values

# Rows sharing value and group, rows sharing the value, and their ratio.
for name in ['region', 'town', 'region_type', 'town_type', 'loctype', 'atm_type', 'cards', 'access']:
    tmp = data.groupby([PREFIX + '_' + name, 'group']).size()
    data[PREFIX + '_' + name + '_group_amount'] = \
        data.apply(lambda x: tmp.get((x[PREFIX + '_' + name], x['group']), 0.), axis=1).values

    data[PREFIX + '_' + name + '_amount'] = data[PREFIX + '_' + name].map(data.groupby(PREFIX + '_' + name).size())

    data[PREFIX + '_' + name + '_rel'] = data[PREFIX + '_' + name + '_group_amount'] / \
            data[PREFIX + '_' + name + '_amount']

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
res = []
for idx, (dists, ids) in enumerate(zip(distances, indexes)):
    info = []
    diffs = np.array([dists[i + 1] - dists[i] for i in range(len(dists) - 1)])
    info.append(diffs[0])
    for k in gazprom_diff_k_values:
        info += [diffs[:k].mean(), diffs[:k].std(), diffs[:k].max()]
    res.append(info)

gazprom_new_features = [PREFIX + '_min_diff']
for k in gazprom_diff_k_values:
    gazprom_new_features += [PREFIX + '_dist_' + str(k) + '_' + word for word in ['mean', 'std', 'max']]

data[gazprom_new_features] = pd.DataFrame(res)

### Росбанк и партнеры (сайт)

In [23]:
PREFIX = 'partners'
logging.info('creating partners features')

df = pd.read_csv("./data/prep/bank_rosbank.csv")
# Keep only rows that have both coordinates (the old index is kept as a column).
missing_coords = df[['lat', 'long']].isnull().any(axis=1)
df = df[~missing_coords].copy().reset_index()

coords = df[['lat', 'long']].values
neigh = NearestNeighbors(metric=distance)
neigh.fit(coords)

distances, indexes = neigh.kneighbors(data[['lat', 'long']].values, 10, return_distance=True)
nearest = indexes[:, 0]

data[PREFIX + '_dist'] = distances[:, 0]

### Attributes of the closest record plus their counts.

for name in ['bank', 'region', 'location']:
    col = PREFIX + '_' + name
    data[col] = df.loc[nearest, name].values

    # Rows sharing value and group, rows sharing the value, and their ratio.
    pair_sizes = data.groupby([col, 'group']).size()
    data[col + '_group_amount'] = \
        data.apply(lambda row: pair_sizes.get((row[col], row['group']), 0.), axis=1).values

    data[col + '_amount'] = data[col].map(data.groupby(col).size())

    data[col + '_rel'] = data[col + '_group_amount'] / data[col + '_amount']

# First gap between consecutive neighbour distances, then mean/std/max of the
# first k gaps for every configured k.
# NOTE(review): only 10 neighbours are retrieved (9 gaps), so every k >= 9 yields
# the same statistics - confirm whether more neighbours were intended.
res = []
for dists in distances:
    gaps = np.diff(dists)
    row_stats = [gaps[0]]
    for k in partners_diff_k_values:
        window = gaps[:k]
        row_stats += [window.mean(), window.std(), window.max()]
    res.append(row_stats)

partners_new_features = [PREFIX + '_min_diff']
for k in partners_diff_k_values:
    partners_new_features.extend(PREFIX + '_dist_' + str(k) + '_' + stat for stat in ('mean', 'std', 'max'))

data[partners_new_features] = pd.DataFrame(res)

### additional

In [24]:
# Group value -> bank name as it is spelled in the sravni table.
group2name = {
    496.5: 'Россельхозбанк',
    1022.0: 'Ак Барс',
    1942.0: 'Альфа-Банк',
    3185.5: 'Газпромбанк',
    5478.0: 'Уралсиб',
    8083.0: 'Росбанк',
}

atms = pd.read_csv("./data/prep/bank_sravni_2.csv")
data['group_bank'] = data['group'].map(group2name)

# Record counts in the sravni table per (town, bank), per town and per bank.
town_bank_sizes = atms.groupby(['town', 'bank']).size()
town_sizes = atms.groupby('town').size()
bank_sizes = atms.groupby('bank').size()

data['atms_town_group_bank_amount'] = data.apply(
    lambda row: town_bank_sizes.get((row['izbir_town'], row['group_bank']), 0.), axis=1)
data['izbir_atm_town'] = data['izbir_town'].map(town_sizes)
data['group_bank_amount'] = data['group_bank'].map(bank_sizes)

In [25]:
# `features` is the flat list of feature column names; `features_dict` holds
# the same names grouped under a key per data source.
features = []
features_dict = defaultdict(list)

# Polling-station ("izbir") features.
PREFIX = 'izbir'

_izbir_names = [PREFIX + '_' + word for word in ['dist', 'lat', 'lon', 'voters_in', 'idx']]
# 'izbir_voters_out' is currently not registered as a feature.

# '_group_amount' / '_amount' / '_rel' columns of every categorical attribute.
_izbir_names.extend(
    PREFIX + '_' + name + word
    for name in ['type', 'region', 'location_type', 'town', 'address']
    for word in ['_group_amount', '_amount', '_rel']
)

# One '<attribute>_group_<group>_amount' column per value in `groups`.
_izbir_names.extend(
    PREFIX + '_' + name + '_group_' + str(group) + '_amount'
    for name in ['town']
    for group in groups
)

features.extend(_izbir_names)
features_dict[PREFIX].extend(_izbir_names)

### 
PREFIX = 'city'

# Collect every city feature name once, then register the list both in the
# flat `features` list and under the 'city' key of `features_dict`.
_city_names = [PREFIX + '_' + word for word in ['dist', 'lat', 'lon', 'population']]

_city_names.extend(
    PREFIX + '_' + name + word
    for name in ['town', 'region', 'region_type', 'district', 'time']
    for word in ['_group_amount', '_amount', '_rel']
)

# One '<attribute>_group_<group>_amount' column per value in `groups`.
_city_names.extend(
    PREFIX + '_' + name + '_group_' + str(group) + '_amount'
    for name in ['town']
    for group in groups
)

_city_names.extend(city_diff_features)

# One 'dist2<city>' column for every entry of `important_coords`.
_city_names.extend('dist2' + city for city in important_coords)

features.extend(_city_names)
features_dict[PREFIX].extend(_city_names)
    
###
# Every key of `metro_fields` contributes its own set of columns; all of them
# are grouped under the single 'metro' key of `features_dict`.
for filename in metro_fields:
    _metro_names = [filename + '_' + word for word in ['dist', 'lat', 'lon']]

    _metro_names.extend(
        filename + '_' + name + word
        for name in metro_fields[filename]
        for word in ['_group_amount', '_amount', '_rel']
    )

    # One '<filename>_dist_count_<rad>' column per configured radius.
    _metro_names.extend(filename + '_dist_count_' + str(rad) for rad in metro_rad_values)

    _metro_names.extend(filename + '_' + word for word in metro_new_features)

    features.extend(_metro_names)
    features_dict['metro'].extend(_metro_names)
    
### initial
PREFIX = 'initial'

# All names collected here go to `features` and to features_dict['initial'].
_initial_names = ['id', 'loc_null', 'lat', 'long']

# Coordinate counts; the '_rel' variant is not registered.
_initial_names.extend(
    PREFIX + '_' + name + word
    for name in ['coord']
    for word in ['_amount', '_group_amount']
)

# One '<attribute>_group_<group>_amount' column per value in `groups`.
_initial_names.extend(
    PREFIX + '_' + name + '_group_' + str(group) + '_amount'
    for name in ['coord']
    for group in groups
)

# Questionable feature: it might be a leak.
_initial_names.append('group_amount')

# rus cities wiki, gks_table
_initial_names.extend(['gks_population', 'wiki_town_population', 'wiki_region_population'])

# fees
_initial_names.append('fee_min_work2all')

features.extend(_initial_names)
features_dict[PREFIX].extend(_initial_names)

# atm
# ATM feature names: the precomputed list first, then one column per
# configured k / radius value.
_atm_names = list(new_atm_features)
_atm_names.extend('atm_dist_std_' + str(k) for k in atm_k_values)
_atm_names.extend('atm_dist_count_' + str(rad) for rad in atm_rad_values)
# Per-group columns are named '<group><rad>' with no separator between parts.
_atm_names.extend(
    str(group) + str(rad)
    for rad in atm_group_rad_values
    for group in groups
)

features.extend(_atm_names)
features_dict['atm'].extend(_atm_names)

# OSM
features.extend(osm_features)
features_dict['osm'].extend(osm_features)

# Feature names of the bank data sources, described as a table.  Each entry:
#   (prefix, plain columns, categorical attributes, count suffixes, extra names)
# For every source the names are registered in this order:
#   '<prefix>_<column>', then '<prefix>_<attribute>_<suffix>', then the extras.
_bank_sources = [
    # sberbank xls
    ('sberbank',
     ['lat', 'long', 'index', 'dist', 'town_amount_outer'],
     ['town', 'type', 'region', 'district'],
     ['amount', 'group_amount', 'rel'],
     sberbank_new_features),
    # rosbank p2p
    ('rosbank_p2p',
     ['dist', 'lat', 'long'],
     ['town', 'region', 'regime'],
     ['amount', 'group_amount', 'rel'],
     rosbank_p2p_new_features),
    # sravni
    ('sravni',
     ['dist', 'lat', 'long'],
     ['bank', 'town', 'service', 'time'],
     ['amount', 'group_amount', 'rel'],
     sravni_new_features),
    # Rosselkhozbank (note the different suffix order)
    ('rshb',
     ['dist', 'lat', 'long'],
     ['region', 'access', 'currency', 'shedule'],
     ['group_amount', 'amount', 'rel'],
     rshb_new_features),
    # raiffaizen
    ('raif',
     ['dist', 'lat', 'long'],
     ['time'],
     ['amount', 'group_amount', 'rel'],
     raif_new_features),
    # gazprombank
    ('gazprom',
     ['dist', 'lat', 'long'],
     ['region', 'town', 'region_type', 'town_type', 'loctype', 'atm_type', 'cards', 'access'],
     ['amount', 'group_amount', 'rel'],
     gazprom_new_features),
]

# PREFIX is deliberately the loop variable: as in the sequential version, it
# holds the prefix of the last source once the loop has finished.
for PREFIX, _columns, _attributes, _suffixes, _extra_names in _bank_sources:
    _bank_names = [PREFIX + '_' + word for word in _columns]
    _bank_names.extend(
        PREFIX + '_' + name + '_' + word
        for name in _attributes
        for word in _suffixes
    )
    _bank_names.extend(_extra_names)

    features.extend(_bank_names)
    features_dict[PREFIX].extend(_bank_names)

# Partner feature names, followed by the three ATM-table columns; all of them
# are stored under the 'partners' key of `features_dict`.
PREFIX = 'partners'

_partner_names = [PREFIX + '_' + word for word in ['dist']]
_partner_names.extend(
    PREFIX + '_' + name + '_' + word
    for name in ['bank', 'region', 'location']
    for word in ['amount', 'group_amount', 'rel']
)
_partner_names.extend(partners_new_features)
_partner_names.extend(['atms_town_group_bank_amount', 'izbir_atm_town', 'group_bank_amount'])

features.extend(_partner_names)
features_dict[PREFIX].extend(_partner_names)

# Record the final size of the feature list in the log.
logging.info('Result: {} features'.format(len(features)))

In [26]:
# Choose the output files for the current mode (holdout runs get the "_HO"
# suffix), then write the table, the feature list and the per-source dict.
if HOLDOUT_MODE:
    _data_path = './data/prep_data_HO.csv'
    _features_path = './data/prep_features_HO.npy'
    _features_dict_path = './data/prep_features_dict_HO.npy'
else:
    _data_path = './data/prep_data.csv'
    _features_path = './data/prep_features.npy'
    _features_dict_path = './data/prep_features_dict.npy'

data.to_csv(_data_path, index=False)
np.save(_features_path, features)
# `features_dict` is not an array, so np.save stores it as a pickled object;
# reading it back is expected to need np.load(..., allow_pickle=True).
np.save(_features_dict_path, features_dict)