In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import json
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from matplotlib.markers import MarkerStyle
from postal.parser import parse_address
from postal.parser import parse_address as libpostal_parser_address
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.interpolate import spline
from sklearn.cluster import DBSCAN
from collections import Counter
from transliterate import translit, get_available_language_codes
from enum import Enum

from rating.validator import union_addresses, extract_addresses, CityValidator, StructValidator
from rating.parser import InvalidAddressesLibpostalParser, InvalidAddressCityParser
from rating.ya_maps_client import YandexApiClient, Request, Response
from rating.address import Address
from rating.utils import get_city_

import requests 



fontsize = 18
figsize = (15,10)
plot_alpha = 0.3
linewidth = 4

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

In [3]:
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)
test = test.rename(columns={'atm_group': '0-bank'})
train = train.rename(columns={'atm_group': '0-bank'})
banks = np.uint(np.sort(test['0-bank'].unique() * 10))

cities = pd.read_csv(open('data/cities.csv'))

In [4]:
test = test.rename(columns={'lat': '6-lat', 'long': '6-long'})
train = train.rename(columns={'lat': '6-lat', 'long': '6-long'})

# Prepare

In [5]:
test['0-bank'] = np.uint(test['0-bank'] * 10)
train['0-bank'] = np.uint(train['0-bank'] * 10)
columns_for_drop = ['address', 'address_rus']

### Запоминаем индексы

In [6]:
train['is_train'] = True
test['is_train'] = False
test['index'] = test.index + 1
train['index'] = train.index + 1
train['train_index'] = train.index + 1
test['test_index'] = test.index + 1
train['test_index'] = 0
test['train_index'] = 0
train['index'] = train.index

### Создаём общие данные tran-test

In [7]:
data_all = train.append(test, sort=False)

In [8]:
data_all['address_fail'] = 0
data_all.at[data_all['address_rus'].isnull(), 'address_fail'] = 1
data_all['base_target'] = data_all['target']
data_all.at[(data_all['address_fail'] == 1) & (data_all['is_train']), 'target'] = 0

In [9]:
data_all['address_rus'] = data_all['address_rus'].fillna('')

In [10]:
y_all = data_all[['target', 'base_target', 'is_train', 'address_fail']]
y_all['index'] = data_all['index']
y_all['id'] = data_all['id']
data_all = data_all.drop(['target', 'base_target'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
counts = data_all.groupby('address_rus')['id'].count().reset_index().rename(columns={'id':'the_same_address_terminals'})
data_all = pd.merge(data_all, counts, how='left', on='address_rus')
data_all.at[data_all['address_fail'] == 1, 'the_same_address_terminals'] = 0

### Fillna for address_rus

In [12]:
data_all.at[data_all['address_fail'], 'address_rus'] = data_all['address_rus'].fillna("")

### Fillna for lat, long

In [13]:
data_all.at[:, ['6-lat', '6-long']] = data_all[['6-lat', '6-long']].fillna(0)

### Запоминаем ID-Index

In [14]:
data_all['id'] = data_all['id'].astype(np.uint)

In [15]:
data_all = data_all.rename(columns={'id': '7-id', 'index': '7-index'})
y_all = y_all.rename(columns={'id': '7-id', 'index': '7-index'})

### Создаём one-hot-encoding for bank

In [17]:
data_all['0-bank_base'] = data_all['0-bank']
data_all = pd.get_dummies(data_all, columns=['0-bank'], prefix='0-bank')

# Research

### Удаляем токены геообъектов (API). Russian

In [17]:
inds = data_all[data_all['address_fail'] == 1]['7-id']
addrs = data_all[data_all['address_fail'] == 1]['address'].values

reqs = []
for ind, addr in zip(inds, addrs):
    reqs.append(Request(addr, ind))

In [18]:
token = 'c5e55b33-6b43-4312-ab3e-0a5f33a9207a'
ya_map_client = YandexApiClient(token)

In [19]:
resps_en = ya_map_client.geocode(reqs)
resps_ru = ya_map_client.translit_geocode(reqs)

100%|██████████| 420/420 [01:58<00:00,  3.35it/s]
100%|██████████| 420/420 [01:49<00:00,  3.90it/s]


### Если были ошибки сервера, то здесь можно их поправить, указав порядковый номер

In [20]:
# index = [i for i, item in enumerate(resps_en) if item.index == 5087][0]
# obj = data_all[data_all['address_fail'].astype(bool)]['address'].values[index]
# resps_en[index] = Response(ya_map_client.geocode_i(obj), index)
# resps_ru[index] = Response(ya_map_client.translit_geocode_i(obj), index)

### Валидация адресов по домам и совпадениям

In [21]:
valid_addresses, invalid_addresses = extract_addresses(resps_en, resps_ru)

Valid: 189. Invalid: 231.


### Подготовка запросов на основе невалидных адресов

In [22]:
invalid_addresses_libpostal_parser = InvalidAddressesLibpostalParser()
reqs = invalid_addresses_libpostal_parser.parse(invalid_addresses)

### Повторные запросы к геокодеру

In [23]:
resps_en_2 = ya_map_client.geocode(reqs)
resps_ru_2 = ya_map_client.translit_geocode(reqs)

100%|██████████| 420/420 [00:59<00:00,  3.75it/s]
100%|██████████| 420/420 [00:59<00:00,  2.18it/s]


### Если были ошибки сервера, то здесь можно их поправить, указав порядковый номер

In [24]:
# index = [i for i, item in enumerate(resps_en_2) if item.index == 3614][0]
# obj = data_all[data_all['address_fail'].astype(bool)]['address'].values[index]
# resps_en_2[index] = Response(ya_map_client.geocode_i(obj), index)
# resps_ru_2[index] = Response(ya_map_client.translit_geocode_i(obj), index)

### Извлекаем валидные и невалидные адреса при помощи перестановки местами токенов

In [25]:
valid_addresses_2, invalid_addresses_2 = extract_addresses(resps_en_2, resps_ru_2)

Valid: 85. Invalid: 146.


### Объединяем их с уже имеющимися

In [26]:
valid_addresses_all = union_addresses(valid_addresses_2, valid_addresses)
invalid_addresses_all = invalid_addresses_2
print('Valid: %d. Invalid %d' % (len([el for el in valid_addresses_all if el is not None]), len([el for el in invalid_addresses_all if el is not None])))

Valid: 274. Invalid 146


### Смотрим на адреса, которые так и не распознались и отбираем те, у которых есть города

In [27]:
cityValidator = CityValidator()
cityValidator.validate(invalid_addresses_all)

************************************************************
Not detected addresses (these addresses will replace by ""):
************************************************************
NAB-CHELNINSKIJ NAB. CHELNY, ABB, 6B
BUL.SVOBOD SURGUT G., DOM. 2 KORP. 2
PR-KT. KOMSOMOL'SK TOMSK G., DOM. 13B
UL. SHIROKOVA V.F. KOSTROMA G.
NOVOSIBIRSK, POLZUNOVA, OB.50, 15
UL. SEVASTOPOL'S GUBKIN G., DOM. 101A
TURGENEVSKOE S., KRASHODAR, 27
MOSKOVSKIY UL. S.-PETERBURG, 188M
BUL. HMEL'NIC BELGOROD G., DOM. 137T PR-KT.
EKATERINBURG, UL. LUNAC, DOM. 128, LITERA A
EKATERINBURG, UL. LUNAC, DOM. 128, LITERA A
SH. GOSTIL PETERGOF G., DOM. 58, LITERA A
FAKE_ADDRESS_AISUD88ZUX89CUA0SKDPKAPOOKCPOZKXC90IAIS09I
KHANDYGA, J.KUDRIAVOGO, 24
PARATUNKA, MOLCHANOVA, 22
KRASNOARMEJSK. JOSHKAR-OLA, ABB, 111
NAROFOMINSKOE SH. KUBINKA, 10
VLADIVOSTOK, 100 LET VLADIVOSTOKU A, 51
TRAKTOVAYA UL. SREDNIY, 14B
UL. KALARASH SOCHI G., DOM. 111
MKR. PERVOMAISKII IRKUTSK G., DOM. 54
NAB. OBVO SANKT-PETERB, DOM. 120, LITERA 1
FAKE_A

In [28]:
valid_addresses_city = union_addresses(cityValidator.valid_addresses, valid_addresses_all)
invalid_addresses_city = cityValidator.invalid_addresses

print('Valid: %d. Invalid: %d.' % (len([el for el in valid_addresses_city if el is not None]), len([el for el in invalid_addresses_city if el is not None])))

Valid: 374. Invalid: 46.


In [29]:
invalid_address_city_parser = InvalidAddressCityParser()

In [30]:
reqs = invalid_address_city_parser.parse(cityValidator.valid_addresses)

### Повторные запросы к геокодеру

In [31]:
resps_en_3 = ya_map_client.geocode(reqs)
resps_ru_3 = ya_map_client.translit_geocode(reqs)

100%|██████████| 420/420 [00:25<00:00, 16.52it/s]
100%|██████████| 420/420 [00:21<00:00, 19.69it/s]


### Объединение в ответ

In [32]:
### extract_addresses without house_validator

addrs = [Address(item[0], item[1]) for item in zip(resps_en_3, resps_ru_3)]

struct_validator_2 = StructValidator()
struct_validator_2.validate(addrs, True)

valid_addresses_final = union_addresses(valid_addresses_all, struct_validator_2.valid_addresses)
invalid_addresses_final = cityValidator.invalid_addresses

valid = len([el for el in valid_addresses_final if el is not None])
invalid = len([el for el in invalid_addresses_final if el is not None])
print('Valid: %d. Invalid: %d.' % (valid, invalid))

Valid: 374. Invalid: 46.


### Запись ответа

In [33]:
vals = [(el.get_best_coords(), str(el.level), get_city_(el.sturct_ru) if get_city_(el.sturct_en) == '' else get_city_(el.sturct_en), el.index) if el is not None else None for i, el in enumerate(valid_addresses_final)]
invals = [(None, None, None, el.index) if el is not None else None for el in invalid_addresses_final]

In [34]:
res = []
for val, inval in zip(vals, invals):
    if val is None:
        res.append(inval)
    if inval is None:
        res.append(val)
    if inval is None and val is None:
        print('WTF-1?')
    if inval is not None and val is not None:
        print('WTF-2?')
        
with open('missed_coords.json', 'w') as outfile:
    json.dump(res, outfile)