### Примечание

Исходные трейн и тест лежат в папке ./data под названиями init_train.csv, init_test.csv

Абсолютно не гарантируется воспроизводимость внешних данных, поэтому прикладываются исходные данные и обработанные данные в папках ./data/external и ./data/prep соответственно.

Также в папке ./data/external лежит файл README, в котором продублирована информация про источники внешних данных без подробностей (некоторые из тех, что лежат там, уже не используются)

# Создание вспомогательных файлов

In [28]:
import numpy as np, pandas as pd
import re, json, gc, requests, time
import urllib.request

from collections import Counter, defaultdict
from tqdm import tqdm_notebook
from bs4 import BeautifulSoup
from yandex_geocoder import Client

### Википедия. Список городов России

Вручную скопировал по ссылке https://ru.wikipedia.org/wiki/Список_городов_России табличку с городами в файл "wiki_cities.txt".

Выделяются город, регион, федеральный округ, население и год (столбец `year`).

In [2]:
# Read the manually copied Wikipedia table. The context manager closes the
# file handle, which the bare open() call used to leak.
with open('./data/external/wiki_cities.txt') as f:
    text = f.readlines()

def preprocess(text):
    """Turn one tab-separated table row into a list of fields.

    The row is lower-cased and split on tabs. The first two fields are
    ignored, every field between them and the last two loses its final
    character, and the last two fields are reduced to the first run of
    digits they contain, converted to int.

    Raises IndexError if either of the last two fields contains no digits.
    """
    splitted = text.lower().split('\t')
    names = [el[:-1] for el in splitted[2:-2]]
    # Raw strings keep `\d` a regex escape instead of an invalid string escape.
    second_last_number = int(re.findall(r'\d+', splitted[-2])[0])
    last_number = int(re.findall(r'\d+', splitted[-1])[0])
    return names + [second_last_number, last_number]

# The last two lines of the pasted text are not parsed.
wiki_rows = [preprocess(row_text) for row_text in text[:-2]]
wiki_columns = ['town', 'region', 'federal_district', 'population', 'year']
df = pd.DataFrame(wiki_rows, columns=wiki_columns)

df.to_csv("./data/prep/wiki_cities.csv", index=False)

### Метро

+ metro_moscow.txt --- скопировано с сайта http://www.lovrikinfo.ru/metrogps.php
+ metro_peter.txt --- скопировано с http://quantron-systems.com/ru/article/89
+ metro_russia.json --- скачано с https://github.com/hhru/api/blob/master/docs/metro.md

In [3]:
# Moscow

# Tab-separated "station, lat, lon" lines. The context manager closes the
# file handle that the bare open() call left dangling.
with open('./data/external/metro_moscow.txt') as f:
    text = f.readlines()
df = pd.DataFrame([line.split('\t') for line in text], columns=['station', 'lat', 'lon'])
df[['lon', 'lat']] = df[['lon', 'lat']].astype(float)
df.to_csv("./data/prep/metro_moscow.csv", index=False)

In [4]:
# Saint Petersburg

# The context manager closes the file handle that the bare open() call leaked.
with open('./data/external/metro_peter.txt') as f:
    text = f.readlines()

# Small state machine over the pasted page: a line whose first two characters
# are digits opens a record (the most recently remembered ordinary line plus
# this line), the very next line closes it, and any other line is remembered
# as a candidate station name.
prev_line = ''
tmp = []
data = []
flg = 0  # 1 while the closing line of the current record is still expected
for line in text:
    if line == '\t\n':
        continue  # separator rows carry no data
    if line[:2].isdigit() and flg == 0:
        tmp += [prev_line, line]
        flg = 1
    elif flg == 1:
        tmp += [line]
        flg = 0
        data.append(tmp)
        tmp = []
    else:
        prev_line = line

df = pd.DataFrame(data, columns=['station', 'lat', 'lon'])
df[['lon', 'lat']] = df[['lon', 'lat']].astype(float)
# Drop the last character (the trailing newline left by readlines).
df['station'] = df['station'].map(lambda x: x[:-1])
df.to_csv("./data/prep/metro_peter.csv", index=False)

In [5]:
# Russia

# json.load reads straight from the handle, and the context manager closes
# it (the original left the file open).
with open('./data/external/metro_russia.json') as f:
    metro_data = json.load(f)

# Flatten town -> lines -> stations into one row per station.
station_coords = []
for town in metro_data:
    for line in town['lines']:
        for station in line['stations']:
            station_coords.append([town['name'], line['name'], station['name'], station['lat'], station['lng']])

df = pd.DataFrame(station_coords, columns=['town', 'line', 'station', 'lat', 'lon'])
df.to_csv("./data/prep/metro.csv", index=False)

### Избирательные участки

+ Файл: Russia_merged_uik_data_w_migration.xlsx
+ Скачан из: https://drive.google.com/drive/folders/1apRilLMPs02QL9dChWD0h001Yz9IPXLH
+ Для удобства переименован в: votes.xlsx

In [6]:
def parse_coords(text):
    """Parse a dict-like string with single-quoted keys into [lat, lon].

    Single quotes are swapped for double quotes so the text can be decoded
    as JSON; the values under 'lat' and 'lon' are returned unchanged.
    """
    as_json = re.sub("'", '"', text)
    parsed = json.loads(as_json)
    lat, lon = parsed['lat'], parsed['lon']
    return [lat, lon]

# Columns of the raw sheet that are not carried any further.
cols_to_drop = ['uik #', 'addrs', 'koib(1)/keg(2)/none(0)', 'votephone', 'url', 'uikpage', 'phone']
votes = pd.read_excel("./data/external/votes.xlsx")
votes = votes.drop(cols_to_drop, axis=1)


chosen_cols = ['votecoords', 'coords', 'voteaddress', 'address', 'region', 'location_type']
# Keep only rows where both coordinate fields and both address fields are filled.
complete_rows = votes[['votecoords', 'coords', 'voteaddress', 'address']].notnull().all(axis=1)
chosen_votes = votes.loc[complete_rows, chosen_cols]

# Each source row yields two records: prefix '' reads coords/address,
# prefix 'vote' reads votecoords/voteaddress.
coords = []
unique_votes = chosen_votes.drop_duplicates()
for idx, row in tqdm_notebook(unique_votes.iterrows(), leave=False):
    for word in ('', 'vote'):
        lat_lon = parse_coords(row[word + 'coords'])
        record = [row[word + 'address']] + lat_lon + [word, row['region'], row['location_type']]
        coords.append(record)

votes_info = pd.DataFrame(coords, columns=['address', 'lat', 'lon', 'type', 'region', 'location_type'])
# Records with an empty longitude string are discarded.
empty_lon = votes_info['lon'] == ''
votes_info.drop(votes_info.index[empty_lon], inplace=True)
votes_info.reset_index(drop=True, inplace=True)

# Coordinates are strings at this point; strip inner spaces before the cast.
for ctype in ('lon', 'lat'):
    votes_info[ctype] = votes_info[ctype].map(lambda x: re.sub(' ', '', x))

votes_info[['lon', 'lat']] = votes_info[['lon', 'lat']].astype(float)

# Voter totals are summed per 'voteaddress' and looked up by the record's address.
for voters_col in ('voters_in', 'voters_out'):
    per_address = votes.groupby('voteaddress')[voters_col].sum()
    votes_info[voters_col] = votes_info['address'].map(per_address)


# Alternatives used to pull a settlement name out of a lower-cased address.
# Each alternative has exactly one capturing group. Raw strings are used so
# that `\.` is a regex escape rather than an invalid string escape.
patterns = [
    r'город ([^,]+)', r'г\. ([^,]+)', r'г\.([^,]+)', r'село ([^,]+)', r'поселок ([^,]+)',
    r'п\. ([^,]+)', r'п\.([^,]+)', r'пос\.([^,]+)', r'с\. ([^,]+)', r'с\.([^,]+)',
    r'посёлок ([^,]+)', r'поселение ([^,]+)', r'деревня ([^,]+)', r'дер\. ([^,]+)',
    r'д\. ([^,]+)', r'аул ([^,]+)', r'город-курорт ([^,]+)', r'(северодвинск)',
    r', ([^,]+) городское поселение', r', ([^,]+) сельское поселение', r'дер\.([^,]+)',
    r'гор\.([^,]+)', r'городской округ ([^,]+)', r', ([^,]+) городской округ',
    r'пгт ([^,]+)', r'пгт\. ([^,]+)', r'д\.([^,]+)',
    r'закрытое административно-территориальное образование ([^,]+)',
    r'городское поселения ([^,]+)',
    r'станица ([^,]+)', r'хутор ([^,]+)',
    r'аал ([^,]+)',
    r'сумон ([^,]+)',
]

# One alternation, tried left to right at every position of the text.
pattern = '|'.join(patterns)

re_pattern = re.compile(pattern)

def get_town(text):
    """Return the settlement name captured by the first match of re_pattern.

    Only the first match in the text is considered. The value returned is
    the first non-empty capturing group of that match. np.nan is returned
    when nothing matches or every group is empty.
    """
    first_match = re.search(re_pattern, text)
    if first_match is None:
        return np.nan
    # Groups that did not take part in the match are reported as ''.
    for captured in first_match.groups(default=''):
        if captured != '':
            return captured
    return np.nan

def clean_town(text):
    """Strip everything up to and including the first usable 'город '.

    Returns the run of non-double-quote characters that follows 'город '.
    The text is returned unchanged when it holds no 'город ', or when
    nothing capturable follows it. The original raised IndexError in that
    second case.
    """
    match = re.search('город ([^"]+)', text)
    if match is None:
        return text
    return match.group(1)

def get_idx(x):
    """Return the leading number of an address as int, or np.nan.

    The number is only recognised when the string starts with digits that
    are immediately followed by a comma.
    """
    # Raw string: `\d` must reach the regex engine, not the string parser.
    leading = re.match(r'(\d+),', x)
    if leading is None:
        return np.nan
    return int(leading.group(1))
    
# Keep a single record per coordinate pair.
votes_info.drop_duplicates(['lat', 'lon'], inplace=True)
# Extract the settlement name from the lower-cased address (np.nan if none is found).
votes_info['town'] = votes_info['address'].map(lambda x: get_town(x.lower()))
# Drops every row with a NaN in ANY column, not only rows without a town.
# It must run before clean_town, which expects strings.
votes_info.dropna(inplace=True)
votes_info['town'] = votes_info['town'].map(clean_town)
# Added after dropna, so rows whose address has no leading number keep NaN here.
votes_info['idx'] = votes_info['address'].map(get_idx)
votes_info.reset_index(drop=True, inplace=True)

votes_info.to_csv("./data/prep/votes.csv", index=False)

# Release the large frames before the following cells run.
del votes
del votes_info

gc.collect()



28

### Население регионов

Файл Tabl-01-18.xls из архива за 2018 год по ссылке http://www.gks.ru/wps/wcm/connect/rosstat_main/rosstat/ru/statistics/publications/catalog/afc8ea004d56a39ab251f2bafc3a6fce.

Переименован в population_gks.xls

In [7]:
# Region spellings found in the statistics files -> spellings written to the
# prepared data. Keys are matched exactly, trailing spaces included.
pop2izbir = {
    'г. Москва': 'город Москва',
    'г.Санкт-Петербург ': 'город Санкт-Петербург',
    'Республика Татарстан': 'Республика Татарстан (Татарстан)',
    'Ямало-Ненецкий авт. округ': 'Ямало-Ненецкий автономный округ',
    'Республика Адыгея': 'Республика Адыгея (Адыгея)',
    'Архангельская область без Ненецкого авт. округа': 'Архангельская область',
    'Чувашская Республика': 'Чувашская Республика - Чувашия',
    'Тюменская область включая авт. округа  ': 'Тюменская область',
    'Республика Северная Осетия-Алания': 'Республика Северная Осетия - Алания',
    'Ханты-Мансийский авт. округ': 'Ханты-Мансийский автономный округ - Югра',
    'Республика Марий Эл ': 'Республика Марий Эл',
}


col_names = [
    'region',
    'population', 'city_population', 'country_population',
    'population17', 'city_population17', 'country_population17',
]
# reset_index turns the index produced by read_excel into the first column,
# which the positional renaming below labels 'region'.
df = pd.read_excel('./data/external/population_gks.xls', header=5)
df = df.reset_index()
df.columns = col_names
# Names without an entry in the mapping are left as they are.
df['region'] = df['region'].map(lambda name: pop2izbir.get(name, name))

df.to_csv("./data/prep/population_gks.csv", index=False)

### Данные по городам России
#### Anton Zhiyanov (nalgeon)'s github

Файл cities.csv --- скачан по ссылке https://gist.github.com/nalgeon/5307af065ff0e3bc97927c832fabe26b.

In [8]:
# Give the three Russian column headers English names; other columns keep theirs.
column_names = {'Регион': 'region', 'Город': 'town', 'Население': 'population'}
df = pd.read_csv("./data/external/cities.csv")
df = df.rename(columns=column_names)

df.to_csv("./data/prep/cities.csv", index=False)

### Плотность населения

Файл population_statdata.txt --- скопировано с http://www.statdata.ru/nasel_regions.

In [9]:
# The original chained `.replace(',', '.')` onto the frame. DataFrame.replace
# with plain strings only replaces cells whose entire value equals ',', so it
# never touched the decimal commas. That misleading no-op is dropped, and the
# commas are handled per column below.
df = pd.read_table('./data/external/population_statdata.txt')\
       .rename({'Субъект России': 'region'}, axis=1)

names = ['Плотн.насел., чел/км²', 'Население', 'Площадь,км²']
new_names = ['density', 'populus', 'area']

# Density uses a decimal comma; population and area contain whitespace
# inside the numbers.
df[names[0]] = df[names[0]].apply(lambda x: re.sub(',', '.', x)).astype(float)
df[names[1]] = df[names[1]].apply(lambda x: ''.join(x.split())).astype(int)
df[names[2]] = df[names[2]].apply(lambda x: ''.join(x.split())).astype(int)
# NOTE(review): the renaming is positional and assumes the file's columns come
# in the order idx, region, density, population, area, federal district.
# Confirm against the source file.
df.columns = ['idx', 'region'] + new_names + ['FO']

df.to_csv("./data/prep/population_statdata.csv", index=False, sep=';')

### Прожиточный минимум
Документ скачан с http://www.gks.ru/free_doc/new_site/population/urov/vpm/proj-min.html.

Переименован в fee_min.txt

In [10]:
# The context manager closes the file handle that the bare open() call leaked.
with open('./data/external/fee_min.txt') as f:
    text = f.readlines()

# Every record occupies six consecutive lines; an incomplete trailing group
# is ignored (integer division instead of int(len / 6.)).
fee_columns = ['fee_all', 'fee_work', 'fee_pension', 'fee_child']
records = [text[6 * i: 6 * (i + 1)] for i in range(len(text) // 6)]
df = pd.DataFrame(records, columns=['region', 'period'] + fee_columns)

# Drop the last character (the newline left by readlines), then unify region names.
df['region'] = df['region'].apply(lambda x: x[:-1])
df['region'] = df['region'].apply(lambda x: pop2izbir[x] if x in pop2izbir else x)
df[fee_columns] = df[fee_columns].astype(int)

df.to_csv("./data/prep/fee_min.csv", index=False)

### Средняя зарплата по регионам

fee_avg.txt --- скопировано с https://visasam.ru/russia/rabotavrf/zarplaty-v-rossii.html 

In [11]:
# Only the first line of the file is used. The context manager closes the
# handle that the bare open() call leaked.
with open('./data/external/fee_avg.txt') as f:
    text = f.readlines()[0]

# Each entry looks like "<word> <digits> <digits>". The two digit groups are
# glued with a dot so the pair parses as a single float.
# Raw strings keep `\w` / `\d` regex escapes rather than invalid string escapes.
text = re.findall(r'(\w+ \d+ \d+)', text)
text = [re.sub(r'(\d+) (\d+)', r'\1.\2', line).split() for line in text]

stats = pd.DataFrame(text, columns=['town', 'fee'])
stats['town'] = stats['town'].map(lambda x: x.lower())
stats['fee'] = stats['fee'].astype(float)

stats.to_csv("./data/prep/fee_avg.csv", index=False)

### Заработные платы в социальной и научной сферах
Скачано с http://www.gks.ru/free_doc/new_site/population/trud/itog_monitor/itog-monitor05-18.html.

Переименован в fee_work.xlsx.

In [12]:
fees = pd.read_excel('./data/external/fee_work.xlsx', header=2, nrows=99)
# The '…1)' placeholder becomes NaN. The index is then moved into a column
# named 'region', and '-' is blanked afterwards so that column is covered too.
fees = fees.replace('…1)', np.nan)
fees = fees.reset_index().rename(columns={'index': 'region'})
fees = fees.replace('-', np.nan)
# Remove the service rows of the sheet.
service_rows = fees['region'].isin({'А', 'в том числе:'})
fees.drop(fees.index[service_rows], axis=0, inplace=True)
# Names without an entry in the mapping are left as they are.
fees['region'] = fees['region'].map(lambda name: pop2izbir.get(name, name))

fees.to_csv("./data/prep/fee_work.csv", index=False)

### Банкоматы Росбанка, p2p.

Файл bank_rosbank_p2p.xls: скачан по ссылке https://www.rosbank.ru/files/p2p/P2Pb.xls.

Нахождение координат по адресам с помощью яндекс геокодера.

In [15]:
df = pd.read_excel('./data/external/bank_rosbank_p2p.xls')
df = df.rename({'Адрес': 'address'}, axis=1)

latitudes, longitudes = [], []

# Best-effort geocoding: any failure for an address is stored as NaN
# coordinates, so both lists stay aligned with the rows of df.
for address in tqdm_notebook(df['address'].values, leave=False):
    try:
        long, lat = Client.coordinates(address)
    except Exception:
        long, lat = np.nan, np.nan
    latitudes.append(lat)
    longitudes.append(long)

df['lat'] = latitudes
df['long'] = longitudes
for coord_col in ('lat', 'long'):
    df[coord_col] = df[coord_col].astype(float)

df.to_csv("./data/prep/bank_rosbank_p2p.csv", index=False)




### Банкоматы Сбербанка

bank_sberbank.xlsx - файл скинули в ODS канале контеста, также можно найти на официальном сайте сбербанка.

Долго работает.

In [20]:
# `encoding` is not a read_excel parameter: xlsx is a binary container, and
# current pandas raises TypeError for the unknown keyword, so it is dropped.
sber = pd.read_excel('./data/external/bank_sberbank.xlsx')\
         .rename({'Название населенного пункта': 'town'}, axis=1)

# Build one geocoder query per row from every column but the first,
# skipping NaN cells (el == el is False only for NaN) and the literal
# 'Банкомат'.
addr = []
for row in sber.values:
    addr.append(" ".join([str(el) for el in row[1:].tolist() if el == el and el != 'Банкомат']))

latitudes = []
longitudes = []

# Best-effort geocoding: a failed lookup is stored as NaN coordinates so the
# lists stay aligned with the rows of sber.
for address in tqdm_notebook(addr):
    try:
        long, lat = Client.coordinates(address)
        latitudes.append(lat)
        longitudes.append(long)
    except Exception:
        longitudes.append(np.nan)
        latitudes.append(np.nan)

sber['lat'] = latitudes
sber['lat'] = sber['lat'].astype(float)

sber['long'] = longitudes
sber['long'] = sber['long'].astype(float)

sber.to_csv('./data/prep/bank_sberbank.csv', index=False)

### Газпромбанк

Скачивание файла с банкоматами с сайта https://www.gazprombank.ru/ajax/ru/; файл сохраняется как ./data/prep/bank_gazprombank.xlsx.

In [26]:
# Download the ATM spreadsheet straight into the prepared-data folder.
url = 'https://www.gazprombank.ru/ajax/ru/atms_to_xlsx.php'
target_path = './data/prep/bank_gazprombank.xlsx'
urllib.request.urlretrieve(url, target_path)

('./data/prep/bank_gazprombank.xlsx',
 <http.client.HTTPMessage at 0x7fb6ed7ec6a0>)

### Россельхозбанк

Парсинг сайта https://www.rshb.ru/atms/moscow на предмет банкоматов.

In [29]:
page = requests.get('https://www.rshb.ru/atms/moscow')
soup = BeautifulSoup(page.content, 'html.parser')

# (branch code, region name) for every <span> carrying a data-region-id
# attribute. The name is cut out of the span's first child by fixed offsets.
regions = [(line['data-branch-code'], line.contents[0][14:-8])
           for line in soup.find('body').find_all('span') if 'data-region-id' in line.attrs]

# Collect the per-region frames and concatenate once. DataFrame.append in a
# loop copies the accumulated frame on every step and no longer exists in
# pandas 2.x.
region_frames = []

for region_codename, region_name in tqdm_notebook(regions, leave=False):
    params = {'branchCode': region_codename, 'locality': 'Все', 'type': 'atms.list'}
    f = requests.post('https://www.rshb.ru/ajax/get-data.php', data=params)
    raw_json = f.json()['atmItems']
    tmp_df = pd.DataFrame.from_dict(raw_json, orient='index')
    tmp_df['region'] = region_name

    region_frames.append(tmp_df)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df = pd.concat(region_frames, sort=False) if region_frames else pd.DataFrame()

df.to_csv("./data/prep/bank_rshb.csv", index=False)

### Райффайзенбанк

Парсинг сайта https://www.raiffeisen.ru/common/branch_atm/ на предмет банкоматов.

In [30]:
params = {
    'gz': 1,
    'filter': 'rub1',
    'act': 'listAtmP',
    'ready': 0
}

link = 'https://www.raiffeisen.ru/common/branch_atm/new_ajax.php'

# The endpoint is paged through the 'ready' offset, advanced by 6 per request.
# Paging stops once the parsed response is empty.
atms = []
soup_len = 1
while soup_len != 0:
    f = requests.get(link, params=params)
    soup = BeautifulSoup(f.content, 'html.parser')
    soup_len = len(soup)

    for atm in soup.find_all(name='div', attrs={'class': 'e-office-item'}):
        coords = atm.contents[3].contents[1].attrs['data-coords']
        address = atm.contents[3].contents[1].contents[0]
        # Must not be called `time`: that name is the imported module, and
        # rebinding it breaks `time.sleep` further down the notebook.
        work_time = atm.contents[3].contents[3].contents[0]
        atms.append([address, coords, work_time])

    params['ready'] += 6

    print(params['ready'], end='\r')

pd.DataFrame(atms, columns=['address', 'coords', 'time']).to_csv("./data/prep/bank_raif.csv", index=False)

1542

### Росбанк

Парсинг сайта https://www.rosbank.ru/ru/atms/ на предмет банкоматов.

rosbank_regions.txt создан копированием исходного кода страницы, вылезающей при выборе региона.

In [None]:
import requests, re
import numpy as np

from collections import Counter, defaultdict
from tqdm import tqdm_notebook
from bs4 import BeautifulSoup

# The context manager closes the file handle that the bare open() call leaked.
with open('./data/external/rosbank_regions.txt') as file:
    regions_txt = file.readlines()

# Region titles and their numeric ids are read from the lines containing
# 'city__body_list'. Titles are taken per '/li'-delimited chunk, ids from the
# whole line, and the two lists are paired up by position.
regions = []
ids = []
for line in regions_txt:
    if 'city__body_list' in line:
        regions += [re.findall('title="(.+)">', s)[0] for s in line.split('/li')[:-1]]
        # Raw string: `\?` and `\d` are regex escapes, not string escapes.
        ids += [int(el) for el in re.findall(r'\?region=(\d+)" title', line)]

region2idx = dict(zip(regions, ids))

def get_text(region_id, pagenumber):
    """Request one page of the Rosbank ATM list and return the response.

    The page number goes into the `page_25` query parameter; the region id
    is sent in the `regionrb` cookie.
    """
    url = "https://www.rosbank.ru/ru/atms/list.php?page_25={}".format(pagenumber)
    region_cookie = {'regionrb': str(region_id)}
    return requests.get(url, cookies=region_cookie)

def parse(page):
    """Extract the ATM rows from one list page.

    `page` must expose the raw HTML as `.content`. Returns a list of
    [bank, id, address, location, time, metro, cash] lists, where `metro`
    is itself a list of strings. The columns are collected independently
    and zipped, so the result is as long as the shortest column.
    """
    soup = BeautifulSoup(page.content, 'html.parser')

    def div_texts(css_class):
        # Text of every <div> carrying exactly this class attribute value.
        return [el.text for el in soup.find_all('div', class_=css_class)]

    banks = div_texts('address-logo')
    addresses = div_texts("address-title")
    locations = div_texts("address-type")
    # Named `hours` so the `time` module name is not shadowed.
    hours = div_texts("page-atm__table_col page-atm__table_col--time")
    cash = div_texts("page-atm__table_col page-atm__table_col--currency")
    metros = [[el.text for el in metro_list.find_all('div')]
              for metro_list in soup.find_all('ul', class_="address-metro")]
    # The id is the first run of digits in the map link (raw string for `\d`).
    ids = [re.findall(r'\d+', el.find('a').attrs['href'])[0]
           for el in soup.find_all('div', class_="address-map")]

    return [list(el) for el in zip(banks, ids, addresses, locations, hours, metros, cash)]


def _active_page_number(page):
    """Return the active pagination number as a string, or None if there is none."""
    for line in page.text.splitlines():
        if 'pagination-page active' in line:
            return re.findall(r'page active">(\d+)</a></li>', line)[0]
    return None


# Walk every region page by page. The first page is always parsed. Further
# pages are requested only when a pagination block exists, and the walk
# stops when the site reports page '1' again or no pagination at all.
# The original tested `real_pagenumber` before it was ever assigned when the
# first region had a single page (NameError), and repeated the same parsing
# code three times.
res = []
for region in tqdm_notebook(region2idx):
    print(region + '...')
    region_id = region2idx[region]

    pagenumber = 1
    page = get_text(region_id, pagenumber)
    real_pagenumber = _active_page_number(page)
    print(real_pagenumber if real_pagenumber is not None else 1, end='\r')
    res += [[region] + el for el in parse(page)]

    while real_pagenumber is not None:
        pagenumber += 1
        page = get_text(region_id, pagenumber)
        real_pagenumber = _active_page_number(page)
        print(real_pagenumber if real_pagenumber is not None else 1, end='\r')
        if real_pagenumber is None or real_pagenumber == '1':
            break
        res += [[region] + el for el in parse(page)]
            
rosbank_columns = ['region', 'bank', 'id', 'address', 'location', 'time', 'metro', 'cash']
df = pd.DataFrame(res, columns=rosbank_columns)

latitudes, longitudes = [], []

# Best-effort geocoding: a failed lookup is stored as NaN coordinates so the
# lists stay aligned with the rows of df.
addr = df['address'].values
for address in tqdm_notebook(addr):
    try:
        long, lat = Client.coordinates(address)
    except Exception:
        long, lat = np.nan, np.nan
    latitudes.append(lat)
    longitudes.append(long)

df['lat'] = latitudes
df['long'] = longitudes
for coord_col in ('lat', 'long'):
    df[coord_col] = df[coord_col].astype(float)

# Collapse every run of whitespace in the bank name to a single space.
df['bank'] = df['bank'].apply(lambda name: " ".join(name.split()))

df.to_csv("./data/prep/bank_rosbank.csv", index=False)

### Сайт sravni.ru

In [None]:
def parse_addr(text):
    """Split an address on commas and return its first three parts.

    The first part is returned as is; the second and third lose their
    first character. Raises IndexError when there are fewer than three
    comma-separated parts.
    """
    pieces = text.split(',')
    first = pieces[0]
    second = pieces[1][1:]
    third = pieces[2][1:]
    return [first, second, third]

link = 'https://www.sravni.ru/banki/spisok-bankomatov/'
f = requests.get(link)

# (bank name, url slug) for every bank link on the index page. Lines that
# contain one of the excluded phrases are skipped.
excluded_words = {'Список банкоматов', 'Сбербанк России страница'}
banks = []
for line in f.text.splitlines():
    if '<a href="/banki/spisok-bankomatov' not in line:
        continue
    if any(word in line for word in excluded_words):
        continue
    bank_title = re.findall('>(.+)</a>', line)[0]
    bank_slug = re.findall('spisok-bankomatov-(.+)/">', line)[0]
    banks.append((bank_title, bank_slug))

In [None]:
def _parse_atm_page(lines):
    """Collect opening-hours and service lines from the page's text lines.

    Returns a list with zero, one or two elements: the hours list is
    appended once its closing line is seen, the services list once its
    closing line is seen.
    """
    result = []
    hours = []
    services = []

    # 0: looking for 'hoursAvailable'   1: reading hours
    # 2: looking for 'productSupported' 3: reading services
    state = 0
    for line in lines:
        if state == 0 and 'hoursAvailable' in line:
            state = 1
        elif state == 1:
            if line == '\t\t\t\t\t\t\t\t\t\t\t\t\t\t</div>':
                result.append(hours)
                state = 2
            else:
                hours.append(re.sub('\t|<div>|</div>', '', line))
        elif state == 2 and 'productSupported' in line:
            state = 3
        elif state == 3:
            if line == '\t\t\t\t\t\t\t\t\t\t</div>':
                result.append(services)
                break
            elif line != '\t\t\t\t\t\t\t\t\t\t<div class="text">':
                services.append(re.sub('\t|<span>|</span>', '', line))
    return result


def get_features(bank_url, atm_id):
    """Download one ATM page from sravni.ru and return its parsed features.

    The result is the list built by _parse_atm_page: [hours, services] when
    both sections are found, and shorter when a section is missing or is
    never closed.
    """
    link = "https://www.sravni.ru/bank/" + bank_url + "/bankomat/" + atm_id + '/'
    f = requests.get(link)
    return _parse_atm_page(f.text.splitlines())

In [None]:
# Re-bind `time` to the module: the retry below needs time.sleep, and the
# name may have been reassigned by an earlier cell.
import time

atms = []
# NOTE(review): banks[266:] skips the first 266 banks. This looks like a
# resume point left over from an interrupted run; confirm before re-running.
for bank_name, bank_url in tqdm_notebook(banks[266:], leave=False):
    print(bank_name + '...')
    link = "https://www.sravni.ru/banki/spisok-bankomatov-" + bank_url + "/"
    f = requests.get(link)
    counter = 0
    print(counter, end='\r')
    for line in f.text.splitlines():
        if 'href="/bank/' + bank_url + '/bankomat/' in line:
            counter += 1
            print(counter, end='\r')
            # Raw string: `\d` is a regex escape, not a string escape.
            atm_id = re.findall(r'/bankomat/(\d+)/">', line)[0]
            atm_address = re.sub('\t.+">|</a>', '', line)
            try:
                atm_features = get_features(bank_url, atm_id)
            except Exception:
                # One retry after a short pause; a second failure propagates.
                time.sleep(0.5)
                atm_features = get_features(bank_url, atm_id)
            atms.append([atm_id, bank_name, atm_address] + atm_features)

In [None]:
df = pd.DataFrame(atms, columns=['id', 'bank', 'address', 'time', 'service'])
# Town = first space-separated token of the lower-cased text before the first
# comma. The original assigned df['town'] three times in a row; only this
# last assignment ever took effect, so the two overwritten ones were removed.
df['town'] = df['address'].apply(lambda x: x.lower().split(',')[0].split(' ')[0])

In [None]:
from yandex_geocoder import Client

latitudes, longitudes = [], []

# Best-effort geocoding: a failed lookup is stored as NaN coordinates so the
# lists stay aligned with the rows of df.
for address in tqdm_notebook(df['address'].values):
    try:
        long, lat = Client.coordinates(address)
    except Exception:
        long, lat = np.nan, np.nan
    latitudes.append(lat)
    longitudes.append(long)

df['lat'] = latitudes
df['long'] = longitudes
for coord_col in ('lat', 'long'):
    df[coord_col] = df[coord_col].astype(float)

# Saving is disabled in the source; nothing is written by this cell.
#df.to_csv("./data/prep/bank_sravni_1.csv", index=False)

In [None]:
# Keep only the ATMs that have both coordinates.
# NOTE(review): in the visible code `atms` is a plain list at this point.
# This cell presumably expects a DataFrame loaded in between (e.g. the
# geocoded sravni table). Confirm where it comes from.
has_coords = atms[['lat', 'long']].notnull().all(axis=1)
atms = atms[has_coords].copy()
atms.reset_index(drop=True, inplace=True)

In [None]:
# Ordered (pattern, replacement) rewrite rules used by clean(). The order
# matters, because later rules rely on what earlier ones produced. Patterns
# are raw strings so `\d` and `\.` are regex escapes rather than invalid
# string escapes.
_CLEAN_RULES = (
    (r'час\.', ''),
    (r'(\d\d)\.(\d\d)\.', r'\1:\2'),
    (r'00(\d)', r'0\1'),
    (r'( \d+.\d+):', r'\1-'),
    ('ежедневно, ', 'ежедневно: '),
    ('кр([^у])', r'\1'),
    ('вск', 'вс'),
    (' -- |–| - ', '-'),
    # Applied twice on purpose, as in the source: the first pass can create
    # a new ' -- ' out of a dash that sat next to a hyphen.
    (' -- |–| - ', '-'),
    ('(пн-пт) ', r'\1: '),
    ('(сб-вс) ', r'\1: '),
    (' (сб) ', r' \1:'),
    (' (вс)', r' \1:'),
    (r'вс\.- ', 'вс: '),
    ('сб – |сб - |сб ', 'сб: '),
    (r'\.', ':'),
    (":0'", ":00'"),
    (r'с ([\d+|:]+) до ([\d+|:]+)', r'\1-\2'),
    (r' (\d\d)-', r' \1:00-'),
    (r' (\d)-', r' 0\1:00-'),
    (r' (\d:)', r' 0\1'),
    (r'-(\d\d) ', r'-\1:00 '),
    (r'-(\d\d),', r'-\1:00,'),
    (r"-(\d\d)'", r"-\1:00'"),
    ('24:00', '00:00'),
)


def clean(text):
    """Normalise an opening-hours string.

    The text is lower-cased and then rewritten by every rule of
    _CLEAN_RULES in order: day labels get a trailing ': ', dots become
    colons, bare hours become HH:00, and '24:00' becomes '00:00'.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_RULES:
        text = re.sub(pattern, replacement, text)
    return text

workdays = ['пн', 'вт', 'ср', 'чт', 'пт']
weekends = ['сб', 'вс']
weekdays = workdays + weekends


def get_time(text, name):
    """Return the number of open hours listed after '<name>: ' in text.

    Only the first occurrence of '<name>: ' followed by either a time
    expression or a lower-case Cyrillic word is used:

    * no occurrence       -> 0.
    * 'круглосуточно'     -> 24.
    * 'выходной'          -> 0.
    * anything else is split on '-' into a start and an end, and the
      `hours` component of (end - start) is returned.

    Raises ValueError when the value is neither of the two known words nor
    something that splits into at least two parts on '-'.
    """
    # Raw string for the character class: `\d`, `\-`, `\.` are regex escapes.
    tmp = re.findall(name + r": ([\d+|:|\-|\.]+)|" + name + ": ([а-я]+)", text)
    if len(tmp) == 0:
        return 0.
    # Each match is a pair of groups; exactly one of them is non-empty.
    for val in tmp[0]:
        if val != '':
            if val == 'круглосуточно':
                return 24.
            elif val == 'выходной':
                return 0.
            else:
                val1, val2 = val.split('-')[:2]
                return (pd.to_datetime(val2) - pd.to_datetime(val1)).components.hours
    
def f(text):
    """Map an opening-hours string to a {day: open hours} dict.

    The text is normalised with clean(). 'пн-вс' and 'ежедневно' set the
    same value for all seven days. Otherwise every day range found in the
    text sets its days, and explicit "'<day>:" entries override them.
    Returns np.nan when no day could be filled.

    Fixes relative to the original:
    * 'ср-вс' looked up the hours of 'сб-вс' and 'пн-сб' those of 'пн-чт'
      (copy-paste slips). Each range now reads its own label.
    * 'ежедневно' (daily) filled the five workdays only. It now fills the
      weekend as well, like 'пн-вс'.
    """
    text = clean(text)
    tmp = dict()
    if 'пн-вс' in text:
        t = get_time(text, 'пн-вс')
        for day in weekdays:
            tmp[day] = t
    elif 'ежедневно' in text:
        t = get_time(text, 'ежедневно')
        for day in weekdays:
            tmp[day] = t
    else:
        # Checked in this order; a later range overrides an earlier one
        # for the days they share.
        day_ranges = (
            ('пн-пт', workdays),
            ('сб-вс', weekends),
            ('ср-вс', ['ср', 'чт', 'пт'] + weekends),
            ('пн-вт', ['пн', 'вт']),
            ('пн-чт', ['пн', 'вт', 'ср', 'чт']),
            ('пт-вс', ['пт', 'сб', 'вс']),
            ('пн-сб', workdays + ['сб']),
        )
        for label, days in day_ranges:
            if label in text:
                t = get_time(text, label)
                for day in days:
                    tmp[day] = t

        # Single-day entries are recognised by a quote right before the day name.
        for day in workdays + weekends:
            if "'" + day + ":" in text:
                tmp[day] = get_time(text, day)
    if len(tmp) > 0:
        return tmp
    else:
        return np.nan
    
def analyze(x):
    """Summarise a {day: hours} dict into eight numbers.

    Returns [sum, mean, max, min] over the workdays followed by the same
    four statistics over the weekend. Days missing from the dict count as
    0. A NaN input gives eight zeros.
    """
    if x != x:  # only NaN is unequal to itself
        return [0] * 8

    summary = []
    for day_group in (workdays, weekends):
        hours = [x.get(day, 0.) for day in day_group]
        summary += [np.sum(hours), np.mean(hours), np.max(hours), np.min(hours)]
    return summary

In [None]:
# Eight schedule statistics per ATM; a missing (NaN) schedule gives zeros.
tmp = atms['time'].apply(lambda x: analyze(f(x)) if x == x else [0] * 8)

# work_time_sum ... work_time_min, then end_time_sum ... end_time_min.
atm_words = [prefix + '_time_' + word
             for prefix in ['work', 'end']
             for word in ['sum', 'mean', 'max', 'min']]

atms[atm_words] = pd.DataFrame(tmp.tolist())

atms.to_csv("./data/prep/bank_sravni_2.csv", index=False)