In [None]:
import os
import pandas as pd
import numpy as np
from functools import reduce

In [66]:
import json
import sys
from multiprocessing.pool import Pool

import requests

from data.models import Filial, Organization
from doublegis_api.api import Api2Gis


def modded_request(session, allowed_errors, **kwargs):
    while True:
        try:
            r = session.get(**kwargs, timeout=3)
            if r.status_code in allowed_errors:
                return r
            else:
                print('Error in request: {0}'.format('Status code == ' + r.status_code),
                      file=sys.stderr)
                continue
        except Exception as e:
            # print('Error in request: {0}'.format(e), file=sys.stderr)
            continue
            

def update_filials_dates(ids, filials_ids, organizations_ids, verbose=False):
    new_filials = []
    new_organizations = []
    
    session = requests.Session()
    for i in ids:
        r = modded_request(session=session,
                           allowed_errors={200, 410, 404},
                           url='https://2gis.ru/spb/firm/{0}'.format(i))

        try:
            if r.status_code == 410:
                data = r.text.split('<script type="text/javascript">')[1]\
                             .split('</script>')[0]\
                             .split('stat[pr]\\":10}]":')[1]
                data = data[:data.find(',null,null]') + len(',null,null]')]
                company = json.loads(data)[0]['data']
                
                if 'region_id' not in company.keys() \
                        or int(company['region_id']) != 38:
                    continue
                
                if int(company['id'].split('_')[0]) in filials_ids:
                    continue
    
                f = Filial()
                f.doublegis_id = int(company['id'].split('_')[0])
                f.organization_id = int(company['org']['id'])
                f.building_id = int(company['address']['building_id'])
                f.street_name = company['address']['components'][0]['street']
                f.house = company['address']['components'][0]['number']
                f.address_synonyms.append(company['address_name'])
                f.longitude = float(company['point']['lon'])
                f.latitude = float(company['point']['lat'])
                f.created_at_json['2gis_appear_at'] = company['dates']['created_at'] if 'created_at' in company[
                    'dates'].keys() else ''
                f.updated_at_json['2gis_updated_at'] = company['dates']['updated_at'] if 'updated_at' in company[
                    'dates'].keys() else ''
                f.closed_at_json['2gis_removed_at'] = company['dates']['removed_at'] if 'removed_at' in company[
                    'dates'].keys() else ''

                filials_ids.append(f.doublegis_id)
                new_filials.append(f)
                
                if int(company['org']['id']) in organizations_ids:
                    continue
                o = Organization()
                o.id = int(company['org']['id'])
                o.name = company['name']
                o.name_primary = company['name_ex']['primary']
                o.name_extension = company['name_ex']['extension'] if 'extension' in company['name_ex'] else ''
                o.name_synonyms.append(o.name)
                o.name_synonyms.append(o.name.lower())
                o.name_synonyms.append(o.name_primary)
                o.name_synonyms.append(o.name_primary.lower())
                
                if 'rubrics' in company.keys():
                    o.main_rubrics['doublegis_rubrics_ids'] = np.unique(np.array([r['parent_id'] for r in company['rubrics']]))
                    o.sub_rubrics['doublegis_rubrics_ids'] = np.unique(np.array([r['id'] for r in company['rubrics']]))

                if 'contact_groups' in company.keys():
                    contacts = [c for group in company['contact_groups'] for c in group['contacts']]
                    o.contacts_json['email'] += [c['value'] for c in contacts if 'email' == c['type']]
                    o.contacts_json['phone'] += [c['value'] for c in contacts if 'phone' == c['type']]
                    o.contacts_json['other'] += [c['value'] for c in contacts if 'phone' != c['type'] and 'email' != c['type']]

                organizations_ids.append(o.id)
                new_organizations.append(o)
                
        except Exception as e:
            print('Error while company parse {0}. Data: {1}. Status code: {2}'.format(e, i, r.status_code))
            
    return new_filials, new_organizations
            

def update_filials_dates_parallel(workers, ids, verbose=False):
    all_filials = []
    all_organizations = []
    
    api = Api2Gis()
    api.load()
    filials_ids = list(map(lambda x: x.doublegis_id, api.filials))
    organizations_ids = list(map(lambda x: x.id, api.organizations))
    
    pool = Pool(processes=workers)

    chunk_size = len(ids) // workers
    results = [pool.apply_async(update_filials_dates, (ids[chunk_size * i:chunk_size * (i + 1)],
                                                       filials_ids,
                                                       organizations_ids,
                                                       verbose))
               for i in range(workers)]

    data = []
    for async in results:
        data.append(async.get())
        
    for fil, org in data:
        all_filials += fil
        all_organizations += org
        
    return np.unique(all_filials), np.unique(all_organizations)

In [67]:
with open('removed_organizations', 'rb') as f:
    ids = np.load(f)
ids.shape

(1524,)

In [68]:
fil, org = update_filials_dates_parallel(15, ids)
print(fil.shape)
print(org.shape)

Error while company parse 'building_id'. Data: 5348552839072078. Status code: 410


Error while company parse 'building_id'. Data: 5348552838614152. Status code: 410


Error while company parse 'building_id'. Data: 5348552838719768. Status code: 410


Error while company parse 'building_id'. Data: 5348552838667085. Status code: 410


Error while company parse 'building_id'. Data: 5348552839886943. Status code: 410


Error while company parse 'building_id'. Data: 5348552838629986. Status code: 410


Error while company parse 'building_id'. Data: 5348552838554480. Status code: 410


Error while company parse 'building_id'. Data: 5348552839012311. Status code: 410


Error while company parse 'building_id'. Data: 5348552838574221. Status code: 410


Error while company parse 'building_id'. Data: 5348552839381819. Status code: 410


Error while company parse 'building_id'. Data: 5348552838787480. Status code: 410


Error while company parse 'building_id'. Data: 5348552838787836. Status code: 410


Error while company parse 'building_id'. Data: 5348552838531183. Status code: 410


(1449,)
(911,)


In [69]:
api = Api2Gis()
api.load()

print('Filials before: {0}'.format(api.filials.shape[0]))
print('Orgs before: {0}'.format(api.organizations.shape[0]))
api.filials = np.unique(np.append(api.filials, fil))
api.organizations = np.unique(np.append(api.organizations, org))
print('Filials after: {0}'.format(api.filials.shape[0]))
print('Orgs after: {0}'.format(api.organizations.shape[0]))

Filials before: 154624
Orgs before: 103206


Filials after: 156073
Orgs after: 104117


In [70]:
api.save()