In [1]:
import os
import pandas as pd
import numpy as np
from functools import reduce


data = pd.read_pickle('final.pickle').reset_index()

In [2]:
from doublegis_api.api import Api2Gis

api = Api2Gis()
api.load()

In [3]:
from multiprocessing.pool import ThreadPool, Pool
from utility.distance import distance


def by_nearby_place(df):
    distances = {}
    saved_distances = {}
    for index, row in df.iterrows():
        lon, lat = float(row['longtitude']), float(row['latitude'])
        
        if (lon, lat) not in saved_distances:
            all_distances = [{'distance': distance(lat, lon, f.latitude, f.longitude), 'id': f.doublegis_id}
                             for f in api.filials]
            saved_distances[(lon, lat)] = all_distances
        else:
            all_distances = saved_distances[(lon, lat)]
        
        by_distances = np.array(list(map(lambda x: x['id'], 
                                         filter(lambda x: x['distance'] <= 100.0, all_distances))))
        
        address = row['address'].strip().split(', ')
        st, h = address[0], address[1]
        by_address = np.array([f.doublegis_id for f in api.filials 
                               if f.street_name == st and f.house == h])
        
        summary = np.unique(np.append(by_distances, by_address))
        
        def _summary_pairs(x):
            _summary_pair = []
            for x in summary:
                f = next(f for f in api.filials if f.doublegis_id == x)
                o = next(o for o in api.organizations if o.id == f.organization_id)
                _summary_pair.append((f, o))
            return _summary_pair
        distances[index] = list(map(_summary_pairs, summary))
        
    return distances


def by_nearby_place_parallel(df):
    all_distances = {}
    df.sort(['address'])
    chunk_size = df.shape[0] // 12
    pool = Pool(processes=12)
    results = [pool.apply_async(by_nearby_place, (df[chunk_size * i:chunk_size * (i + 1)],))
               for i in range(12)]
    for async in results:
        all_distances.update(async.get())
    return all_distances

In [4]:
from difflib import SequenceMatcher
from functools import reduce


def _by_name(df, by_address):
    for index, row in df.iterrows():
        if len(by_address[index]):
            yield (index, None)
        
        filtered_by_name = list(filter(lambda x: x[1].name == row['company'], by_address[index]))
        if len(filtered_by_name) > 0:
            yield (index, filtered_by_name[0])
            
        filtered_by_primary_name = list(filter(lambda x: row['company'].lower().find(x[1].name_primary.lower()) != -1, 
                                               by_address[index]))
        if len(filtered_by_primary_name) > 0:
            yield (index, filtered_by_primary_name[0])
            
        filtered_by_synonyms = list(filter(lambda x: row['company'].lower() in x[1].name_synonyms, 
                                           by_address[index]))
        if len(filtered_by_synonyms) > 0:
            yield(index, filtered_by_synonyms[0])
            
        with_similarity = list(map(lambda x: (x[0], x[1], SequenceMatcher(None, x[1].name.strip().lower(), 
                                                                          row['company'].strip().lower()).ratio()), 
                                   by_address[index]))
        max_similar = reduce(lambda acc, x: acc if acc[2] > x[2] else x, 
                             by_address[index], 
                             with_similarity[0])
        if max_similar[2] >= 80.0:
            yield(index, max_similar[0])
            
        print('Искомая компания не имеет соответствия: {0}'.format(row))
        yield(index, None)

In [5]:
by_address = by_nearby_place_parallel(data)



Process ForkPoolWorker-11:


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


Process ForkPoolWorker-10:


Process ForkPoolWorker-12:


Process ForkPoolWorker-1:


Process ForkPoolWorker-9:


Process ForkPoolWorker-2:


Process ForkPoolWorker-8:


Process ForkPoolWorker-7:


Process ForkPoolWorker-4:


Process ForkPoolWorker-5:


Process ForkPoolWorker-3:


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Process ForkPoolWorker-6:


Traceback (most recent call last):


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


Traceback (most recent call last):


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


Traceback (most recent call last):


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


Traceback (most recent call last):


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in _summary_pairs
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in _summary_pairs
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in _summary_pairs
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in <genexpr>
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in <genexpr>
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in <genexpr>
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


Traceback (most recent call last):


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


KeyboardInterrupt


KeyboardInterrupt


KeyboardInterrupt


KeyboardInterrupt


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


KeyboardInterrupt


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


KeyboardInterrupt


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in _summary_pairs
    o = next(o for o in api.organizations if o.id == f.organization_id)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt


KeyboardInterrupt


  File "<ipython-input-3-ed0d6f9c00ec>", line 32, in <genexpr>
    o = next(o for o in api.organizations if o.id == f.organization_id)


KeyboardInterrupt


  File "/usr/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))


KeyboardInterrupt


  File "<ipython-input-3-ed0d6f9c00ec>", line 35, in by_nearby_place
    distances[index] = list(map(_summary_pairs, summary))


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in _summary_pairs
    f = next(f for f in api.filials if f.doublegis_id == x)


  File "<ipython-input-3-ed0d6f9c00ec>", line 31, in <genexpr>
    f = next(f for f in api.filials if f.doublegis_id == x)


KeyboardInterrupt


KeyboardInterrupt


KeyboardInterrupt: 

In [43]:
df