In [None]:
!pip install yargy
!pip install natasha

In [1]:
import yargy
import natasha
import pandas as pd
import numpy as np
import functools
import re
from yargy import Parser
from natasha import AddrExtractor
from natasha import MorphVocab
from natasha.syntax import print_markup
from IPython.display import display
from yargy import (
    rule,
    or_, and_
)
from yargy.interpretation import fact
from yargy.predicates import (
    eq, lte, gte, gram, type, tag,
    length_eq,
    in_, in_caseless, dictionary,
    normalized, caseless,
    is_title
)
from yargy.pipelines import morph_pipeline
from yargy.tokenizer import QUOTES

pd.options.display.max_colwidth = 100

###Константы

In [2]:
COUNTRY_KEY_WORDS = ("страна", "россия", "рф")

REGIONS_KEY_WORDS = ("республика", "респ", "рес", "округ", "область", "обл", "об",\
                     "край", "область", "ао", "автономный", "автономная")
REGIONS_KEY_WORDS_BEFORE = ("республика", "респ", "рес", "округ", "область", "обл", "об")
REGIONS_KEY_WORDS_AFTER = ("край", "область", "обл", "об", "ао", "автономный")
SPECIAL_REGIONS = ("краснодарский", "ростовская", "алтайский", "владимирская", \
                   "московская", "нижегородская", "приморский", "калининградская", \
                   "калужская", "кировская", "вологодская", "пензенская", "амурская", \
                   "ставропольский", "ленинградская", "воронежская", "орловская", \
                   "новгородская", "смоленская", "псковская", "ивановская", "волгоградская", \
                   "ярославская", "курганская", "липецкая", "курская", "самарская", "тамбовская", \
                   "тюменская", "омская", "рязанская", "магаданская")

TOWNS_KEY_WORDS = ("г", "гор", "город")
SPECIAL_TOWNS = ('октябрьский',)

SETTLEMENT_KEY_WORDS = ("село", "с", "сл", "деревня", "дер", "поселок", "посёлок", "п", "пос", "рп", "пгт", "ст")
SETTLEMENTS_TO_REMOVE = ("пушкина", "ленина", "новгород", "революции", "люксембург", \
                         "энтузиастов", "ермакова", "мира", "чапаев", "ряд", "улица", \
                         "кирова", "гагарина", "энгельса", "калинина", "саха", "")
COMPLEX_SETTLEMENTS_TO_REMOVE = ("большая дмитровка", "1 мая", "8 марта", "карла маркса",\
                    "красный путь", "розы люксембург", "степана разина", "красный октябрь", \
                    "льва толстого", "красная площадь", "красные зори", "большая покровская", \
                    "максима горького", "красная пресня")
SETTLEMENTS_TO_ADD = ("Зеленоград", "Винсады", "Вольск-18")

STREETS_KEY_WORDS = ("улица", "ул", "пр", "проспект", "пр-кт", "пр-т", "проезд", "пр-зд", "пр-д", \
              "переулок", "пер", "площадь", "пл", "ш", "шоссе", "набережная", "наб", "бульвар", \
             "бульвар", "б-р", "б", "бул", "бр")


# Загрузка данных

Адреса на парсинг

In [3]:
df = pd.read_excel("addresses.xlsx")
df = df[['Column1']]
df = df.dropna()
df = df.drop_duplicates() # потом убрать
df = df.reset_index() # потом убрать
df = df.drop(columns=['index']) # потом убрать
df.rename({"Column1": "address"}, inplace=True, axis=1)

Формируем таблицу из 500 рандомных записей.

In [4]:
data = df.copy()

In [5]:
data = df.sample(n=1000, random_state=39)

Таблица городов России

In [6]:
url = r'https://ru.wikipedia.org/wiki/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%B3%D0%BE%D1%80%D0%BE%D0%B4%D0%BE%D0%B2_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B8'
tables = pd.read_html(url)
table = tables[1]
table.columns = table.columns.droplevel()
wiki_towns = table[['Город', 'Регион']]

Список регионов России

In [7]:
regs = wiki_towns['Регион'].drop_duplicates().to_list()

Таблица сёл, посёлков, деревень

In [8]:
settlements_df = pd.read_excel('locations.xlsx')

# Обработка данных


Удаление знаков препинания, лишних пробелов, слов, замена известных названий, состоящих из 2-ух слов на одну единицу, например, Ясный Бор -> ясный_бор **ПЕРЕДЕЛАТЬ ВСЕ ПОД МНОЖЕСТВА**

In [9]:
def drop_punctuation(s):
    return s.replace(".", " . ").replace(",", " , ")

def drop_spaces(s):
    return re.sub(' +', ' ', s)

def changer(s, dictionary):
    for key, value in dictionary.items():
        s = re.sub(key, value, s, flags=re.IGNORECASE)
    return s

def locality_dict(locs):
    locality_dict = {}
    for loc in locs:
        if len(re.split(r'[\s]\s*', loc)) > 1:
            name = re.sub(r'\([^()]*\)', "", loc, flags=re.IGNORECASE)
            name = re.split(r'[\s]\s*', name)
            if "" in name:
                name.remove("")
            if " " in name:
                name.remove(" ")
            locality_dict.update({loc: "_".join(name)})
    return locality_dict

def drop_spaces_and_brackets(loc):
    if len(re.split(r'[\s]\s*', loc)) > 1:
        name = re.sub(r'\([^()]*\)', "", loc, flags=re.IGNORECASE)
        name = re.split(r'[\s]\s*', name)
        if "" in name:
            name.remove("")
        if " " in name:
            name.remove(" ")
    else:
        name = (loc,)
    return " ".join(name)

def extract_complex_names(locations):
    result = list()
    extracted = list()
    for loc in locations:
        if len(loc.split()) > 1:
            extracted.append(loc)
        else:
            result.append(loc)
    return result, extracted

def generate_ngrams(s, n):
    s = s.lower()
    s = re.sub(r'^[a-zA-Z0-9,.\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def locs_changer(s, locs_dict):
    grams_list = generate_ngrams(s, 2)
    for gram in grams_list:
        if locs_dict.get(gram):
            s = re.sub(gram, locs_dict.get(gram), s, flags=re.IGNORECASE)
            break
    return s

def extract_adjf_names(locations):
    result = list()
    extracted = list()
    end_list = ("ий", "ый", "ая", "яя", "ое", "ее", "ой")
    for loc in locations:
        if loc[-2:] in end_list:
            extracted.append(loc)
        else:
            result.append(loc)
    return result, extracted

Обработка списка городов

In [10]:
towns = wiki_towns['Город'].to_list()

change = functools.partial(changer, dictionary={'ё': 'е'})
towns = map(change, towns)
change = functools.partial(changer, dictionary={"оспаривается": ""})
towns = list(map(change, towns))
towns_dict = locality_dict(towns)
change = functools.partial(changer, dictionary=towns_dict)
towns = list(map(change, towns))
towns = list(dict.fromkeys(towns))

Обработка списка регионов

In [11]:
regions = list()
for reg in regs:
    reg = reg.lower()
    reg = reg.replace(" ао", "").replace(" — югра", "").replace(" — алания", "").replace(" область", "").replace(" край", "")
    regions.append("_".join(re.split(r'[\s]\s*', reg)))
regions.append("чувашская")
regions.append("саха")
regions.remove("москва")
regions.remove("санкт-петербург")

Обработка списка поселений

In [12]:
settlements = settlements_df['location'].to_list()
for elem in list(SETTLEMENTS_TO_ADD):
    settlements.append(elem)

change = functools.partial(changer, dictionary={'ё': 'е'})
settlements = list(map(change, settlements))
settlements = list(map(drop_spaces_and_brackets, settlements))
settlements = list(map(str.lower, settlements))
settlements = list(dict.fromkeys(settlements))
settlements, complex_settlements = extract_complex_names(settlements)
settlements, settlements_adjf = extract_adjf_names(settlements)

for elem in list(SETTLEMENTS_TO_REMOVE):
    if elem in settlements:
        settlements.remove(elem)

for elem in list(COMPLEX_SETTLEMENTS_TO_REMOVE):
    if elem in complex_settlements:
        complex_settlements.remove(elem)

complex_settlements_dict = locality_dict(complex_settlements)
complex_list = list(complex_settlements_dict.values())
settlements_list = settlements + complex_list

In [13]:
data = df.sample(n=300, random_state=25)
#data = df[df.index==155756]
data['address'] = data['address'].apply(str)
data['address'] = data['address'].apply(drop_punctuation)
data['address'] = data['address'].apply(drop_spaces)
data['address'] = data['address'].apply(changer, dictionary={'ё': 'е'})
data['address'] = data['address'].apply(changer, dictionary=towns_dict)
data['address'] = data['address'].apply(locs_changer, locs_dict=complex_settlements_dict)
#data['address'] = data['address'].apply(changer, dictionary=locations_dict)

Формирование таблицы, которая будет заполняться найденными сущностями

In [14]:
data['region'] = ""
data['town'] = ""
data['district'] = ""
data['settlement'] = ""
data['street'] = ""
data['building'] = ""
data['index'] = ""
data['place'] = ""
data['none'] = ""
data['else'] = ""

# Парсинг

In [15]:
#заменить северную осетию
def regions_extractor(s, names, types_before=None, types_after=None, special_names=None):
    #names = list(map(str.lower, names))
    addr_type_before = 0
    addr_type_after = 0
    name = None
    for word in s.lower().split():
        if word in names and not word in special_names:
            return word.capitalize()
        elif special_names and word in special_names:
            if addr_type_before == 1:
                return word.capitalize()
            else:
                name = word
                addr_type_after = 1
                continue
        elif word in types_after and addr_type_after:
            return name.capitalize()
        elif word in types_before:
            addr_type_before = 1
            continue
        if addr_type_after:
            addr_type_after = 0
        if addr_type_before:
            addr_type_before = 0
    return ""

def addr_part_extractor(s, names, types=None, special_names=list()):
    names = list(map(str.lower, names))
    addr_type = False
    for word in s.split():
        if types:
            if word.lower() in types:
                addr_type = True
            elif word.lower() in special_names and addr_type or word.lower() in names and not word.lower() in special_names:
                return word
        else:
            if word.lower() in names:
                return word
    return ""

def delete_addr_part(s, addr_name, addr_type):
    if addr_name == "":
        return(s)
    result = list()
    deleted = list()
    addr_name = list(map(str.lower, addr_name))
    addr_type = list(map(str.lower, addr_type))
    key = False
    # удаляем повторяющиеся названия, но стараемся не трогать другие совпадающие названия
    for word in s.split():
        if not word.lower() in addr_name and not word.lower() in addr_type:
            result.append(word)
        elif key and word.lower() in addr_name and not word.lower() in deleted:
            result.append(word)
        elif not key and word.lower() in addr_name:
            key = True
            deleted.append(word.lower())
    return " ".join(result)

###Города

In [16]:
data['town'] = data['address'].apply(addr_part_extractor, names=towns, types=TOWNS_KEY_WORDS, special_names=SPECIAL_TOWNS)
data['else'] = data.apply(lambda x: delete_addr_part(x['address'], (x['town'],), TOWNS_KEY_WORDS), axis=1)

###Регионы

In [17]:
data['region'] = data['else'].apply(regions_extractor, names=regions, \
                                       types_before=REGIONS_KEY_WORDS_BEFORE, \
                                       types_after=REGIONS_KEY_WORDS_AFTER, \
                                       special_names=SPECIAL_REGIONS)
data['else'] = data.apply(lambda x: delete_addr_part(x['else'], (x['region'],), REGIONS_KEY_WORDS), axis=1)

###Поселения

In [18]:
%%time
data['settlement'] = data[data['town']==""]['else'].apply(addr_part_extractor, names=settlements_list, types=SETTLEMENT_KEY_WORDS)
data = data.fillna(value="")
data['else'] = data.apply(lambda x: delete_addr_part(x['else'], (x['settlement'],), SETTLEMENT_KEY_WORDS), axis=1)

CPU times: user 389 ms, sys: 4.06 ms, total: 393 ms
Wall time: 393 ms


In [None]:
settlements

#Yargy

###Yargy правила

In [19]:
def value(key):
        @property
        def field(self):
            return getattr(self, key)
        return field

In [31]:
from natasha.grammars.addr import GOROD, INDEX, FED_OKRUG, RESPUBLIKA, KRAI, AUTO_OKRUG, DEREVNYA, \
SELO, POSELOK

from yargy import Parser
from IPython.display import display
from yargy import (
    rule,
    or_, and_
)
from yargy.interpretation import fact
from yargy.predicates import (
    eq, lte, gte, gram, type, tag,
    length_eq,
    in_, in_caseless, dictionary,
    normalized, caseless,
    is_title
)
from yargy.pipelines import morph_pipeline
from yargy.tokenizer import QUOTES

INT = type('INT')
DOT = eq('.')
ADJF = gram('ADJF')
NOUN = gram('NOUN')
TITLE = is_title()
DASH = eq('-')
SLASH = eq('/')

ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({
        'я', 'й', 'е',
        'ое', 'ая', 'ий', 'ой'
    })
)

def value(key):
    @property
    def field(self):
        return getattr(self, key)
    return field

work_place = fact(
    'work_place',
    ['number', 'type']
)

AddrPart = fact(
    'AddrPart',
    ['value']
)

work_place_part = fact(
    'AddrPart',
    ['value']
)

OnlyNumberIsu = fact(
    'OnlyNumberIsu',
    ['number']
)

OnlyNumberPlace = fact(
    'OnlyNumberPlace',
    ['number']
)

Region = fact(
    'Region',
    ['name', 'type']
)

Building = fact(
    'Building',
    ['number', 'type']
)

Room = fact(
    'Room',
    ['number', 'type']
)

Settlement = fact(
    'Settlement',
    ['name', 'type']
)

Street = fact(
    'Street',
    ['name', 'type']
)

class Region(Region):
    value = value('name')

class Settlement(Settlement):
    value = value('name')
    
class Street(Settlement):
    value = value('name')

class Building(Building):
    value = value('number')
    
class Room(Room):
    value = value('number')

class OnlyNumberIsu(OnlyNumberIsu):
    type = 'ВСП/ДО'
    value = value('number')
    
class OnlyNumberPlace(OnlyNumberPlace):
    type = 'место'
    value = value('number')    
    
class work_place(work_place):
    value = value('number')   
    
class work_place_part(AddrPart):
    @property
    def obj(self):
        from natasha import obj

        part = self.value
        return obj.AddrPart(part.value, part.type)

#########
#
#  Number
#
##########

LETTER = in_caseless(set('абвгдежзийклмнопрстуфхшщэюя'))
LETTER_LATIN = in_caseless(set('qwertyuiopasdfghjklzxcvbnm'))
LETTERS = or_(
    rule(LETTER),
    rule(LETTER_LATIN)
)
SEP = in_(r'/\-')

SIMPLE_NUMBER = rule(
    INT,
    LETTERS.optional()
)

NUMBER_CASES = or_(
    rule(SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, LETTERS)
)

NUMBER = rule(
    eq('№').optional(),
    NUMBER_CASES
)

#########
#
#  Floor
#
##########

FLOORNAME = rule(normalized('этаж')
).interpretation(
    work_place.type.const('этаж'))

FLOOR_NUMBER = NUMBER.interpretation(
    work_place.number
)

FLOOR = rule(
    FLOORNAME,
    FLOOR_NUMBER
).interpretation(
    work_place
)

#########
#
#  Cabinet
#
##########


CABNAME = or_(
    rule(normalized('кабинет')),
    rule(
        caseless('каб'),
        DOT.optional()
    ),
    rule(
        caseless('к'),
        DOT
    )
).interpretation(
    work_place.type.const('кабинет')
)

CAB_NUMBER = NUMBER.interpretation(
    work_place.number
)

CABINET = rule(
    CABNAME,
    CAB_NUMBER
).interpretation(
    work_place
)

#########
#
#  Sector
#
##########

QUOTE = in_(QUOTES)

DOT_INT = rule(DOT, INT)

SECTORNAME = or_(
    caseless('сектор'),
    caseless('блок')
).interpretation(
    work_place.type.const('сектор'))

LETTERS_OPT_INT = or_(
    rule(LETTER, INT.optional()),
    rule(LETTER_LATIN, INT.optional())
)

LETTERS_INT = or_(
    rule(LETTER, INT),
    rule(LETTER_LATIN, INT)
)

LETTERS_INTS = rule(LETTERS_OPT_INT, DOT_INT)

QUOTE_LETTER_INT = rule(
    QUOTE, LETTERS_OPT_INT, QUOTE
)

SECTOR_NUMBER = or_(
    rule(QUOTE_LETTER_INT),
    rule(LETTERS_INT, DOT_INT.optional()),
    rule(LETTERS_OPT_INT),
    rule(LETTERS, DOT_INT),
    rule(INT, DOT_INT, DOT_INT.optional()),
    rule(LETTERS, DOT_INT),
    rule(SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, LETTERS)
).interpretation(
    work_place.number
)

SECTOR = rule(
    SECTORNAME,
    eq('№').optional(),SECTOR_NUMBER
).interpretation(
    work_place
)

# #########
# #
# #  Place
# #
# ##########

# COMPLEX_PLACENAME = morph_pipeline([
#     'рабочее место',
#     'раб.место'
# ])

# PLACENAME = or_(
#     rule(normalized('место')),
#     rule(
#         caseless('мес'),
#         DOT.optional()
#     ),
#     rule(caseless('м'), DOT),
#     rule(caseless('рм'), DOT.optional()),
#     rule(
#         caseless('р'),
#         DOT.optional(),
#         caseless('м'),
#         DOT.optional()
#     ),
#     COMPLEX_PLACENAME
# ).interpretation(
#     work_place.type.const('место'))

# PLACE_NUMBER = or_(
#     rule(LETTERS, DOT_INT, DOT_INT),
#     rule(INT, DOT_INT, DOT_INT.optional()),
#     rule(LETTERS_INT, DOT_INT.optional()),
#     rule(SIMPLE_NUMBER),
#     rule(SIMPLE_NUMBER, SEP, SIMPLE_NUMBER),
#     rule(SIMPLE_NUMBER, SEP, LETTERS)
# ).interpretation(
#     work_place.number
# )

# COMPLEX_PLACE = rule(COMPLEX_PLACENAME, 
#          eq('№').optional(),
#          PLACE_NUMBER)

# PLACE = or_(
#     rule(PLACENAME,
#          eq('№').optional(),
#          PLACE_NUMBER),
#     COMPLEX_PLACE
# ).interpretation(
#     work_place
# )

# ONLY_NUMBER_PLACE = or_(
#     rule(INT, DOT_INT, DOT_INT.optional()), # 10.10.10
#     rule(LETTERS_INTS, DOT_INT.optional()), # B2.10.10 , B.10.10, B2.10
#     rule(INT, DOT, LETTERS_INTS), # 10.B10.10, 10.B.10
#     rule(LETTERS_INTS, DOT_INT, DOT_INT.optional()), # B2.10.10, B.10.10, B2.10.10.10, B.10.10.10
#     rule(INT, DOT, LETTERS, DOT_INT, DOT_INT.optional()), # 10.B.10, 10.B.10.10
# ).interpretation(
#     OnlyNumberPlace.number
# ).interpretation(
#     OnlyNumberPlace
# )

#########
#
#  Room
#
##########

ROOMNAME = or_(
    rule(caseless('комната')),
    rule(caseless('ком'),
         DOT.optional()),
    rule(caseless('комн'),
         DOT.optional())
).interpretation(
    work_place.type.const('комната'))

ROOM_NUMBER = or_(
    rule(LETTERS_INT),
    rule(INT, DOT_INT, DOT_INT.optional()),
    rule(SIMPLE_NUMBER)
).interpretation(
    work_place.number
)

ROOM = rule(
    ROOMNAME,
    eq('№').optional(),ROOM_NUMBER
).interpretation(
    work_place
)

#########
#
#  Window
#
##########

WINDOWNAME = rule(normalized('окно')
).interpretation(
    work_place.type.const('окно'))

WINDOW_NUMBER = NUMBER.interpretation(
    work_place.number
)

WINDOW = rule(
    WINDOWNAME,
    WINDOW_NUMBER
).interpretation(
    work_place
)

#########
#
#  ВСП
#
##########

INT_FOUR = and_(INT,
                length_eq(4))

ONLY_NUMBER_ISU = rule(
    INT_FOUR,
    SLASH,
    INT
).interpretation(
    OnlyNumberIsu.number
).interpretation(
    OnlyNumberIsu
)


COMPLEX_ISUNAME = morph_pipeline([
    'дополнительный офис',
    'доп.офис'
])

ISUNAME = or_(
    rule(caseless('ВСП')),
    rule(caseless('ДО')),
    COMPLEX_ISUNAME
).interpretation(
    work_place.type.const('ВСП/ДО'))

ISU_NUMBER = or_(
    rule(SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, LETTERS),
).interpretation(
    work_place.number
)

COMPLEX_ISU = rule(COMPLEX_PLACENAME, 
         eq('№').optional(),
         ISU_NUMBER)

ISU = or_(
    rule(ISUNAME,
         eq('№').optional(),
         ISU_NUMBER),
    rule(COMPLEX_ISU),
).interpretation(
    work_place
)

############
#
#    OBLAST IMP
#
############


OBLAST_WORDS = or_(
    rule(normalized('область')),
    rule(
        caseless('обл'),
        DOT.optional()
    )
).interpretation(
    Region.type.const('область')
)

OBLAST_NAME = dictionary({
    'амурский',
    'архангельский',
    'астраханский',
    'белгородский',
    'брянский',
    'владимирский',
    'волгоградский',
    'вологодский',
    'воронежский',
    'горьковский',
    'ивановский',
    'ивановский',
    'иркутский',
    'калининградский',
    'калужский',
    'камчатский',
    'кемеровский',
    'кировский',
    'костромской',
    'курганский',
    'курский',
    'ленинградский',
    'липецкий',
    'магаданский',
    'московский',
    'мурманский',
    'нижегородский',
    'новгородский',
    'новосибирский',
    'омский',
    'оренбургский',
    'орловский',
    'пензенский',
    'пермский',
    'псковский',
    'ростовский',
    'рязанский',
    'самарский',
    'саратовский',
    'сахалинский',
    'свердловский',
    'смоленский',
    'тамбовский',
    'тверской',
    'томский',
    'тульский',
    'тюменский',
    'ульяновский',
    'челябинский',
    'читинский',
    'ярославский',
}).interpretation(
    Region.name
)

OBLAST = or_(
    rule(OBLAST_NAME, OBLAST_WORDS),
    rule(OBLAST_WORDS, OBLAST_NAME)
).interpretation(
    Region
)

##########
#
#  RAION IMP
#
###########


RAION_WORDS = or_(
    rule(caseless('р'), '-', in_caseless({'он', 'н'})),
    rule(caseless('мрн')),
    rule(caseless('мкр')),
    rule(normalized('район')),
    rule(normalized('микрорайон'))
).interpretation(
    Region.type.const('район')
)

RAION_SIMPLE_NAME = and_(
    ADJF,
    TITLE
)

RAION_MODIFIERS = rule(
    in_caseless({
        'усть',
        'северо',
        'александрово',
        'гаврилово',
    }),
    DASH.optional(),
    TITLE
)

RAION_COMPLEX_NAME = rule(
    RAION_MODIFIERS,
    RAION_SIMPLE_NAME
)

RAION_NAME = or_(
    rule(RAION_SIMPLE_NAME),
    RAION_COMPLEX_NAME
).interpretation(
    Region.name
)

RAION = rule(
    RAION_NAME,RAION_WORDS
    #rule(RAION_WORDS,RAION_NAME)
).interpretation(
    Region
)

##############
#
#   ADDR PERSON CPY
#
############


ABBR = and_(
    length_eq(1),
    is_title()
)

PART = and_(
    TITLE,
    or_(
        gram('Name'),
        gram('Surn')
    )
)

MAYBE_FIO = or_(
    rule(TITLE, PART),
    rule(PART, TITLE),
    rule(ABBR, '.', TITLE),
    rule(ABBR, '.', ABBR, '.', TITLE),
    rule(TITLE, ABBR, '.', ABBR, '.')
)

POSITION_WORDS_ = or_(
    rule(
        dictionary({
            'мичман',
            'геолог',
            'подводник',
            'краевед',
            'снайпер',
            'штурман',
            'бригадир',
            'учитель',
            'политрук',
            'военком',
            'ветеран',
            'историк',
            'пулемётчик',
            'авиаконструктор',
            'адмирал',
            'академик',
            'актер',
            'актриса',
            'архитектор',
            'атаман',
            'врач',
            'воевода',
            'генерал',
            'губернатор',
            'хирург',
            'декабрист',
            'разведчик',
            'граф',
            'десантник',
            'конструктор',
            'скульптор',
            'писатель',
            'поэт',
            'капитан',
            'князь',
            'комиссар',
            'композитор',
            'космонавт',
            'купец',
            'лейтенант',
            'лётчик',
            'майор',
            'маршал',
            'матрос',
            'подполковник',
            'полковник',
            'профессор',
            'сержант',
            'старшина',
            'танкист',
            'художник',
            'герой',
            'княгиня',
            'строитель',
            'дружинник',
            'диктор',
            'прапорщик',
            'артиллерист',
            'графиня',
            'большевик',
            'патриарх',
            'сварщик',
            'офицер',
            'рыбак',
            'брат',
        })
    ),
    rule(normalized('генерал'), normalized('армия')),
    rule(normalized('герой'), normalized('россия')),
    rule(
        normalized('герой'),
        normalized('российский'), normalized('федерация')),
    rule(
        normalized('герой'),
        normalized('советский'), normalized('союз')
    ),
)

ABBR_POSITION_WORDS = rule(
    in_caseless({
        'адм',
        'ак',
        'акад',
    }),
    DOT.optional()
)

POSITION_WORDS = or_(
    POSITION_WORDS_,
    ABBR_POSITION_WORDS
)

MAYBE_PERSON = or_(
    MAYBE_FIO,
    rule(POSITION_WORDS, MAYBE_FIO),
    rule(POSITION_WORDS, TITLE)
)

###########
#
#   IMENI CPY
#
##########


IMENI_WORDS = or_(
    rule(
        caseless('им'),
        DOT.optional()
    ),
    rule(caseless('имени'))
)

IMENI = or_(
    rule(
        IMENI_WORDS.optional(),
        MAYBE_PERSON
    ),
    rule(
        IMENI_WORDS,
        TITLE
    )
)

##########
#
#   LET CPY
#
##########


LET_WORDS = or_(
    rule(caseless('лет')),
    rule(
        DASH.optional(),
        caseless('летия')
    )
)

LET_NAME = in_caseless({
    'влксм',
    'ссср',
    'алтая',
    'башкирии',
    'бурятии',
    'дагестана',
    'калмыкии',
    'колхоза',
    'комсомола',
    'космонавтики',
    'москвы',
    'октября',
    'пионерии',
    'победы',
    'приморья',
    'района',
    'совхоза',
    'совхозу',
    'татарстана',
    'тувы',
    'удмуртии',
    'улуса',
    'хакасии',
    'целины',
    'чувашии',
    'якутии',
})

LET = rule(
    INT,
    LET_WORDS,
    LET_NAME
)

##########
#
#    ADDR DATE CPY
#
#############


MONTH_WORDS = dictionary({
    'январь',
    'февраль',
    'март',
    'апрель',
    'май',
    'июнь',
    'июль',
    'август',
    'сентябрь',
    'октябрь',
    'ноябрь',
    'декабрь',
})

DAY = and_(
    INT,
    gte(1),
    lte(31)
)

YEAR = and_(
    INT,
    gte(1),
    lte(2100)
)

YEAR_WORDS = normalized('год')

DATE = or_(
    rule(DAY, MONTH_WORDS),
    rule(YEAR, YEAR_WORDS)
)

#########
#
#   MODIFIER CPY
#
############


MODIFIER_WORDS_ = rule(
    dictionary({
        'большой',
        'малый',
        'средний',

        'верхний',
        'центральный',
        'нижний',
        'северный',
        'дальний',

        'первый',
        'второй',

        'старый',
        'новый',

        'красный',
        'лесной',
        'тихий',
    }),
    DASH.optional()
)

ABBR_MODIFIER_WORDS = rule(
    in_caseless({
        'б', 'м', 'н'
    }),
    DOT.optional()
)

SHORT_MODIFIER_WORDS = rule(
    in_caseless({
        'больше',
        'мало',
        'средне',

        'верх',
        'верхне',
        'центрально',
        'нижне',
        'северо',
        'дальне',
        'восточно',
        'западно',

        'перво',
        'второ',

        'старо',
        'ново',

        'красно',
        'тихо',
        'горно',
    }),
    DASH.optional()
)

MODIFIER_WORDS = or_(
    MODIFIER_WORDS_,
    ABBR_MODIFIER_WORDS,
    SHORT_MODIFIER_WORDS,
)


##########
#
#   ADDR NAME IMP
#
##########


ROD = gram('gent')

SIMPLE = and_(
    or_(
        ADJF,  # Школьная
        and_(NOUN, ROD),  # Ленина, Победы
    ),
)

COMPLEX = or_(
    rule(
        and_(ADJF),
        NOUN
    ),
    rule(
        TITLE,
        DASH.optional(),
        TITLE
    ),
)

EXCEPTION = dictionary({
    'арбат',
    'варварка',
    'мельникайте',
    'каховка',
    'зорге'
})

MAYBE_NAME = or_(
    rule(SIMPLE),
    COMPLEX,
    rule(EXCEPTION)
)

NAME = or_(
    MAYBE_NAME,
    LET,
    DATE,
    IMENI
)

NAME = rule(
    MODIFIER_WORDS.optional(),
    NAME
)

ADDR_CRF = tag('I').repeatable()

NAME = or_(
    NAME,
    ANUM,
    rule(NAME, ANUM),
    rule(ANUM, NAME),
    rule(INT, DASH.optional(), NAME),
    rule(NAME, DASH, INT),
    ADDR_CRF
)

ADDR_NAME = NAME

########
#
#    STREET CPY
#
#########


STREET_WORDS = or_(
    rule(normalized('улица')),
    rule(
        caseless('ул'),
        DOT.optional()
    )
).interpretation(
    Street.type.const('улица')
)

STREET_NAME = ADDR_NAME.interpretation(
    Street.name
)

STREET = or_(
    rule(STREET_WORDS, STREET_NAME),
    rule(STREET_NAME, STREET_WORDS)
).interpretation(
    Street
)

##########
#
#    PROSPEKT CPY
#
##########


PROSPEKT_WORDS = or_(
    rule(
        in_caseless({'пр', 'просп'}),
        DOT.optional()
    ),
    rule(
        caseless('пр'),
        '-',
        in_caseless({'кт', 'т'}),
        DOT.optional()
    ),
    rule(normalized('проспект'))
).interpretation(
    Street.type.const('проспект')
)

PROSPEKT_NAME = ADDR_NAME.interpretation(
    Street.name
)

PROSPEKT = or_(
    rule(PROSPEKT_WORDS, PROSPEKT_NAME),
    rule(PROSPEKT_NAME, PROSPEKT_WORDS)
).interpretation(
    Street
)


############
#
#    PROEZD CPY
#
#############


PROEZD_WORDS = or_(
    rule(caseless('пр'), DOT.optional()),
    rule(
        caseless('пр'),
        '-',
        in_caseless({'зд', 'д'}),
        DOT.optional()
    ),
    rule(normalized('проезд'))
).interpretation(
    Street.type.const('проезд')
)

PROEZD_NAME = ADDR_NAME.interpretation(
    Street.name
)

PROEZD = or_(
    rule(PROEZD_WORDS, PROEZD_NAME),
    rule(PROEZD_NAME, PROEZD_WORDS)
).interpretation(
    Street
)


###########
#
#   PEREULOK CPY
#
##############


PEREULOK_WORDS = or_(
    rule(
        caseless('п'),
        DOT
    ),
    rule(
        caseless('пер'),
        DOT.optional()
    ),
    rule(normalized('переулок'))
).interpretation(
    Street.type.const('переулок')
)

PEREULOK_NAME = ADDR_NAME.interpretation(
    Street.name
)

PEREULOK = or_(
    rule(PEREULOK_WORDS, PEREULOK_NAME),
    rule(PEREULOK_NAME, PEREULOK_WORDS)
).interpretation(
    Street
)


########
#
#  PLOSHAD CPY
#
##########


PLOSHAD_WORDS = or_(
    rule(
        caseless('пл'),
        DOT.optional()
    ),
    rule(normalized('площадь'))
).interpretation(
    Street.type.const('площадь')
)

PLOSHAD_NAME = ADDR_NAME.interpretation(
    Street.name
)

PLOSHAD = or_(
    rule(PLOSHAD_WORDS, PLOSHAD_NAME),
    rule(PLOSHAD_NAME, PLOSHAD_WORDS)
).interpretation(
    Street
)

########
#
#  ADDR VALUE IMP
#
##########

LETTER = or_(
    rule(LETTER),
    rule(QUOTE, LETTER, QUOTE)
)

VALUE = rule(
    INT,
    LETTER.optional()
)

SEP = in_(r'/\-')

VALUE = or_(
    rule(VALUE),
    rule(VALUE, SEP, VALUE),
    rule(VALUE, SEP, LETTER),
)

ADDR_VALUE = rule(
    eq('№').optional(),
    VALUE
)

############
#
#   SHOSSE IMP
#
###########

SHOSSE_WORDS = or_(
    rule(
        caseless('ш'),
        DOT.optional()
    ),
    rule(normalized('шоссе'))
).interpretation(
    Street.type.const('шоссе')
)

SHOSSE_NAME = ADDR_NAME.interpretation(
    Street.name
)

SHOSSE = or_(
    rule(SHOSSE_NAME, SHOSSE_WORDS),
    rule(SHOSSE_WORDS, SHOSSE_NAME)
).interpretation(
    Street
)

########
#
#  NABEREG CPY
#
##########


NABEREG_WORDS = or_(
    rule(
        caseless('наб'),
        DOT.optional()
    ),
    rule(normalized('набережная'))
).interpretation(
    Street.type.const('набережная')
)

NABEREG_NAME = ADDR_NAME.interpretation(
    Street.name
)

NABEREG = or_(
    rule(NABEREG_WORDS, NABEREG_NAME),
    rule(NABEREG_NAME, NABEREG_WORDS)
).interpretation(
    Street
)


########
#
#  BULVAR IMP
#
##########


BULVAR_WORDS = or_(
    rule(
        caseless('б'),
        '-',
        caseless('р'),
        DOT.optional()
    ),
    rule(
        caseless('б'),
        DOT
    ),
    rule(
        caseless('бул'),
        DOT.optional()
    ),
    rule(normalized('бульвар'))
).interpretation(
    Street.type.const('бульвар')
)

BULVAR_NAME = ADDR_NAME.interpretation(
    Street.name
)

BULVAR = or_(
    rule(BULVAR_WORDS, BULVAR_NAME),
    rule(BULVAR_NAME, BULVAR_WORDS)
).interpretation(
    Street
)

############
#
#    DOM IMP
#
#############


DOM_WORDS = or_(
    rule(normalized('дом')),
    rule(
        caseless('д'),
        DOT.optional()
    )
).interpretation(
    Building.type.const('дом')
)

DOM_VALUE = ADDR_VALUE.interpretation(
    Building.number
)

DOM = rule(
    DOM_WORDS,
    DOM_VALUE
).interpretation(
    Building
)

#########
#
#  Place
#
##########

COMPLEX_PLACENAME = morph_pipeline([
    'рабочее место',
    'раб.место'
])

PLACENAME = or_(
    rule(normalized('место')),
    rule(
        caseless('мес'),
        DOT.optional()
    ),
    rule(caseless('м'), DOT),
    rule(caseless('рм'), DOT.optional()),
    rule(
        caseless('р'),
        DOT.optional(),
        caseless('м'),
        DOT.optional()
    ),
    COMPLEX_PLACENAME
).interpretation(
    work_place.type.const('место'))

PLACE_NUMBER = or_(
    rule(LETTERS, DOT_INT, DOT_INT),
    rule(INT, DOT_INT, DOT_INT.optional()),
    rule(LETTERS_INT, DOT_INT.optional()),
    rule(SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, SIMPLE_NUMBER),
    rule(SIMPLE_NUMBER, SEP, LETTERS)
).interpretation(
    work_place.number
)

COMPLEX_PLACE = rule(COMPLEX_PLACENAME, 
         eq('№').optional(),
         PLACE_NUMBER)

PLACE = or_(
    rule(PLACENAME,
         eq('№').optional(),
         PLACE_NUMBER),
    COMPLEX_PLACE
).interpretation(
    work_place
)

ONLY_NUMBER_PLACE = or_(
    rule(INT, DOT_INT, DOT_INT.optional()), # 10.10.10
    rule(LETTERS_INTS, DOT_INT.optional()), # B2.10.10 , B.10.10, B2.10
    rule(INT, DOT, LETTERS_INTS), # 10.B10.10, 10.B.10
    rule(LETTERS_INTS, DOT_INT, DOT_INT.optional()), # B2.10.10, B.10.10, B2.10.10.10, B.10.10.10
    rule(INT, DOT, LETTERS, DOT_INT, DOT_INT.optional()), # 10.B.10, 10.B.10.10
).interpretation(
    OnlyNumberPlace.number
).interpretation(
    OnlyNumberPlace
)

###########
#
#  KORPUS IMP
#
##########


KORPUS_WORDS = or_(
    rule(
        in_caseless({'корп', 'кор'}),
        DOT.optional()
    ),
    rule(normalized('корпус'))
).interpretation(
    Building.type.const('корпус')
)

KORPUS_VALUE = ADDR_VALUE.interpretation(
    Building.number
)

KORPUS = rule(
        KORPUS_WORDS,
        KORPUS_VALUE
).interpretation(
    Building
)

###########
#
#  STROENIE IMP
#
##########


STROENIE_WORDS = or_(
    rule(
        caseless('стр'),
        DOT.optional()
    ),
    rule(normalized('строение'))
).interpretation(
    Building.type.const('строение')
)

STROENIE_VALUE = ADDR_VALUE.interpretation(
    Building.number
)

STROENIE = rule(
    STROENIE_WORDS,
    STROENIE_VALUE
).interpretation(
    Building
)

###########
#
#   OFIS CPY
#
#############


OFIS_WORDS = or_(
    rule(
        caseless('оф'),
        DOT.optional()
    ),
    rule(normalized('офис'))
).interpretation(
    Room.type.const('офис')
)

OFIS_VALUE = ADDR_VALUE.interpretation(
    Room.number
)

OFIS = rule(
    OFIS_WORDS,
    OFIS_VALUE
).interpretation(
    Room
)


###########
#
#   KVARTIRA CPY
#
#############


KVARTIRA_WORDS = or_(
    rule(
        caseless('кв'),
        DOT.optional()
    ),
    rule(normalized('квартира'))
).interpretation(
    Room.type.const('квартира')
)

KVARTIRA_VALUE = ADDR_VALUE.interpretation(
    Room.number
)

KVARTIRA = rule(
    KVARTIRA_WORDS,
    KVARTIRA_VALUE
).interpretation(
    Room
)

#########
#
#  work_place_part (final rule)
#
##########

WORK_PLACE_PART = or_(
    INDEX,
    
    GOROD,
    FED_OKRUG,
    RESPUBLIKA,
    KRAI,
    OBLAST,
    AUTO_OKRUG,
    
    DEREVNYA,
    SELO,
    POSELOK,
    
    STREET,
    PROSPEKT,
    PROEZD,
    PEREULOK,
    PLOSHAD,
    SHOSSE,
    NABEREG,
    BULVAR,
    
    DOM,
    KORPUS,
    STROENIE,
    OFIS,
    KVARTIRA,
    
    FLOOR,
    PLACE,
    ONLY_NUMBER_PLACE,
    SECTOR,
    ROOM,
    CABINET,
    WINDOW,
    RAION,
    ISU,
    ONLY_NUMBER_ISU
).interpretation(
    AddrPart.value
).interpretation(
    AddrPart
)

parser = Parser(WORK_PLACE_PART)

In [28]:
df[df.index==8251]

Unnamed: 0,address
8251,"Москва Оружейный переулок, д.41 20 этаж, С5, рм 20.54"


In [32]:
text = 'Москва Оружейный переулок, д.41 20 этаж, С5, рм 20.54'


region = ''
district = ''
location = ''
street = ''
building = ''
index = ''
town = ''
place = ''
isnone = ''

#addr_list = []
tmp_one_place_list = []
building_list = []
work_place_list = ['окно', 'место', 'кабинет', 'этаж', 'ВСП/ДО', 'сектор', 'комната']
street_list = ['улица', 'проспект', 'проезд', 'переулок', 'площадь', 'шоссе', 'набережная', 'бульвар'] 

# from ipymarkup import show_span_ascii_markup as show_markup

# for line in text_list:
#     matches = list(parser.findall(line))
#     spans = [_.span for _ in matches]
#     show_markup(line, spans)

for match in parser.findall(text):
    #print(match)
    print(match.tokens[0].value)
    #print(match.span.stop)
    display(match.fact)
    if match.fact.value.type in work_place_list:
        tmp_one_place_list.append(match.fact.value.type)
        tmp_one_place_list.append(match.fact.value.number)
        place = ' '.join(tmp_one_place_list)
    elif match.fact.value.type in ('федеральный округ', 'республика', 'край', 'область', 'автономный округ'):
        region = match.fact.value.type + ' ' + match.fact.value.name
    elif match.fact.value.type == 'район':
        district = match.fact.value.name + ' ' + match.fact.value.type
    elif match.fact.value.type == 'город':
        town = match.fact.value.value
    elif match.fact.value.type in ('село', 'деревня', 'посёлок'):
        location = match.fact.value.type + ' ' + match.fact.value.name
    elif match.fact.value.type in street_list:
        street = match.fact.value.type + ' ' + match.fact.value.name
    elif match.fact.value.type in ('дом', 'корпус', 'строение', 'офис', 'квартира'):
        building_list.append(match.fact.value.type)
        building_list.append(match.fact.value.number)
        building = " ".join(building_list)
    elif match.fact.value.type == 'индекс':
        index = match.fact.value.value
    elif match.fact.value.type is None:
        isnone = match.fact.value.value

print('Region: {reg}\nDistrict: {dis}\nTown: {to}\nLocation: {lo}\nStreet: {stre}\nBuilding: {bui}\nIndex: {ind}\nNone: {no}\nPlace: {pl}'.format(reg=region, dis=district, to=town, lo=location, \
                                                                        stre=street, bui=building, ind=index, no=isnone, pl=place))

Москва


AddrPart(
    value=Street(
        name='Москва Оружейный',
        type='переулок'
    )
)

д


AddrPart(
    value=Building(
        number='41',
        type='дом'
    )
)

рм


AddrPart(
    value=work_place(
        number='20.54',
        type='место'
    )
)

Region: 
District: 
Town: 
Location: 
Street: переулок Москва Оружейный
Building: дом 41
Index: 
None: 
Place: место 20.54


###Функция yargy

In [33]:
def yargy_parser(text, parser=Parser(WORK_PLACE_PART)):
    region = ''
    town = ''
    district = ''
    settlement = ''
    street = ''
    building = ''
    index = ''
    place = ''
    isnone = ''
    deleted_addr_parts = text
    i = 0

    #addr_list = []
    tmp_one_place_list = []
    building_list = []
    work_place_list = ['окно', 'место', 'кабинет', 'этаж', 'ВСП/ДО', 'сектор', 'комната']
    street_list = ['улица', 'проспект', 'проезд', 'переулок', 'площадь', 'шоссе', 'набережная', 'бульвар']

    for match in parser.findall(text):
      
        if not match.fact.value.type is None:
            deleted_addr_parts = deleted_addr_parts[0: match.span.start - i] + deleted_addr_parts[match.span.stop - i:]
            i += match.span.stop - match.span.start

        if match.fact.value.type in work_place_list:
            tmp_one_place_list.append(match.fact.value.type)
            tmp_one_place_list.append(match.fact.value.number)
            place = ' '.join(tmp_one_place_list)
        elif match.fact.value.type in ('федеральный округ', 'республика', 'край', 'область', 'автономный округ'):
            region = match.fact.value.type + ' ' + match.fact.value.name
        elif match.fact.value.type == 'район':
            district = match.fact.value.name + ' ' + match.fact.value.type
        elif match.fact.value.type == 'город':
            town = match.fact.value.value
        elif match.fact.value.type in ('село', 'деревня', 'посёлок'):
            settlement = match.fact.value.type + ' ' + match.fact.value.name
        elif match.fact.value.type in street_list:
            street = match.fact.value.type + ' ' + match.fact.value.name
        elif match.fact.value.type in ('дом', 'корпус', 'строение', 'офис', 'квартира'):
            building_list.append(match.fact.value.type)
            building_list.append(match.fact.value.number)
            building = " ".join(building_list)
        elif match.fact.value.type == 'индекс':
            index = match.fact.value.value
        elif match.fact.value.type is None:
            isnone = match.fact.value.value
        
    return region, town, district, settlement, street, building, index, place, isnone, deleted_addr_parts

# Запуск yargy

In [34]:
d = data.copy()

d['region'], d['town'], d['district'], d['settlement'], d['street'], \
d['building'], d['index'], d['place'], d['none'], d['else'] = zip(*d['else'].apply(yargy_parser))

d['town'] = data['town'].add(d['town'], fill_value="")
d['region'] = data['region'].add(d['region'], fill_value="")
d['settlement'] = data['settlement'].add(d['settlement'], fill_value="")

In [None]:
 #182399 удалить дополнительный убрать склеивания возможно добавить села-поселки из 3 слов

In [68]:
d[190:200]

Unnamed: 0,address,region,town,district,settlement,street,building,index,place,none,else
96851,Ульяновск Хрустальная 2,,Ульяновск,,,,,,,,Хрустальная 2
127304,"с . Ребриха Алтайский край , Ребрихинский район",Алтайский,,Ребрихинский район,Ребриха,,,,,,". ,"
44349,"г . Воронеж ул . Кольцовская , 35а",,Воронеж,,,улица Кольцовская,,,,,". , 35а"
29786,"Налобиха Алтайский край Косихинский р-он с . Налобиха ул . Тельмана , 37 ОСБ 7492/067",Алтайский,,Косихинский район,Налобиха,улица Тельмана,,,ВСП/ДО 7492/067,,". , 37 ОСБ"
118970,"Санкт-Петербург ул . Красного Текстильщика , д . 2 (комната 454) 454",,Санкт-Петербург,,,улица Красного Текстильщика,дом 2,,комната 454,,", () 454"
122898,"санкт-петербург новое девяткино , ло , арсенальная 14",,санкт-петербург,,,,,,,,"новое девяткино , ло , арсенальная 14"
13093,"Иркутск ул . Нижняя Набережная , д . 10 920",,Иркутск,,,улица Нижняя Набережная,дом 10,,,,", 920"
153156,Москва Варшавское шоссе д . 9 стр 1,,Москва,,,шоссе Варшавское,дом 9 строение 1,,,,
172430,г . Санкт- Петербург пр . Обуховской Обороны 295 Комната № 3,,,,,проспект Обуховской Обороны,,,комната 3,Санкт- Петербург,. Санкт- Петербург 295
124044,"г Элиста , Респ Калмыкия ул М . Горького , 13Б 1",Калмыкия,Элиста,,,улица М . Горького,,,,,", , 13Б 1"


# Сохранение результатов

In [None]:
data.to_csv('address_parsing.csv', index=True)  

#Ненужное

In [None]:
# def key_setter(s):
#     special_list = ("", "большая_дмитровка", "1_мая", "8_марта", "карла_маркса",\
#                     "красный_путь", "розы_люксембург", "степана_разина", "красный_октябрь", \
#                     "льва_толстого", "красная_площадь", "красные_зори", "большая_покровская", \
#                     "максима_горького", "красная_пресня", "", "", "", "", "", "", "", "", "", "", "" )
#     norm_list = ("теплый_стан", "марьина_роща", "белый_яр", "нижняя_тавда", "газимурский_завод", \
#                  "красный_яр", "высокий_мыс", "крутая_горка", "красная_поляна", "", "", "", "" )
#     if s in special_list or s in norm_list:
#         return 0
#     else:
#         return 1
# data['key'] = data['settlement'].apply(key_setter)

# def extracted_locs_split(locations):
#     result = list()
#     for loc in locations:
#         for word in loc.split():
#             result.append(word)
#     return result

# def extract_similar_to_complex(locs, complex_locs):
#     result = list()
#     complex_locs = list(map(str.lower, complex_locs))
#     for loc in locs:
#         if not loc.lower() in complex_locs:
#             result.append(loc)
#     return result

# complex_locations_list = list(set(extracted_locs_split(complex_locations)))
# locations = extract_similar_to_complex(locations, complex_locations_list)

#complex_list = list(locations_dict.values())

# %%time
# no_towns['else'] = no_towns['else'].apply(locs_changer, locs_dict=settlements_dict)

In [None]:
if "":
    print('yes')
else:
    print('no')