In [None]:
import difflib
import numpy as np
import pandas as pd
from Levenshtein import distance

In [None]:
# Load the ';'-separated address pairs, dropping malformed rows with a warning.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines='warn'`. Try the modern keyword first and fall
# back to the legacy one so the cell runs on both old and new pandas.
try:
    data = pd.read_csv('../input/geo1.csv', sep=';', on_bad_lines='warn')
except TypeError:
    # Older pandas does not know `on_bad_lines` and rejects it with TypeError.
    data = pd.read_csv('../input/geo1.csv', sep=';', error_bad_lines=False)

# True if at least one cell in the loaded frame is missing.
data.isnull().values.any()

In [None]:
# Discard every row that has a missing value, then renumber the surviving
# rows from 0 so that later code can rely on a contiguous index.
data = data.dropna().reset_index(drop=True)

**Расстояние Левенштейна**

In [None]:
def levenshtein_distance(df):
    """Sum the Levenshtein distances between each pair of consecutive items.

    Args:
        df: Ordered collection of strings. The notebook passes a single
            DataFrame column (a pandas Series).

    Returns:
        Total edit distance accumulated over all adjacent pairs, or 0 when
        there are fewer than two items.
    """
    # Walk the values positionally rather than by label (`df[x]`), so the
    # result does not depend on the Series carrying a default 0..n-1 index.
    values = list(df)
    return sum(distance(previous, following)
               for previous, following in zip(values, values[1:]))

In [None]:
# Report the summed adjacent-row distance for the first ("dirty") and the
# second ("clean") column. The second message previously had a stray closing
# quote after the word "адресов".
print(f'Расстояние Левенштейна для "грязных" адресов: {levenshtein_distance(data.iloc[:, 0])}')
print(f'Расстояние Левенштейна для "чистых" адресов: {levenshtein_distance(data.iloc[:, 1])}')

In [None]:
def word(line, mask):
    """Pick the fragment of ``line`` that a marker ``mask`` points at.

    The last '^' in ``mask`` is compared with the last space in ``line``:
    if the caret lies beyond that space, the final space-separated token of
    ``line`` is returned; otherwise the prefix of ``line`` up to and including
    the caret position is returned. The result is always lower-cased.

    Args:
        line: Text to extract the fragment from.
        mask: Marker string; only the position of its last '^' is used.

    Returns:
        The selected fragment in lower case (empty when ``mask`` has no '^').
    """
    last_caret = mask.rfind('^')
    last_space = line.rfind(' ')
    fragment = line[last_space + 1:] if last_caret > last_space else line[:last_caret + 1]
    return fragment.lower()


# Detected (dirty -> clear) changes are collected as plain records and turned
# into a DataFrame once at the end. Growing a frame with `DataFrame.append`
# inside the loop copies it on every call (quadratic), and the method was
# removed in pandas 2.0.
change_records = []

# Compare each dirty/clear address pair component-wise using difflib.
for dirty_add, clear_add in zip(data.iloc[:, 0], data.iloc[:, 1]):
    # Addresses are split on commas; ', ' is normalised to ',' first.
    dirty_parts = str(dirty_add).replace(', ', ',').strip().split(',')
    clear_parts = str(clear_add).replace(', ', ',').strip().split(',')

    # Keep only the diff lines that describe a change ('+', '-', '?').
    modifications = [line.replace('\n', '').strip()
                     for line in difflib.Differ().compare(dirty_parts, clear_parts)
                     if line.startswith(('+', '-', '?'))]

    # Case 1: only removals -> every removed component maps to an empty string.
    if all(modification.startswith('- ') for modification in modifications):
        for modification in modifications:
            # NOTE(review): strip('- ') also trims '-' / ' ' characters at the
            # end of the component, not just the leading "- " diff prefix.
            change_records.append({'dirty': modification.strip('- '), 'clear': ''})
    else:
        # Leave room for the look-ahead below: up to 3 lines when '?' hint
        # lines are present, otherwise 1.
        if any(modification.startswith('? ') for modification in modifications):
            n_cycles = len(modifications) - 3
        else:
            n_cycles = len(modifications) - 1

        # Position of the last '-' line already paired with a '+' line.
        # None means "nothing paired yet" (replaces the old magic value 99).
        mod_index = None
        for mod_i in range(n_cycles):
            following = modifications[mod_i + 1]
            current = modifications[mod_i]

            # '+' followed by '-': pair the addition with every removal that
            # ends with the same last token.
            if following.startswith('-') and current.startswith('+'):
                current_tail = current.split(' ')[-1]
                for modification in modifications:
                    if modification.endswith(current_tail) and modification.startswith('-'):
                        if current_tail == modification.split(' ')[-1]:
                            # Same trailing token: record only what precedes it.
                            change_records.append({
                                'dirty': modification[2:modification.rfind(' ')].lower(),
                                'clear': current[2:current.rfind(' ')].lower(),
                            })
                        else:
                            change_records.append({
                                'dirty': modification[2:].lower(),
                                'clear': current[2:].lower(),
                            })
                        mod_index = modifications.index(modification)

            # '-' followed by '-' or '+', and not already paired above:
            # treat it as a plain removal.
            if following.startswith(('-', '+')) and current.startswith('-') and mod_i != mod_index:
                change_records.append({'dirty': current.strip('- ').strip('+ '), 'clear': ''})

            # '-' followed by a '?' hint: lines mod_i + 2 and mod_i + 3 are
            # taken as the matching '+' line and its hint; `word` extracts the
            # fragment each hint points at.
            if following.startswith('?') and current.startswith(('-')):
                dirty = current.strip('- ')
                clear = modifications[mod_i + 2].strip('+ ')
                change_records.append({
                    'dirty': word(dirty, following[2:]).lower(),
                    'clear': word(clear, modifications[mod_i + 3][2:]).lower(),
                })

# dataframe for changes detection
changes = pd.DataFrame(change_records, columns=['dirty', 'clear'])

changes.shape

Группируем и считаем количество изменений

In [None]:
# Count the occurrences of every distinct (dirty, clear) pair and order the
# result by that count, ascending.
changes = (
    changes
    .groupby(['dirty', 'clear'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'])
)

In [None]:
# Show the last ten rows of the changes table.
changes.tail(n=10)

Объединяем изменения с одинаковыми ключевыми словами

In [None]:
# Collapse pure removals (clear == '') that mention the same keyword into a
# single aggregated row per keyword, carrying the summed count.
mutations = changes.copy()
keywords = ['область', 'обл.', 'обл', 'край', 'республика', 'респ', 'автономный округ', 'район' , 'р-н', 'городской округ']
for keyword in keywords:
    # NOTE(review): the keyword is interpolated into the regex unescaped, so
    # the '.' in 'обл.' matches any character.
    mentions_keyword = mutations['dirty'].str.contains(r'(?i)(\b){}(\b)'.format(keyword))
    is_removal = mutations['clear'] == ''

    # Plain boolean indexing keeps 'count' as an integer column; the previous
    # `where(...).dropna()` round-trip silently converted it to float. The
    # original also OR-ed `mentions_keyword` with an identical copy of itself.
    to_drop = mutations[mentions_keyword & is_removal]

    keyword_row = pd.DataFrame({'dirty': [keyword],
                                'clear': [''],
                                'count': [to_drop['count'].sum()]})
    # `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
    mutations = pd.concat([mutations.drop(to_drop.index), keyword_row],
                          ignore_index=True)


# Most frequent mutations first.
mutations = mutations.sort_values(by=['count'], ascending=False).reset_index(drop=True)
mutations.head(10)

Применяем изменения и считаем расстояние Левенштейна. Если оно не уменьшилось, отменяем изменение.

In [None]:
# Best (lowest) total distance of the "dirty" column accepted so far.
ld = levenshtein_distance(data.iloc[:, 0])

for mutation in mutations.iloc[:, 0]:
    # All recorded changes whose dirty text mentions this mutation, as a
    # {dirty: clear} mapping. Rows that do not match become NaN and are dropped.
    modifications = changes['dirty'].str.contains(r'(?i)(\b){}(\b)'.format(mutation))
    matched = changes.where(modifications)
    to_change_values = pd.Series(matched['clear'].values,
                                 index=matched['dirty']).dropna().to_dict()

    # Rows of the dirty column that contain any of those dirty fragments.
    mask = r'(?i)(\b)' + r'(\b)|(\b)'.join(to_change_values.keys()) + r'(\b)'
    to_change = data.iloc[:, 0].str.contains(mask)

    # Try the replacement on a copy so it can be discarded if it does not help.
    data_copy = data.copy()
    # NOTE(review): the regex replacement is applied to every column of the
    # selected rows, not only the dirty one. Confirm this is intended.
    data_copy.loc[to_change] = data_copy.loc[to_change].replace(to_change_values, regex=True)
    # Tidy up separators left behind by removed fragments.
    data_copy.iloc[:, 0] = data_copy.iloc[:, 0].str.strip(' ,').str.replace(' ,', '')

    # Compute the candidate distance once; it was previously recomputed for
    # the print, the comparison and the assignment.
    candidate_ld = levenshtein_distance(data_copy.iloc[:, 0])
    print(candidate_ld)

    # Keep the mutation only if it strictly reduced the total distance.
    if candidate_ld < ld:
        data.loc[to_change] = data_copy.loc[to_change]
        ld = candidate_ld

data