This notebook standardizes the district names in the following files according to the 'Somalia Districts' Word document in the drive: admissions, FSNAU_riskfactors, ipc, ipc2, locations, prevalence_estimates. <br>
To use the default data paths, keep this notebook in the folder that contains the 'data' folder, and keep the data files inside that 'data' folder; otherwise change the paths in the "Read files" section.

# Import Libraries

In [131]:
import pandas
import numpy

# Define variables and functions

In [132]:
# Standard district names.
# The order is kept as-is: when several names are equally close to an unknown
# spelling, update_districts picks the one that appears first in this list.
districts = [
    'Adan Yabaal', 'Afgooye', 'Afmadow',
    'Baardheere', 'Badhaadhe', 'Baidoa', 'Baydhaba/Bardaale', 'Baki',
    'Balcad', 'Banadir', 'Bandarbeyla', 'Baraawe',
    'Belet Weyne', 'Belet Weyne (Mataban)', 'Belet Xaawo',
    'Berbera', 'Borama', 'Bossaso', 'Bu\'aale', 'Bulo Burto',
    'Burco', 'Burtinle', 'Buuhoodle', 'Buur Hakaba',
    'Cabudwaaq', 'Cadaado', 'Cadale', 'Caluula', 'Caynabo',
    'Ceel Afweyn', 'Ceel Barde', 'Ceel Buur', 'Ceel Dheer', 'Ceel Waaq',
    'Ceerigaabo',
    'Dhuusamarreeb', 'Diinsoor', 'Doolow',
    'Eyl',
    'Gaalkacyo', 'Galdogob', 'Garbahaarey', 'Garoowe', 'Gebiley',
    'Hargeysa', 'Hobyo',
    'Iskushuban',
    'Jalalaqsi', 'Jamaame', 'Jariiban', 'Jilib', 'Jowhar',
    'Kismaayo', 'Kurtunwaarey',
    'Laas Caanood', 'Laasqoray', 'Laasqoray/Badhan', 'Badhan', 'Lughaye',
    'Luuq',
    'Marka',
    'Owdweyne',
    'Qandala', 'Qansax Dheere', 'Qardho', 'Qoryooley',
    'Rab Dhuure',
    'Saakow', 'Sablaale', 'Sheikh',
    'Taleex', 'Tayeeglow',
    'Waajid', 'Wanla Weyn',
    'Xarardheere', 'Xudun', 'Xudur',
    'Zeylac',
]


In [133]:
def levenshteinDistanceDP(token1, token2):
    '''
    This function implements the levenshtein text similarity measure 
    and returns a numeric value representing the distance between two words
    '''
    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1

    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

In [134]:
def update_districts(df, *, verbose=False):
    '''
    Replace every value in df['district'] with its standard district name.

    Each value is resolved in this order:
      1. A name already in the standard `districts` list is kept.
      2. A known non-standard spelling is mapped to its standard name.
      3. Any other text is replaced by the standard name with the smallest
         levenshtein distance (the first such name in `districts` on a tie).
    Values that are not strings (for example missing values) are left
    unchanged, because they cannot be compared character by character.

    Parameters:
        df: dataframe with a 'district' column. It is modified in place.
        verbose: when True, print the old and new name for every distinct
            spelling that was corrected by the levenshtein algo.

    Returns:
        The same dataframe, with the 'district' column replaced.
    '''
    # Known non-standard spellings mapped directly to their standard name
    known_variants = {
        'Mogadishu': 'Banadir',
        'Baydhaba': 'Baidoa',
        'Belethawa': 'Belet Xaawo',
        'Abudwak': 'Cabudwaaq',
        'Adado': 'Cadaado',
    }

    # Each distinct spelling is resolved once and then reused for later rows
    resolved = {}

    new_series = []
    for token1 in df['district']:

        # Missing or non-text values cannot be matched; keep them as they are
        if not isinstance(token1, str):
            new_series.append(token1)
            continue

        if token1 not in resolved:
            if token1 in districts:
                correct_district = token1
            elif token1 in known_variants:
                correct_district = known_variants[token1]
            else:
                # min() returns the first district with the smallest distance
                correct_district = min(
                    districts,
                    key=lambda token2: levenshteinDistanceDP(token1, token2),
                )
                if verbose:
                    print('old: %s' % token1)
                    print('new: %s' % correct_district)
            resolved[token1] = correct_district

        new_series.append(resolved[token1])

    df['district'] = new_series
    return df

# Read files 

In [135]:
# Adjust the 'data/' prefix (or the file names) below if your data lives elsewhere

(
    path_admissions,
    path_FSNAU,
    path_ipc,
    path_ipc2,
    path_locations,
    path_prevalence,
    path_production,
) = (
    f'data/{file_stem}.csv'
    for file_stem in (
        'admissions',
        'FSNAU_riskfactors',
        'ipc',
        'ipc2',
        'locations',
        'prevalence_estimates',
        'production',
    )
)

In [136]:
df_admissions = pandas.read_csv(path_admissions)
df_FSNAU = pandas.read_csv(path_FSNAU)

# In the ipc file the district column is called 'area'; give it the common name
df_ipc = pandas.read_csv(path_ipc).rename(columns={'area': 'district'})
df_ipc2 = pandas.read_csv(path_ipc2)

# Leave out the 'Grand Total' row of the locations file
df_locations = pandas.read_csv(path_locations)
df_locations = df_locations.loc[df_locations['district'] != 'Grand Total']

df_prevalence = pandas.read_csv(path_prevalence)
df_production = pandas.read_csv(path_production)

# Update District Names

In [137]:
# Standardize the district column of every dataframe that is written back below
(
    df_admissions,
    df_FSNAU,
    df_ipc,
    df_ipc2,
    df_locations,
    df_prevalence,
) = [
    update_districts(frame)
    for frame in (
        df_admissions,
        df_FSNAU,
        df_ipc,
        df_ipc2,
        df_locations,
        df_prevalence,
    )
]

# Write new dataframes to CSV

In [138]:
# Each dataframe overwrites the file it was read from. index=False keeps the
# row index out of the file; without it, every run of this notebook would add
# another unnamed index column that the next read picks up as data.
df_admissions.to_csv(path_admissions, index=False)
df_FSNAU.to_csv(path_FSNAU, index=False)
df_ipc.to_csv(path_ipc, index=False)
df_ipc2.to_csv(path_ipc2, index=False)
df_locations.to_csv(path_locations, index=False)
df_prevalence.to_csv(path_prevalence, index=False)