In [1]:
import numpy as np
import pandas as pd

In [2]:
# read census data and split them into district/town data

# source: http://www.nbs.go.tz/nbstz/index.php/english/statistics-by-subject/population-and-housing-census/844-tanzania-total-population-by-district-regions-2016
# modifications -   moved districts to seperate tab, 
#                   cut off any decimals, 
#                   string modifications for easy comparison to training data

xlsx = pd.read_excel('./data/Tanzania_Total_Population_by_District-Regions-2016_2017.xlsx', sheet_name=['lga', 'district'])

# extract the lga-tab data from the dictionary and change all items in the name column to lower case 
# to protect from random capital letters during comparison
df_town, df_district = xlsx['lga'], xlsx['district']
for item in [df_town, df_district]:
    item['name'] = item.name.apply(lambda x: x.lower()).map(lambda x: x.strip())
    #item['name'] = item['name']

df_town.rename(columns={'name': 'lga'}, inplace=True)
df_district.rename(columns={'name': 'region'}, inplace=True)
df_town.head()
#print(df_district.loc[df_district['name'] == 'iringa'])

Unnamed: 0,lga,male 2012,female 2012,total 2012,male 2016,female 2016,total 2016,male 2017,female 2017,total 2017
0,kondoa,107341.0,103341.0,210682,116661.0,112314.0,228975.0,119115.0,114676.0,233792.0
1,mpwapwa,147306.0,157750.0,305056,160096.0,171447.0,331544.0,163464.0,175053.0,338518.0
2,kongwa,149221.0,160752.0,309973,162178.0,174710.0,336888.0,165589.0,178385.0,343974.0
3,dodoma,199487.0,211469.0,410956,216808.0,229831.0,446639.0,221369.0,234665.0,456034.0
4,bahi,105975.0,115670.0,221645,115176.0,125713.0,240890.0,117599.0,128358.0,245957.0


In [3]:
fields = ['id','gps_height','region', 'lga', 'latitude', 'longitude', 'construction_year']
training_data = pd.read_csv('training_data.csv', skipinitialspace=True, usecols=fields, index_col=0)
training_data[['region','lga']] = training_data[['region','lga']].apply(lambda x: x.str.lower()).apply(lambda x: x.str.replace(' urban','').str.replace(' rural', ''))
#training_data[['region','lga']] = training_data[['region','lga']].apply(lambda x: x.str.lower()).apply(lambda x: x.replace([' urban', ' rural'],['', ''])) #.str.replace(' rural', ''))
training_data.head()

Unnamed: 0_level_0,gps_height,longitude,latitude,region,lga,construction_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
69572,1390,34.938093,-9.856322,iringa,ludewa,1999
8776,1399,34.698766,-2.147466,mara,serengeti,2010
34310,686,37.460664,-3.821329,manyara,simanjiro,2009
67743,263,38.486161,-11.155298,mtwara,nanyumbu,1986
19728,0,31.130847,-1.825359,kagera,karagwe,0


In [4]:
# there are a lot of 0 values in ['longitude'] - we will replace these together with the corresponding latitudes
# via the mean of all wells in the corresponding lga 
# (the smallest area provided by the data set - the most accurate)

#calculate mean lat/lon for each lga
def calculate_mean(df, name_col, num_col):
    mean_dict = {}
    df[num_col].replace(0, np.NaN, inplace = True)
    for item in df[name_col].unique().tolist():
        mean_dict[item] = np.NaN
        mean_dict[item] = df.loc[df[name_col] == item,num_col].mean()
    return mean_dict


longitude = calculate_mean(training_data, 'lga', 'longitude')
latitude = calculate_mean(training_data, 'lga', 'latitude')
gps = calculate_mean(training_data, 'lga', 'gps_height')
constr_year = calculate_mean(training_data, 'lga', 'construction_year')

df = pd.DataFrame([latitude, longitude, gps, constr_year]).T
df.columns = ['latitude', 'longitude', 'gps_height', 'construction_year']
for item in ['gps_height', 'construction_year']:
    df[item] = df.loc[:, item].fillna(df[item].mean()) # inaccurate!
    
# for all those regions where no geodata was collected, we replace them with the mean values of all regions
# we also save the data to csv in case we need to share them
conditions = (df['latitude'].isnull()) | (df['longitude'].isnull())
df.loc[conditions, ['latitude', 'longitude']] = [df['latitude'].mean(), df['longitude'].mean()]
df.to_csv('./data/mean_lon_lat_height.csv', na_rep='NaN')

In [4]:
# this is also done for the district in case all wells in an lga dont have lat/lon values
distr_longitude = calculate_mean(training_data, 'region', 'longitude')
distr_latitude = calculate_mean(training_data, 'region', 'latitude')
distr_gps = calculate_mean(training_data, 'region', 'gps_height')
constr_year = calculate_mean(training_data, 'lga', 'construction_year')

# create a df from the district values and save it to csv
df = pd.DataFrame([distr_latitude, distr_longitude, distr_gps]).T
df.columns = ['d_latitude', 'd_longitude', 'd_gps_height']
df['d_gps_height'] = df.loc[:, 'd_gps_height'].fillna(df['d_gps_height'].mean()) # inaccurate!
df.to_csv('./data/mean_lon_lat_height_district.csv', na_rep='NaN')

In [5]:
# now we test if any value in the current df is null in order to create a function
# to replace it -> no null values :) 
# df1 = df[df.isnull().any(axis=1)]
# df1

In [5]:
df['gps_height'].mean()

901.959465198053

In [6]:
# now we replace the null values in gps and construction year with their respective column means
# this might be inaccurate since tanzania probably contains a wide range of gps heights 
# -> might be better to replace with district/lga mean/median gps height - same for construction year
# construction year could be transformed to "years/time of operation" 

def replace_null_lga(df, mean_df):
    'replaces the null values with the mean of the respective lga'
    for lga in mean_df.index:
        for item in ['longitude', 'latitude']:
            df.loc[(df['lga'] == lga) & (df[item].isnull()), item] = mean_df.loc[lga, item]
    return df

    
training_data_cleaned = replace_null_lga(training_data, df)

for item in ['gps_height', 'construction_year']:
    condition = training_data_cleaned[item].isnull()
    training_data_cleaned.loc[condition, item] = round(df[item].mean(),0)
    training_data_cleaned[item] = training_data_cleaned[item].astype('int')

#print(training_data_cleaned[training_data_cleaned.isnull().any(axis=1)])

In [7]:
# here we create a dictionary in order to add the region / town specific population to the df
# the population likely to use a well should have enourmous impact on their functionality -> lots of people, lots of damage

def swaptionary(df, name, year):
    swap_dict = dict(zip(df[name], df[year]))
    return swap_dict

training_data_cleaned['region_pop'] = training_data_cleaned['region'].map(swaptionary(df_district, 'region', 'total 2012'))
training_data_cleaned['lga_pop'] = training_data_cleaned['lga'].map(swaptionary(df_town, 'lga', 'total 2012'))
#print(swaptionary(df_district, 'region', 'total 2012'))

In [8]:
# now we change the latitude/longitude column into a shapely point in order to calculate 
# the distance to other wells properly

def create_points_from_lon_lat(lat, lon):
    from shapely.geometry import Point
    geometry = [Point(xy) for xy in zip(lat, lon)]
    return geometry

training_data_cleaned['geometry'] = create_points_from_lon_lat(training_data_cleaned['latitude'], training_data_cleaned['longitude'])
#training_data_cleaned = training_data_cleaned.drop(['latitude', 'longitude'], axis='columns')
training_data_cleaned.head()

Unnamed: 0_level_0,gps_height,longitude,latitude,region,lga,construction_year,region_pop,lga_pop,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
69572,1390,34.938093,-9.856322,iringa,ludewa,1999,941238,133218,POINT (-9.856321769999999 34.93809275)
8776,1399,34.698766,-2.147466,mara,serengeti,2010,1743830,249420,POINT (-2.14746569 34.6987661)
34310,686,37.460664,-3.821329,manyara,simanjiro,2009,1425131,178693,POINT (-3.82132853 37.46066446)
67743,263,38.486161,-11.155298,mtwara,nanyumbu,1986,1270854,150857,POINT (-11.15529772 38.48616088)
19728,902,31.130847,-1.825359,kagera,karagwe,1997,2458023,332020,POINT (-1.82535885 31.13084671)


In [None]:
# to be continued

In [9]:
# LDA of numerical values! 
# year since creation -> fill missing values with mean 
