In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

%matplotlib inline

In [2]:
# Load the raw facility records; the path is relative to the notebook's working directory.
df = pd.read_csv('./hospital_data.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Region,District,FacilityName,Type,Town,Ownership,Latitude,Longitude
0,Ashanti,Offinso North,A.M.E Zion Clinic,Clinic,Afrancho,CHAG,7.40801,-1.96317
1,Ashanti,Bekwai Municipal,Abenkyiman Clinic,Clinic,Anwiankwanta,Private,6.46312,-1.58592
2,Ashanti,Adansi North,Aboabo Health Centre,Health Centre,Aboabo No 2,Government,6.22393,-1.34982
3,Ashanti,Afigya-Kwabre,Aboabogya Health Centre,Health Centre,Aboabogya,Government,6.84177,-1.61098
4,Ashanti,Kwabre,Aboaso Health Centre,Health Centre,Aboaso,Government,6.84177,-1.61098


In [4]:
# (number of rows, number of columns) of the raw data.
df.shape

(3756, 8)

In [5]:
# Column dtypes and non-null counts; columns with fewer non-null values than rows have missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3756 entries, 0 to 3755
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Region        3756 non-null   object 
 1   District      3756 non-null   object 
 2   FacilityName  3756 non-null   object 
 3   Type          3756 non-null   object 
 4   Town          3603 non-null   object 
 5   Ownership     3756 non-null   object 
 6   Latitude      3732 non-null   float64
 7   Longitude     3732 non-null   float64
dtypes: float64(2), object(6)
memory usage: 234.9+ KB


In [6]:
# Summary statistics of the numeric columns (the coordinates), useful for spotting out-of-range values.
df.describe()

Unnamed: 0,Latitude,Longitude
count,3732.0,3732.0
mean,6.955678,-1.07458
std,1.679332,1.021352
min,4.79618,-3.23588
25%,5.687642,-1.809792
50%,6.468705,-1.058117
75%,7.430555,-0.21587
max,11.14667,1.19144


In [7]:
# Distinct facility types, to spot variant spellings of the same category.
df['Type'].unique()

array(['Clinic', 'Health Centre', 'Maternity Home', 'CHPS', 'Hospital',
       'District Hospital', 'Centre', 'RCH', 'Training Institution',
       'Municipal Health Directorate', 'Teaching Hospital',
       'Regional Hospital', 'CPHS', 'District Health Directorate',
       'Regional Health Directorate', 'Polyclinic', 'Municipal Hospital',
       'Municipal  Health Directorate', 'Metropolitan Hospital',
       'Metropolitan Health Directorate', 'DHD', 'Psychiatric Hospital',
       'Research Institution', 'Others', 'clinic'], dtype=object)

In [8]:
# Frequency of each facility type; rare labels are candidates for typos.
df['Type'].value_counts()

Clinic                             1171
Health Centre                       786
CHPS                                652
Maternity Home                      369
Hospital                            277
RCH                                 152
District Health Directorate          99
District Hospital                    82
Training Institution                 74
Others                               31
Polyclinic                           16
Regional Hospital                     9
Regional Health Directorate           9
Municipal Health Directorate          7
Municipal Hospital                    4
Teaching Hospital                     3
Psychiatric Hospital                  3
clinic                                2
Research Institution                  2
Centre                                2
Metropolitan Health Directorate       2
DHD                                   1
CPHS                                  1
Municipal  Health Directorate         1
Metropolitan Hospital                 1


In [9]:
# Frequency of each ownership label; rare labels are candidates for typos.
df['Ownership'].value_counts()

Government          2210
Private             1179
CHAG                 257
Quasi-Government      91
government             6
Islamic                4
private                2
NGO                    2
Maternity Home         2
Mission                1
Clinic                 1
Muslim                 1
Name: Ownership, dtype: int64

In [10]:
# Distinct ownership labels.
df['Ownership'].unique()

array(['CHAG', 'Private', 'Government', 'Quasi-Government', 'Islamic',
       'Muslim', 'Maternity Home', 'Clinic', 'private', 'NGO',
       'government', 'Mission'], dtype=object)

In [11]:
# Distinct region names.
df['Region'].unique()

array(['Ashanti', 'Brong Ahafo', 'Central', 'Eastern', 'Greater Accra',
       'Northern', 'Upper East', 'Upper West', 'Volta', 'Western'],
      dtype=object)

In [12]:
# Number of facilities for every (Region, District) pair.
region_district_df = (
    df.groupby(['Region', 'District'])
      .size()
      .reset_index(name='Count')
)

In [13]:
# Print the districts of each region so that one district appearing under
# several spellings can be spotted by eye.
for val in region_district_df['Region'].unique():
    in_region = region_district_df['Region'] == val
    districts = region_district_df.loc[in_region, 'District'].unique()
    print(" Region: {}, Number of Districts: {}\n{}\n".format(val, len(districts),districts))

 Region: Ashanti, Number of Districts: 27
['Adansi North' 'Adansi South' 'Afigya-Kwabre' 'Ahafo-Ano North'
 'Ahafo-Ano South' 'Amansie Central' 'Amansie West'
 'Asante-Akim North Municipal' 'Asante-Akim South' 'Atwima-Kwanwoma'
 'Atwima-Mponua' 'Atwima-Nwabiagya' 'Bekwai Municipal' 'Bosome Freho'
 'Bosomtwe' 'Ejisu-Juaben Municipal' 'Ejura Sekyeredumasi'
 'Kumasi Metropolitan' 'Kwabre' 'Mampong Municipal' 'Obuasi Municipal'
 'Offinso Municipal' 'Offinso North' 'Sekyere Afram Plains'
 'Sekyere Central' 'Sekyere East' 'Sekyere South']

 Region: Brong Ahafo, Number of Districts: 25
['Asunafo North Municipal' 'Asunafo South' 'Asutifi' 'Atebubu-Amanten'
 'Berekum Municipal' 'Dormaa East' 'Dormaa Municipal' 'Jaman North'
 'Jaman South' 'Kintampo North Municipal' 'Kintampo South' 'Nkoranza'
 'Nkoranza North' 'Pru' 'Sene' 'Sunyani' 'Sunyani Municipal'
 'Sunyani West' 'Tain' 'Tano North' 'Tano South' 'Techiman'
 'Techiman Municipal' 'Wenchi' 'Wenchi Municipal']

 Region: Central, Number of Dist

In [14]:
# Rows whose Town value is missing
town_is_missing = df['Town'].isna()
missing_towns = df[town_is_missing]
missing_towns

Unnamed: 0,Region,District,FacilityName,Type,Town,Ownership,Latitude,Longitude
596,Ashanti,Kumasi Metropolitan,Amansie Enterprise Clinic,Clinic,,Private,6.415000,-1.375480
626,Ashanti,Kumasi Metropolitan,Restoration Clinic,Clinic,,Private,6.415000,-1.375480
645,Ashanti,Mampong Municipal,Calvary Health Service,Clinic,,Private,7.033970,-1.242940
653,Ashanti,Offinso Municipal,Offinso District Health Directorate,District Health Directorate,,Government,6.563090,-1.402680
969,Brong Ahafo,Dormaa Municipal,Florence Maternity Home,Maternity Home,,Maternity Home,7.277284,-2.876257
...,...,...,...,...,...,...,...,...
3726,Western,Tarkwa Nsuaem Municipal,Church of Pentecost Clinic,Clinic,,CHAG,5.299986,-1.992202
3727,Western,Tarkwa Nsuaem Municipal,Sam Jonna Clinic,Clinic,,Private,5.299986,-1.992202
3728,Western,Tarkwa Nsuaem Municipal,Divine Clinic,Clinic,,Private,5.299986,-1.992202
3730,Western,Wassa-Amenfi West,Kwabeng Clinic,Clinic,,Private,5.653611,-2.165901


In [15]:
# Number of rows without a town (first element of the tuple).
missing_towns.shape

(153, 8)

In [16]:
# Rows where at least one of the two coordinates is missing
coordinate_is_missing = df[['Latitude', 'Longitude']].isna().any(axis=1)
missing_coordinates = df[coordinate_is_missing]
missing_coordinates

Unnamed: 0,Region,District,FacilityName,Type,Town,Ownership,Latitude,Longitude
357,Ashanti,Bosome Freho,Dunkura Health Centre,Health Centre,Dunkura,Government,,
358,Ashanti,Bosome Freho,Yapesa St.Mary Clinic,Clinic,Yapesa,CHAG,,
1651,Eastern,Akyemansa,St Johns Clinic,Clinic,Ofoase,CHAG,,
2437,Northern,Gushegu,Nabuli Health Centre,Health Centre,Nabuli,Government,,
2438,Northern,Gushegu,Damakung Clinic,Clinic,Damankung,Government,,
2439,Northern,Gushegu,Gushegu Hospital,Hospital,Gushiegu,Government,,
3416,Western,Nzema East Municipal,Ewuku CHPS,CHPS,Ewuku,Government,,
3417,Western,Nzema East Municipal,Kutukrom Health Centre,Health Centre,Kutukrom,Government,,
3663,Western,Ellembele,Aiyinase Health Centre,Health Centre,Aiyinase,Government,,
3664,Western,Ellembele,Nana Benie M. Clinic,Clinic,,Private,,


In [17]:
# Number of rows without complete coordinates (first element of the tuple).
missing_coordinates.shape

(24, 8)

In [18]:
# Export the rows with missing values to CSV files so that the client can help retrieve them.
# NOTE: index=False drops the row labels, so the exported rows cannot be matched
# back to df by label; FacilityName/District have to be used instead.
missing_coordinates.to_csv('missing_coordinates.csv', index=False)
missing_towns.to_csv('missing_towns.csv', index=False)

In [19]:
# All facilities of one district, to compare its rows that lack coordinates with the rest.
df.loc[df['District'] == 'Bosome Freho']

Unnamed: 0,Region,District,FacilityName,Type,Town,Ownership,Latitude,Longitude
79,Ashanti,Bosome Freho,Asiwa Health Centre,Health Centre,Asiwa,Government,6.42344,-1.33434
125,Ashanti,Bosome Freho,Detieso Community Clinic,Clinic,Detieso,Government,6.47674,-1.44168
130,Ashanti,Bosome Freho,Duasi Community Clinic,Clinic,Duasi,Government,6.46863,-1.39522
156,Ashanti,Bosome Freho,Gyasikrom Health Centre,Clinic,Gyasikrom,Government,6.40145,-1.47313
357,Ashanti,Bosome Freho,Dunkura Health Centre,Health Centre,Dunkura,Government,,
358,Ashanti,Bosome Freho,Yapesa St.Mary Clinic,Clinic,Yapesa,CHAG,,


## Assessment

### Quality Issues
* Missing Data: Latitude and Longitude (24 rows), Town (153 rows)
* Different names for the same value under a column

Type: 
    * District Health Directorate,DHD
    * Clinic, clinic
    * Health Centre, Centre
    * CHPS, CPHS
    * Municipal  Health Directorate, Municipal Health Directorate
    
Ownership:
    * Government, government
    * Private, private
    * Islamic, Muslim

District (grouped by region):
    Western:
        * 'Prestea-Huni Valley','Pretea-Huni Valley'
        * 'Nzema East','Nzema East Municipal'
    Eastern:
        * 'Birim South','Birim south'
        * 'Kwaebibirem','Kweabibirem'
        * 'New Juaben','New Juaben Municipal'
    Brong Ahafo:
        * 'Sunyani','Sunyani Municipal'
        * 'Techiman','Techiman Municipal'
        * 'Wenchi','Wenchi Municipal'
    Greater Accra:
        * 'Dangme West','Dangme west'
    Upper West:
        * 'Wa','Wa Municipal'
        
        
        
* Typographical Errors
    * FacilityName: 'St. Martin de Porres Hospitakl' (should end in 'Hospital')

#### Missing Coordinates Comments
* Dunkura Health Centre - Used the coordinates of Dunkura town from https://geographic.org/geographic_names/name.php?uni=-2874515&fid=1969&c=ghana
* Yapesa St. Mary Clinic - Got location from Google Maps
* St. Johns Clinic (Ofoase) - Got coordinates from https://www.ghanayello.com/company/53475/Ofoase_St_Johns_Clinic#map. On Google Maps the location is plotted as St. John the Baptist Catholic Church.
* Gushegu Hospital - Location from Google Maps
* Ewuku CHPS - https://geographic.org/geographic_names/name.php?uni=-2875130&fid=1958
* Kutukrom Health Centre - https://geographic.org/geographic_names/name.php?uni=-2878816&fid=1967&c=ghana
* Aiyinase Health Centre - Google Maps
* Nana Benie M. Clinic - Google Maps as Nana Benie Memorial Clinic
* 

## Cleaning

In [20]:
# Create a copy of the original dataframe so that the cleaning steps below
# modify clean_df and leave df untouched for comparison.
clean_df = df.copy()

In [21]:
# Rows of the original df without coordinates (selected before cleaning), shown again for reference.
missing_coordinates

Unnamed: 0,Region,District,FacilityName,Type,Town,Ownership,Latitude,Longitude
357,Ashanti,Bosome Freho,Dunkura Health Centre,Health Centre,Dunkura,Government,,
358,Ashanti,Bosome Freho,Yapesa St.Mary Clinic,Clinic,Yapesa,CHAG,,
1651,Eastern,Akyemansa,St Johns Clinic,Clinic,Ofoase,CHAG,,
2437,Northern,Gushegu,Nabuli Health Centre,Health Centre,Nabuli,Government,,
2438,Northern,Gushegu,Damakung Clinic,Clinic,Damankung,Government,,
2439,Northern,Gushegu,Gushegu Hospital,Hospital,Gushiegu,Government,,
3416,Western,Nzema East Municipal,Ewuku CHPS,CHPS,Ewuku,Government,,
3417,Western,Nzema East Municipal,Kutukrom Health Centre,Health Centre,Kutukrom,Government,,
3663,Western,Ellembele,Aiyinase Health Centre,Health Centre,Aiyinase,Government,,
3664,Western,Ellembele,Nana Benie M. Clinic,Clinic,,Private,,


In [22]:
# Function to convert PascalCase names to snake_case
def pascal_to_snake_case(name):
    """Convert a PascalCase/camelCase identifier to snake_case.

    An underscore is inserted at each word boundary and the result is
    lower-cased. A run of capitals is kept together as one word, so
    ``"FacilityID"`` becomes ``"facility_id"`` rather than ``"facility_i_d"``,
    and underscores already present in ``name`` are not doubled.

    Parameters
    ----------
    name : str
        Identifier to convert, e.g. ``"FacilityName"``.

    Returns
    -------
    str
        The snake_case form, e.g. ``"facility_name"``.
    """
    # Boundary 1: a lower-case letter or digit followed by a capital
    #             ("y|N" in "FacilityName").
    # Boundary 2: a capital followed by a capital that starts a new word
    #             ("P|Re" in "HTTPResponse").
    word_boundary = r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])'
    return re.sub(word_boundary, '_', name).lower()

In [23]:
# make column names snake_case
# The converter is passed directly (a lambda wrapper adds nothing), and the result
# is assigned back rather than mutating in place. errors="raise" was dropped
# because it only applies to dict-like mappers, not to a function.
clean_df = clean_df.rename(columns=pascal_to_snake_case)

In [24]:
# Verify that every column name is now snake_case
clean_df.columns.tolist()

['region',
 'district',
 'facility_name',
 'type',
 'town',
 'ownership',
 'latitude',
 'longitude']

#### Missing Data

In [25]:
# Code
# Replace missing coordinates with manually looked-up values.
# Keys are row labels of clean_df; each value is [latitude, longitude].
# NOTE(review): rows 3669, 3672 and 3678 have no entry in the
# "Missing Coordinates Comments" notes - record where their values came from.
# NOTE(review): the longitude for 3678 (-0.199) differs markedly from the
# neighbouring entries 3663-3672 (about -2.4) - confirm the row label and value.
data_to_replace = {
    357: [6.466667,-1.433333],
    358: [6.4279761,-1.3352532],
    1651:[6.1695534,-1.1417168],
    2439:[9.9291129,-0.2140466],
    3416: [4.861389 ,-2.195556],
    3417: [5.033333, -2.266667],
    3663: [5.04348,-2.46458],
    3664: [5.0494868, -2.4820236],
    3669: [4.9743222, -2.4239446],
    3672: [5.1439822, -2.3725535],
    3678: [5.5485504, -0.1992678]
}

# Assigning through .loc to a label that is not in the index appends a new row
# instead of failing, so make sure every label exists before writing.
unknown_labels = [label for label in data_to_replace if label not in clean_df.index]
if unknown_labels:
    raise KeyError("Row labels not found in clean_df: {}".format(unknown_labels))

# Write both coordinates of a row in a single assignment.
for index, (latitude, longitude) in data_to_replace.items():
    clean_df.loc[index, ['latitude', 'longitude']] = [latitude, longitude]
    

In [26]:
# Test: count the rows that still lack at least one coordinate
still_missing = clean_df[['latitude', 'longitude']].isna().any(axis=1)
clean_df[still_missing].shape

(13, 8)

#### Different names for the same value under a column

In [27]:
# TYPE

# Code
# Map each variant spelling to its canonical facility type.
type_corrections = {
    'DHD': 'District Health Directorate',
    'clinic': 'Clinic',
    'Centre': 'Health Centre',
    'CPHS': 'CHPS',
    'Municipal  Health Directorate': 'Municipal Health Directorate',
}
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: the in-place form mutates an intermediate Series and
# is not guaranteed to update clean_df (it does not under pandas Copy-on-Write).
clean_df['type'] = clean_df['type'].replace(type_corrections)

In [28]:
# Test: the variant spellings should no longer appear among the type labels
clean_df['type'].value_counts()

Clinic                             1173
Health Centre                       788
CHPS                                653
Maternity Home                      369
Hospital                            277
RCH                                 152
District Health Directorate         100
District Hospital                    82
Training Institution                 74
Others                               31
Polyclinic                           16
Regional Health Directorate           9
Regional Hospital                     9
Municipal Health Directorate          8
Municipal Hospital                    4
Psychiatric Hospital                  3
Teaching Hospital                     3
Research Institution                  2
Metropolitan Health Directorate       2
Metropolitan Hospital                 1
Name: type, dtype: int64

In [29]:
# OWNERSHIP

## Code
# Map each variant spelling to its canonical ownership label.
# NOTE(review): 'Maternity Home' and 'Clinic' also occur as ownership values in
# the counts; they look like facility types entered in the wrong column and are
# not handled here - confirm the correct ownership with the client.
ownership_corrections = {
    'government': 'Government',
    'private': 'Private',
    'Muslim': 'Islamic',
}
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: the in-place form mutates an intermediate Series and
# is not guaranteed to update clean_df (it does not under pandas Copy-on-Write).
clean_df['ownership'] = clean_df['ownership'].replace(ownership_corrections)

In [30]:
# Test: the variant spellings should no longer appear among the ownership labels
clean_df['ownership'].value_counts()

Government          2216
Private             1181
CHAG                 257
Quasi-Government      91
Islamic                5
NGO                    2
Maternity Home         2
Mission                1
Clinic                 1
Name: ownership, dtype: int64

In [34]:
# DISTRICT

# Code
# Map each variant district name to its canonical spelling. Only whole values
# are matched, and the mapping is applied to every region.
district_corrections = {
    'Pretea-Huni Valley': 'Prestea-Huni Valley',
    'Nzema East': 'Nzema East Municipal',
    'Birim south': 'Birim South',
    'Kweabibirem': 'Kwaebibirem',
    'New Juaben': 'New Juaben Municipal',
    'Sunyani': 'Sunyani Municipal',
    'Techiman': 'Techiman Municipal',
    'Wenchi': 'Wenchi Municipal',
    'Dangme west': 'Dangme West',
    'Wa': 'Wa Municipal',
}
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: the in-place form mutates an intermediate Series and
# is not guaranteed to update clean_df (it does not under pandas Copy-on-Write).
clean_df['district'] = clean_df['district'].replace(district_corrections)

In [35]:
#Test
# Re-list the districts of the regions that had variant district spellings.
error_region_district_df = (
    clean_df.groupby(['region', 'district'])
            .size()
            .reset_index(name='Count')
)
error_regions = ['Eastern','Western','Brong Ahafo', 'Greater Accra', 'Upper West']

for val in error_regions:
    in_region = error_region_district_df['region'] == val
    districts = error_region_district_df.loc[in_region, 'district'].unique()
    print(" Region: {}, Number of Districts: {}\n{}\n".format(val, len(districts),districts))

 Region: Eastern, Number of Districts: 19
['Akwapim North' 'Akwapim South' 'Akyemansa' 'Asuogyamang' 'Atiwa'
 'Birim North' 'Birim South' 'East Akim' 'Fanteakwa' 'Kwaebibirem'
 'Kwahu North' 'Kwahu South' 'Kwahu West' 'Manya Krobo'
 'New Juaben Municipal' 'Suhum Kraboa Coaltar' 'Upper Manya' 'West Akim'
 'Yilo Krobo']

 Region: Western, Number of Districts: 18
['Ahanta West' 'Aowin Suaman' 'Bia' 'Bibiani-Anhwiaso-Bekwai' 'Ellembele'
 'Jomoro' 'Juabeso' 'Mpohor-Wassa East' 'Nzema East Municipal'
 'Prestea-Huni Valley' 'Sefwi Wiawso' 'Sefwi-Akontombra' 'Shama'
 'Shama Ahanta East' 'Tarkwa Nsuaem Municipal' 'Wassa West'
 'Wassa-Amenfi East' 'Wassa-Amenfi West']

 Region: Brong Ahafo, Number of Districts: 22
['Asunafo North Municipal' 'Asunafo South' 'Asutifi' 'Atebubu-Amanten'
 'Berekum Municipal' 'Dormaa East' 'Dormaa Municipal' 'Jaman North'
 'Jaman South' 'Kintampo North Municipal' 'Kintampo South' 'Nkoranza'
 'Nkoranza North' 'Pru' 'Sene' 'Sunyani Municipal' 'Sunyani West' 'Tain'
 'Ta

In [36]:
# Final check of dtypes and non-null counts after cleaning.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3756 entries, 0 to 3755
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         3756 non-null   object 
 1   district       3756 non-null   object 
 2   facility_name  3756 non-null   object 
 3   type           3756 non-null   object 
 4   town           3603 non-null   object 
 5   ownership      3756 non-null   object 
 6   latitude       3743 non-null   float64
 7   longitude      3743 non-null   float64
dtypes: float64(2), object(6)
memory usage: 234.9+ KB


## Visualization