## EDA in Pandas

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Read the Kenya health-facilities CSV into a DataFrame and preview it
DATA_PATH = "health-facilities-in-kenya.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Facility Code,Facility Name,Province,County,District,Division,Type,Owner,Location,Sub Location,...,IPD,OPD,OUTREACH,PMTCT,RAD/XRAY,RHTC/RHDC,TB DIAG,TB LABS,TB TREAT,YOUTH
0,19224,CDF Kiriari Dispensary,Eastern,Embu,Manyatta,Manyatta,Dispensary,Ministry of Health,Ruguru,Ruguru,...,,,,,,,,,,
1,19310,St Jude's Huruma Community Health Services,Nairobi,Nairobi,Mathare,Huruma,Medical Clinic,Private Practice - Unspecified,Huruma,Huruma,...,,,,,,,,,,
2,14180,10 Engineer VCT,Rift Valley,Laikipia,Laikipia East,Central,Dispensary,Armed Forces,Nanyuki,Majengo,...,,,,,,,,,,
3,17486,12 Engineers,Central,Kiambu,Thika West,,Dispensary,Ministry of Health,,,...,,,,,,,,,,
4,18393,3Kl Maternity & Nursing Home,Rift Valley,Kajiado,Kajiado North,Ongata Rongai,Nursing Home,Private Practice - Clinical Officer,Gataka,Gataka,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10500,17220,Ziwa SDA,Rift Valley,Uasin Gishu,Eldoret West,Soy,Dispensary,Christian Health Association of Kenya,Ziwa,Sirikwa,...,,,,,,,,,,
10501,15788,Ziwa Sub-District Hospital,Rift Valley,Uasin Gishu,Eldoret West,Soy,Sub-District Hospital,Ministry of Health,Sirikwa,Sirikwa,...,Y,,,,,,,,,
10502,11915,Ziwani Dispensary,Coast,Taita Taveta,Taveta,Challa,Dispensary,Private Enterprise (Institution),Challa,Ziwani,...,Y,,,,,,,,,
10503,16997,Zombe Catholic Dispensary,Eastern,Kitui,Mutitu,zombe/mwitika ward,Dispensary,Kenya Episcopal Conference-Catholic Secretariat,Zombe,Thua,...,,,,,,,,,,


The dataset has 10,505 rows and 50 columns.

In [11]:
# Get info on our data: the index range plus each column's non-null count and dtype.
# Comparing the non-null counts with the total number of entries shows where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10505 entries, 0 to 10504
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Facility Code            10505 non-null  int64  
 1   Facility Name            10505 non-null  object 
 2   Province                 10505 non-null  object 
 3   County                   10505 non-null  object 
 4   District                 10505 non-null  object 
 5   Division                 10238 non-null  object 
 6   Type                     10505 non-null  object 
 7   Owner                    10505 non-null  object 
 8   Location                 10010 non-null  object 
 9   Sub Location             9613 non-null   object 
 10  Description of Location  5304 non-null   object 
 11  Constituency             10505 non-null  object 
 12  Nearest Town             9903 non-null   object 
 13  Beds                     10418 non-null  float64
 14  Cots                  

Some columns seem to have a significant number of null values.

In [None]:
# Drop a fixed list of columns by name.
# `errors="ignore"` keeps the cell re-runnable: without it, running the cell a
# second time (or after these columns were already removed by another cell)
# raises KeyError because the labels no longer exist in the frame.
columns_to_drop = [
    'ANC', 'BEOC', 'BLOOD', 'CAES SEC', 'CEOC', 'EPI', 'GROWM', 'HCT', 'OPD',
    'OUTREACH', 'PMTCT', 'RAD/XRAY', 'RHTC/RHDC', 'TB DIAG', 'TB LABS', 'TB TREAT',
    'YOUTH',
]
df = df.drop(columns=columns_to_drop, errors="ignore")

In [14]:
# Tally the missing (null) entries in every column
missing_per_column = df.isna().sum()
missing_per_column

Facility Code                  0
Facility Name                  0
Province                       0
County                         0
District                       0
Division                     267
Type                           0
Owner                          0
Location                     495
Sub Location                 892
Description of Location     5201
Constituency                   0
Nearest Town                 602
Beds                          87
Cots                          77
Official Landline           9138
Official Fax                9965
Official Mobile             7590
Official Email              8648
Official Address            3458
Official Alternate No       9553
Town                        2667
Post Code                   3825
In Charge                   2140
Job Title of in Charge      3099
Open 24 Hours                447
Open Weekends                457
Operational Status             0
ANC                        10505
ART                         9584
BEOC      

In [20]:
# Drop every column that holds no data at all, i.e. whose values are all null.
# `how="all"` states that condition directly, so the row count is no longer
# hard-coded and the cell stays correct if the number of rows ever changes.
df = df.dropna(axis="columns", how="all")

In [21]:
# Count the distinct values held by each column (missing values are not counted).
# Useful for telling categorical columns (few distinct values) from free-text or ID-like ones.
distinct_per_column = df.nunique()
distinct_per_column

Facility Code              10505
Facility Name              10462
Province                       8
County                        47
District                     299
Division                    1177
Type                          26
Owner                         23
Location                    3487
Sub Location                5104
Description of Location     5149
Constituency                 223
Nearest Town                3802
Beds                         161
Cots                          55
Official Landline            871
Official Fax                 207
Official Mobile             2707
Official Email              1298
Official Address            2772
Official Alternate No        835
Town                         944
Post Code                    538
In Charge                   8099
Job Title of in Charge         5
Open 24 Hours                  2
Open Weekends                  2
Operational Status             4
ART                            1
C-IMCI                         1
FP        

The 8 provinces and 47 counties indicate that the whole country is well represented.

In [22]:
# Rank the facilities by bed count, largest first, and show the ten at the top
facilities_by_beds = df.sort_values(by="Beds", ascending=False)
facilities_by_beds.head(10)

Unnamed: 0,Facility Code,Facility Name,Province,County,District,Division,Type,Owner,Location,Sub Location,...,In Charge,Job Title of in Charge,Open 24 Hours,Open Weekends,Operational Status,ART,C-IMCI,FP,HBC,IPD
2435,13023,Kenyatta National Hospital,Nairobi,Nairobi,Kibra,Kibra,National Referral Hospital,Ministry of Health,Golfcourse,,...,Mrs. Lily Koros,Hospital Director,Y,Y,Operational,Y,Y,Y,Y,Y
1441,13076,Mathari Hospital,Nairobi,Nairobi,Mathare,Kariokor,District Hospital,Ministry of Health,Mathare,,...,Dr. Nelly Kitazi Okatch,Medical Superintendant,Y,Y,Operational,Y,,,Y,
5286,12004,Embu Provincial General Hospital,Eastern,Embu,Manyatta,Central,Provincial General Hospital,Ministry of Health,Municipality,Njukiiri,...,Dr. Gerald Nderitu,Medical Superintendant,Y,Y,Operational,Y,Y,Y,Y,Y
7315,15288,Nakuru Provincial General Hospital (PGH),Rift Valley,Nakuru,Nakuru,Municipality,Provincial General Hospital,Ministry of Health,Nakuru Town,Viwanda,...,Dr. Murima,Medical Superintendant,N,N,Operational,Y,Y,Y,Y,Y
2963,15204,Moi Teaching Refferal Hospital,Rift Valley,Uasin Gishu,Eldoret East,Ainapkoi,National Referral Hospital,Ministry of Health,Chepkoilel,Chepkoilel,...,Dr.Mengich,,N,N,Operational,Y,Y,Y,Y,Y
5511,11289,Coast Province General Hospital,Coast,Mombasa,Mvita,Island,Provincial General Hospital,Ministry of Health,Tononoka,Tononoka,...,Dr. Iqbal Kandwalla,Doctor In Charge,Y,Y,Operational,Y,Y,Y,Y,Y
7954,13939,Nyanza Provincial General Hospital (PGH),Nyanza,Kisumu,Kisumu East,Winam,Provincial General Hospital,Ministry of Health,Kolwa,Manyatta B,...,Dr Juliana Otieno,,Y,Y,Operational,Y,Y,Y,Y,Y
3826,15915,Kakamega Provincial General Hospital (PGH),Western,Kakamega,Kakamega Central (Lurambi),Municipality,Provincial General Hospital,Ministry of Health,Bukhungu,Shirere,...,Dr. Daniel Alushula,Medical Superintendant,Y,Y,Operational,,,,,
2298,13703,Kisii Hospital (Level 5),Nyanza,Kisii,Kisii Central,Municipality,District Hospital,Ministry of Health,Bosongo,Bosongo,...,Dr Enock Ondari,Medical Superintendant,Y,Y,Operational,Y,Y,Y,Y,Y
1579,12438,Machakos Level 5 Hospital,Eastern,Machakos,Machakos,Central,Provincial General Hospital,Ministry of Health,Masaku Township,Eastleigh,...,Dr. Nzuki,Medical Superintendant,Y,Y,Operational,Y,Y,Y,,Y


In [24]:
# Check correlation
# NOTE(review): this cell is disabled. Presumably a bare df.corr() failed because the
# frame still holds object (string) columns - confirm. When re-enabling, restrict the
# computation to the numeric columns:
# df.select_dtypes(include="number").corr()

In [None]:
# heatmap to view the correlations
# NOTE(review): this cell is disabled. Two fixes are folded into the snippet below:
#   * the attribute is `plt.rcParams` (the earlier spelling `rsParams` does not exist);
#   * the figure size must be set before the plot is drawn to affect that figure.
# plt.rcParams['figure.figsize'] = (20,7)
# sns.heatmap(df.select_dtypes(include="number").corr(), annot = True)
# plt.show()

In [None]:
# Group the data by county and average the numeric columns.
# `numeric_only=True` is required because the frame also holds object (string)
# columns, which cannot be averaged: pandas 2.x raises a TypeError on them
# instead of silently skipping them as older releases did.
# NOTE(review): 'Facility Code' is numeric too, so its mean shows up in the result
# even though it looks like an identifier - select specific columns if that is unwanted.
df.groupby('County').mean(numeric_only=True)