In [1]:
import pandas as pd
import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

Hypothesis 1: Relationship between Facility Type and Bed Availability

Null Hypothesis (H0): There is no significant difference in the number of beds available across different types of facilities.
Alternative Hypothesis (H1): There is a significant difference in the number of beds available across different types of facilities.

In [2]:
# Read the facility table from disk
data = pd.read_csv('Analysis.csv')

# Report how many rows have no bed count, then keep only rows that do
missing_beds = data['Beds'].isnull().sum()
print("Missing values in 'Beds' column:", missing_beds)
data = data.dropna(subset=['Beds'])

# Bed counts split by facility type; show size, mean and spread per type
grouped_data = data.groupby('Type')['Beds']
for type_name, beds in grouped_data:
    print(
        f"Group '{type_name}':",
        f"Number of observations: {beds.count()}",
        f"Mean: {beds.mean()}",
        f"Standard Deviation: {beds.std()}\n",
        sep="\n",
    )

# Only facility types with at least two observations take part in the ANOVA
valid_groups = [beds for _, beds in grouped_data if beds.count() >= 2]

if len(valid_groups) >= 2:
    # One-way ANOVA across the retained facility types
    anova_result = stats.f_oneway(*valid_groups)

    # Test statistic and p-value
    print(f"F-statistic: {anova_result.statistic}")
    print(f"P-value: {anova_result.pvalue}")

    # Decision at the 5% significance level
    alpha = 0.05
    verdict = (
        "Reject the null hypothesis. There is a significant difference in the number of beds among different types of facilities."
        if anova_result.pvalue < alpha
        else "Fail to reject the null hypothesis. There is no significant difference in the number of beds among different types of facilities."
    )
    print(verdict)
else:
    print("Not enough valid groups for ANOVA. Ensure each group has at least two observations.")



Missing values in 'Beds' column: 0
Group 'Dental Clinic':
Number of observations: 7
Mean: 0.0
Standard Deviation: 0.0

Group 'Dispensary':
Number of observations: 100
Mean: 0.45
Standard Deviation: 2.571699119966479

Group 'District Health Office':
Number of observations: 1
Mean: 0.0
Standard Deviation: nan

Group 'District Hospital':
Number of observations: 1
Mean: 112.0
Standard Deviation: nan

Group 'Eye Centre':
Number of observations: 1
Mean: 0.0
Standard Deviation: nan

Group 'Eye Clinic':
Number of observations: 1
Mean: 0.0
Standard Deviation: nan

Group 'Health Centre':
Number of observations: 48
Mean: 11.020833333333334
Standard Deviation: 9.969078967537216

Group 'Health Programme':
Number of observations: 5
Mean: 0.0
Standard Deviation: 0.0

Group 'Health Project':
Number of observations: 1
Mean: 0.0
Standard Deviation: nan

Group 'Laboratory (Stand-alone)':
Number of observations: 4
Mean: 0.0
Standard Deviation: 0.0

Group 'Maternity Home':
Number of observations: 10
Mean: 

Hypothesis 2: Ownership and Operational Status

Null Hypothesis (H0): The ownership type of a facility does not affect its operational status.
Alternative Hypothesis (H1): The ownership type of a facility affects its operational status.

In [3]:
# Load the data
data = pd.read_csv('Analysis.csv')

# Check the first few rows to understand the structure of the data
print(data.head())

# Check for missing values in 'Owner' and 'Status' columns
print("Missing values in 'Owner' column:", data['Owner'].isnull().sum())
print("Missing values in 'Status' column:", data['Status'].isnull().sum())

# Drop rows with missing values in 'Owner' or 'Status' columns
data = data.dropna(subset=['Owner', 'Status'])

# Create a contingency table (rows: Owner, columns: Status)
contingency_table = pd.crosstab(data['Owner'], data['Status'])

# Print the contingency table to verify its correctness
print("Contingency Table:")
print(contingency_table)

# Perform the chi-square test of independence
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Print the results
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:")
print(expected)

# Validity check: the chi-square approximation is commonly considered
# reliable only when every expected cell count is at least 5. Flag a sparse
# table so the p-value below is not over-interpreted.
low_expected_cells = int((expected < 5).sum())
if low_expected_cells > 0:
    print(
        f"Warning: {low_expected_cells} of {expected.size} expected frequencies are below 5; "
        "the chi-square approximation may be unreliable for this table."
    )

# Interpretation of results at the 5% significance level
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis. There is a significant association between Owner and Status.")
else:
    print("Fail to reject the null hypothesis. There is no significant association between Owner and Status.")


   Facility_Code                               Facility_Name            Type  \
0          19310  St Jude's Huruma Community Health Services  Medical Clinic   
1          13043                       7Kr Mrs Health Centre   Health Centre   
2          20346                     AAR Adams Health Centre  Medical Clinic   
3          16796         AAR Clinic Sarit Centre (Westlands)  Medical Clinic   
4          18859              AAR Healthcare Limited (Karen)  Medical Clinic   

                                     Owner Constituency sub_county  Beds  \
0           Private Practice - Unspecified      MATHARE    Mathare     0   
1                             Armed Forces     LANG'ATA    Langata    14   
2  Private Practice - General Practitioner        KIBRA      Kibra     0   
3         Private Enterprise (Institution)    WESTLANDS  Westlands     0   
4         Private Enterprise (Institution)     LANG'ATA    Langata     0   

   Cots All_day Weekends       Status  sub_county ppn  
0     

Hypothesis 3: Constituency and Facility Availability

Null Hypothesis (H0): The number of facilities available is evenly distributed across different constituencies.
Alternative Hypothesis (H1): The number of facilities available is not evenly distributed across different constituencies.

In [4]:
# Load the data
data = pd.read_csv('Analysis.csv')

# Create a frequency table for the number of facilities in each constituency.
# value_counts() leaves out rows whose 'Constituency' is missing.
facility_counts = data['Constituency'].value_counts()

# Calculate the expected frequencies assuming an even distribution.
# The total is taken from the counted facilities rather than len(data), so the
# observed and expected frequencies always sum to the same value even when
# some rows have no constituency (chisquare requires the two sums to agree).
num_constituencies = len(facility_counts)
total_facilities = int(facility_counts.sum())
expected_counts = [total_facilities / num_constituencies] * num_constituencies

# Perform the chi-square goodness-of-fit test
chi2, p = stats.chisquare(facility_counts, f_exp=expected_counts)

# Print the results
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")

# Interpretation of results at the 5% significance level
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis. The distribution of facilities across constituencies is not even.")
else:
    print("Fail to reject the null hypothesis. The distribution of facilities across constituencies is even.")


Chi-square statistic: 187.53435114503816
P-value: 2.5861862730134875e-31
Reject the null hypothesis. The distribution of facilities across constituencies is not even.


Hypothesis 4: Weekend Availability and Facility Type

Null Hypothesis (H0): The type of facility does not affect its availability on weekends.
Alternative Hypothesis (H1): The type of facility affects its availability on weekends.

In [5]:
# Load the data
data = pd.read_csv('Analysis.csv')

# Create a contingency table (rows: Type, columns: Weekends)
contingency_table = pd.crosstab(data['Type'], data['Weekends'])

# Print the contingency table to verify its correctness
print("Contingency Table:")
print(contingency_table)

# Perform the chi-square test of independence
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Print the results
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:")
print(expected)

# Validity check: the chi-square approximation is commonly considered
# reliable only when every expected cell count is at least 5. Facility types
# with very few facilities produce tiny expected counts, so flag that case
# instead of reporting the p-value without qualification.
low_expected_cells = int((expected < 5).sum())
if low_expected_cells > 0:
    print(
        f"Warning: {low_expected_cells} of {expected.size} expected frequencies are below 5; "
        "the chi-square approximation may be unreliable for this table."
    )

# Interpretation of results at the 5% significance level
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis. There is a significant association between Type and Weekend_Availability.")
else:
    print("Fail to reject the null hypothesis. There is no significant association between Type and Weekend_Availability.")


Contingency Table:
Weekends                                       N    Y
Type                                                 
Dental Clinic                                  5    1
Dispensary                                    62   38
District Health Office                         1    0
District Hospital                              0    1
Eye Centre                                     1    0
Eye Clinic                                     1    0
Health Centre                                 18   30
Health Programme                               2    2
Health Project                                 1    0
Laboratory (Stand-alone)                       1    3
Maternity Home                                 0   10
Medical Centre                                 0    4
Medical Clinic                                67  208
National Referral Hospital                     0    1
Nursing Home                                   2   12
Other Hospital                                 3   18
Radiology

Hypothesis 5: Sub-county Population and Facility Distribution

Null Hypothesis (H0): There is no correlation between the population of a sub-county and the number of facilities in that sub-county.
Alternative Hypothesis (H1): There is a correlation between the population of a sub-county and the number of facilities in that sub-county.

In [6]:
# Read the facility table from disk
df = pd.read_csv('Analysis.csv')

# Count facilities per sub-county as a two-column frame
facility_count_by_sub_county = (
    df['sub_county']
    .value_counts()
    .rename_axis('sub_county')
    .reset_index(name='facility_count')
)

# One row per distinct (sub-county, population) pair
population_columns = ['sub_county', 'sub_county ppn']
population_data = df[population_columns].drop_duplicates()

# Attach the population figure to each sub-county's facility count
merged_data = facility_count_by_sub_county.merge(population_data, on='sub_county')

# Pearson correlation between facility count and population
correlation_coefficient, p_value = pearsonr(
    merged_data['facility_count'],
    merged_data['sub_county ppn'],
)

# Report the coefficient and its p-value
print(f'Correlation coefficient: {correlation_coefficient}')
print(f'p-value: {p_value}')

# Decision at the 5% significance level
alpha = 0.05
verdict = (
    "Reject the null hypothesis (H0): There is a correlation between the population of a sub-county and the number of facilities in that sub-county."
    if p_value < alpha
    else "Fail to reject the null hypothesis (H0): There is no correlation between the population of a sub-county and the number of facilities in that sub-county."
)
print(verdict)


Correlation coefficient: 0.6546719087806668
p-value: 0.02087762576515242
Reject the null hypothesis (H0): There is a correlation between the population of a sub-county and the number of facilities in that sub-county.


Hypothesis 6: Facility Type and Operational Status

Null Hypothesis (H0): The type of facility does not affect its operational status.
Alternative Hypothesis (H1): The type of facility affects its operational status.

In [7]:
# Load the data
df = pd.read_csv('Analysis.csv')

# Create a contingency table (rows: Type, columns: Status)
contingency_table = pd.crosstab(df['Type'], df['Status'])

# Perform the chi-square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print the results
print(f'Chi-square statistic: {chi2}')
print(f'p-value: {p}')
print(f'Degrees of freedom: {dof}')
print('Expected frequencies:')
print(expected)

# Validity check: the chi-square approximation is commonly considered
# reliable only when every expected cell count is at least 5. With many
# facility types and rare statuses most cells fall far below that, so warn
# the reader before the p-value is interpreted.
low_expected_cells = int((expected < 5).sum())
if low_expected_cells > 0:
    print(
        f'Warning: {low_expected_cells} of {expected.size} expected frequencies are below 5; '
        'the chi-square approximation may be unreliable for this table.'
    )

# Interpret the results at the 5% significance level
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis (H0): The type of facility affects its operational status.")
else:
    print("Fail to reject the null hypothesis (H0): The type of facility does not affect its operational status.")


Chi-square statistic: 11.487924544450884
p-value: 0.9999902434456894
Degrees of freedom: 38
Expected frequencies:
[[1.60305344e-01 6.79961832e+00 4.00763359e-02]
 [2.29007634e+00 9.71374046e+01 5.72519084e-01]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [1.09923664e+00 4.66259542e+01 2.74809160e-01]
 [1.14503817e-01 4.85687023e+00 2.86259542e-02]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [9.16030534e-02 3.88549618e+00 2.29007634e-02]
 [2.29007634e-01 9.71374046e+00 5.72519084e-02]
 [9.16030534e-02 3.88549618e+00 2.29007634e-02]
 [6.36641221e+00 2.70041985e+02 1.59160305e+00]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [3.20610687e-01 1.35992366e+01 8.01526718e-02]
 [4.80916031e-01 2.03988550e+01 1.20229008e-01]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [2.29007634e-02 9.71374046e-01 5.72519084e-03]
 [2.29007634e-02 9.713

Hypothesis 7: Facility Ownership and Cot Availability

Null Hypothesis (H0): The ownership type of a facility does not affect the number of cots available.
Alternative Hypothesis (H1): The ownership type of a facility affects the number of cots available.

In [8]:
# Read the facility table from disk
df = pd.read_csv('Analysis.csv')

# Keep only rows where both 'Owner' and 'Cots' are present
df = df.dropna(subset=['Owner', 'Cots'])

# One list of cot counts per ownership type
groups = df.groupby('Owner')['Cots'].apply(list)

# One-way ANOVA across the ownership groups
f_statistic, p_value = f_oneway(*groups.tolist())

# Report the test statistic and its p-value
print(f'F-statistic: {f_statistic}')
print(f'p-value: {p_value}')

# Decision at the 5% significance level
alpha = 0.05
verdict = (
    "Reject the null hypothesis (H0): The ownership type of a facility affects the number of cots available."
    if p_value < alpha
    else "Fail to reject the null hypothesis (H0): The ownership type of a facility does not affect the number of cots available."
)
print(verdict)


F-statistic: 0.6869439504060896
p-value: 0.8407009443696759
Fail to reject the null hypothesis (H0): The ownership type of a facility does not affect the number of cots available.
