## HIV data analysis for Kenya

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Load the county-level table and prepare the columns used by the rest of
# the notebook.
filename = "table_2014_adult_hiv_prevalence_rate_by_county.csv"
df = pd.read_csv(filename)

# Columns that are cleaned as percentages: any '%' sign is stripped before
# the values are converted to numbers.
pct_cols = [
    'art_coverage', 'hiv_prevalence_men', 'hiv_prevalence_women',
    'poe_prevention_of_mother_to_child_transmission', 'poe_voluntering_and_testing',
    'poe_tuberculosis', 'poe_medical_ward', 'poe_overral',
]

for col in pct_cols:
    # regex=False makes the literal-character replacement explicit.
    stripped = df[col].astype(str).str.replace('%', '', regex=False)
    # Anything that cannot be parsed (including the text 'nan' produced by
    # astype(str) for missing cells) is coerced to NaN and then set to 0,
    # so no separate 'nan' -> '0' replacement is needed.
    df[col] = pd.to_numeric(stripped, errors='coerce').fillna(0)

# Used below as the marker size, so missing values are replaced with 0.
df['total_population'] = df['total_population'].fillna(0)

# Gap = number in need of ART minus number receiving ART.
df['adult_art_gap'] = df['adults_in_need_of_art'] - df['adults_receiving_art']
df['child_art_gap'] = df['children_in_need_of_art'] - df['children_receiving_art']

# 'coordinates' is parsed as text of the form "(a, b)"; the first captured
# value is stored as lat and the second as lon.
df[['lat', 'lon']] = df['coordinates'].str.extract(r'\((.*), (.*)\)').astype(float)

print("Data is loaded and 'df' is defined.")
print(f"Total counties loaded: {len(df)}")
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(df[['county_name', 'adult_15_hiv_prevalence', 'adult_art_gap']].head())

fig = px.scatter(
    df,
    x="hiv_prevalence_men",
    y="hiv_prevalence_women",
    hover_name="county_name",
    size="total_population",
    title="Gender Prevalence disparity by County",
    labels={'hiv_prevalence_men': 'Men (%)', 'hiv_prevalence_women': 'Women (%)'},
)

# Parity line (y = x): points above it have the higher value for women.
# Its end point follows the data instead of a fixed 25 so the line always
# spans the plotted range.
parity_max = float(df[['hiv_prevalence_men', 'hiv_prevalence_women']].max().max())
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=parity_max,
    y1=parity_max,
    line=dict(dash='dash'),
)
fig.show()








In [None]:
# Make the prevalence column numeric; values that cannot be parsed, and
# missing values, end up as 0.
adult_prevalence = pd.to_numeric(df['adult_15_hiv_prevalence'], errors='coerce')
df['adult_15_hiv_prevalence'] = adult_prevalence.fillna(0)

# Descriptive statistics for the cleaned column.
prevalence_summary = df['adult_15_hiv_prevalence'].describe()
print(prevalence_summary)

# Rank on a two-column view so the results contain only the county name
# and its prevalence value.
ranking_view = df[['county_name', 'adult_15_hiv_prevalence']]
prevalence_hotspots = ranking_view.nlargest(10, 'adult_15_hiv_prevalence')
prevalence_coldspots = ranking_view.nsmallest(10, 'adult_15_hiv_prevalence')

print("High Burden Counties:\n", prevalence_hotspots)
print("Low Burden Counties:\n", prevalence_coldspots)





In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# Histogram of the adult_15_hiv_prevalence column on a 10 x 6 figure.
plt.figure(figsize=(10, 6))
# 15 bins; kde=True overlays a kernel density estimate curve showing the
# shape of the distribution.
sb.histplot(df['adult_15_hiv_prevalence'], bins=15, kde=True, color='blue')
# The count in the title comes from the data rather than a hard-coded
# number, so it always matches the rows actually plotted.
plt.title(f"Frequency Of Hiv Prevalence Rates across {len(df)} counties")
plt.xlabel('prevalence (%)')
plt.ylabel('Number of Counties')
plt.show()

ANALYZING GEOGRAPHIC VARIATIONS

In [None]:
# Parse 'coordinates' text of the form "(a, b)": the first captured value is
# stored as lat and the second as lon. Repeated here so this cell can run
# without relying on columns created elsewhere.
df[['lat', 'lon']] = df['coordinates'].str.extract(r'\((.*), (.*)\)').astype(float)

plt.figure(figsize=(8, 10))
plt.scatter(
    df['lon'], df['lat'],
    # Dot size scales with prevalence.
    s=df['adult_15_hiv_prevalence'] * 20,
    # Dot colour intensity also follows prevalence.
    c=df['adult_15_hiv_prevalence'],
    # alpha controls transparency.
    cmap='YlOrRd', alpha=0.7,
)
plt.colorbar(label='Prevalence %')
plt.title("Geographic distribution of HIV prevalence")
plt.xlabel('Longitude')
# The y axis plots df['lat'], so it is labelled Latitude.
plt.ylabel('Latitude')
plt.show()


## Question: Is HIV prevalence higher in women or men?

In [None]:
# Clean both sex-specific prevalence columns the same way: drop any '%'
# sign, convert to numbers, and set values that cannot be parsed to 0.
for sex_col in ('hiv_prevalence_men', 'hiv_prevalence_women'):
    as_text = df[sex_col].astype(str).str.replace('%', '')
    df[sex_col] = pd.to_numeric(as_text, errors='coerce').fillna(0)

women_rate = df['hiv_prevalence_women']
men_rate = df['hiv_prevalence_men']

# Gender gap: positive values mean the women's figure is the larger one.
df['gender_diff'] = women_rate - men_rate

# Count the rows falling into each of the three possible comparisons.
women_higher = int((women_rate > men_rate).sum())
men_higher = int((men_rate > women_rate).sum())
equal = int((women_rate == men_rate).sum())

print(f"Counties where Women have higher prevalence: {women_higher}")
print(f"Counties where men have higher prevalence: {men_higher}")
print(f"Counties where prevalence of men equals that of women: {equal}")




In [None]:
import plotly.express as px
# Scatter of men's against women's prevalence, one point per row of df.
# NOTE(review): trendline="ols" relies on the statsmodels package being
# installed - confirm it is available in this environment.
fig = px.scatter(
    df,
    x="hiv_prevalence_men",
    y="hiv_prevalence_women",
    hover_name="county_name",
    title="HIV Prevalence: Men vs Women",
    labels={'hiv_prevalence_men': 'Men %', 'hiv_prevalence_women': 'Women %'},
    trendline="ols",
)

# Red dashed parity line (y = x): points above it have the higher value for
# women. Its end point follows the data instead of a fixed 25 so the line
# always spans the plotted range.
parity_max = float(df[['hiv_prevalence_men', 'hiv_prevalence_women']].max().max())
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=parity_max,
    y1=parity_max,
    line=dict(dash='dash', color="red"),
)
fig.show()

In [None]:
# Split each row's hiv_adults figure between men and women in proportion to
# the two prevalence values.
# NOTE(review): this treats the prevalence ratio as the ratio of people,
# which presumably assumes similar male and female population sizes - confirm.
total_prevalence = df['hiv_prevalence_men'].add(df['hiv_prevalence_women'])

estimate_sources = {
    'men_living_with_hiv': 'hiv_prevalence_men',
    'women_living_with_hiv': 'hiv_prevalence_women',
}
for estimate_col, prevalence_col in estimate_sources.items():
    share = df[prevalence_col].div(total_prevalence)
    # Rows where the share is undefined (e.g. both prevalences are 0) give
    # NaN, which is replaced with 0.
    df[estimate_col] = share.mul(df['hiv_adults']).fillna(0)

print("Estimation complete!")



In [None]:
# GROUPED BAR CHART FOR THE ABOVE ANALYSIS

import plotly.graph_objects as pgo
# The 15 rows with the largest hiv_adults values.
top_burden = df.nlargest(15, 'hiv_adults')

# One bar trace per sex; barmode="group" below places them side by side
# for each county.
fig = pgo.Figure(data=[
    pgo.Bar(
        name='Men',
        x=top_burden['county_name'],
        y=top_burden['men_living_with_hiv'],
        marker_color="blue",
    ),
    pgo.Bar(
        name="Women",
        x=top_burden['county_name'],
        y=top_burden['women_living_with_hiv'],
        marker_color="#e377c2",  # pink
    ),
])

fig.update_layout(
    title="Number of Adults Living with HIV: Men vs. Women (Top 15 Counties)",
    xaxis_title="County",
    yaxis_title="Estimated people count",
    barmode="group",
    template="plotly_white",
    legend_title="Gender",
)
fig.show()