## Analysis of the impact of Hepatitis B in Europe - acute and chronic stages


In [51]:
import pandas as pd

In [2]:
acute_age = pd.read_csv('Hepatitis_B_age.csv')
acute_age.head()

Unnamed: 0,HealthTopic,Population,Distribution,Unit,Time,RegionCode,RegionName,CategoryIndex,Category,Value
0,Hepatitis B,Acute cases,Distribution by age,%,2006,DE,Germany,1,0-4,0.09363295
1,Hepatitis B,Acute cases,Distribution by age,%,2006,DE,Germany,2,5-14,1.02996254
2,Hepatitis B,Acute cases,Distribution by age,%,2006,DE,Germany,3,15-19,4.49438202
3,Hepatitis B,Acute cases,Distribution by age,%,2006,DE,Germany,4,20-24,7.58426966
4,Hepatitis B,Acute cases,Distribution by age,%,2006,DE,Germany,5,25-34,23.22097378


In [3]:
acute_gender = pd.read_csv('Hepatitis_B_gender.csv')
acute_gender.head()

Unnamed: 0,HealthTopic,Population,Distribution,Unit,Time,RegionCode,RegionName,CategoryIndex,Category,Value
0,Hepatitis B,Acute cases,Distribution by gender,%,2006,DE,Germany,1,Male,69.0140845
1,Hepatitis B,Acute cases,Distribution by gender,%,2006,DE,Germany,2,Female,30.98591549
2,Hepatitis B,Acute cases,Distribution by gender,%,2006,DK,Denmark,1,Male,62.5
3,Hepatitis B,Acute cases,Distribution by gender,%,2006,DK,Denmark,2,Female,37.5
4,Hepatitis B,Acute cases,Distribution by gender,%,2006,EE,Estonia,1,Male,64.44444444


In [None]:
# rename columns 

acute_gender.rename(columns={'Value': 'Gender_Percentage'}, inplace=True)
acute_age.rename(columns={'Value': 'Age_Percentage'}, inplace=True)


In [None]:
# merge dfs on key columns

combined_df = pd.merge(acute_gender, acute_age, 
                       on=['HealthTopic', 'Population', 'Unit', 'Time', 'RegionCode', 'RegionName'], 
                       suffixes=('_gender', '_age'))


In [None]:
# change variable type from str to int

combined_df['Gender_Percentage'] = pd.to_numeric(combined_df['Gender_Percentage'], errors='coerce').fillna(0).astype(int)
combined_df['Age_Percentage'] = pd.to_numeric(combined_df['Age_Percentage'], errors='coerce').fillna(0).astype(int)


#### !!No data was found on disease percentages for each gender-age combination. The available data shows the distribution either by gender OR by age group. Therefore, we proceed with a combined probability calculation by creating a new column [Combined Percentage] that shows this calculated percentage!! 

In [None]:

combined_df['Combined_Percentage'] = (combined_df['Gender_Percentage'] / 100) * (combined_df['Age_Percentage'] / 100) * 100


In [42]:
print(combined_df[['RegionName', 'Time', 'Category_gender', 'Category_age', 'Combined_Percentage']])

     RegionName  Time Category_gender Category_age  Combined_Percentage
0       Germany  2006            Male          0-4                 0.00
1       Germany  2006            Male         5-14                 0.69
2       Germany  2006            Male        15-19                 2.76
3       Germany  2006            Male        20-24                 4.83
4       Germany  2006            Male        25-34                15.87
...         ...   ...             ...          ...                  ...
9031   Slovakia  2023          Female        25-34                 7.50
9032   Slovakia  2023          Female        35-44                 7.50
9033   Slovakia  2023          Female        45-54                 0.00
9034   Slovakia  2023          Female        55-64                 6.00
9035   Slovakia  2023          Female          65+                 7.50

[9036 rows x 5 columns]


In [None]:
combined_df

#### Let's do the same for the chronic cases

In [9]:
chronic_age = pd.read_csv('Hepatitis_B_cronich_age.csv')
chronic_age.head()

Unnamed: 0,HealthTopic,Population,Distribution,Unit,Time,RegionCode,RegionName,CategoryIndex,Category,Value
0,Hepatitis B,Chronic cases,Age-specific rate,N/100000,2006,DE,Germany,1,0-4,0.0
1,Hepatitis B,Chronic cases,Age-specific rate,N/100000,2006,DE,Germany,2,5-14,0.0
2,Hepatitis B,Chronic cases,Age-specific rate,N/100000,2006,DE,Germany,3,15-19,0.0
3,Hepatitis B,Chronic cases,Age-specific rate,N/100000,2006,DE,Germany,4,20-24,0.0
4,Hepatitis B,Chronic cases,Age-specific rate,N/100000,2006,DE,Germany,5,25-34,0.0


In [10]:
chronic_gender = pd.read_csv('Hepatitis_B_cronich_gender.csv')
chronic_gender.head()

Unnamed: 0,HealthTopic,Population,Distribution,Unit,Time,RegionCode,RegionName,CategoryIndex,Category,Value
0,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0.0
1,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,2,Female,0.0023754
2,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DK,Denmark,1,Male,2.08500412
3,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DK,Denmark,2,Female,4.85115878
4,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,EE,Estonia,1,Male,1.43328077


In [None]:
combined_dfc = pd.merge(chronic_gender, chronic_age, 
                       on=['HealthTopic', 'Population', 'Unit', 'Time', 'RegionCode', 'RegionName'], 
                       suffixes=('_gender', '_age'))
combined_dfc

In [13]:
chronic_gender.rename(columns={'Value': 'Gender_Percentage'}, inplace=True)
chronic_age.rename(columns={'Value': 'Age_Percentage'}, inplace=True)

In [15]:
combined_dfc['Gender_Percentage'] = pd.to_numeric(combined_dfc['Gender_Percentage'], errors='coerce').fillna(0).astype(int)
combined_dfc['Age_Percentage'] = pd.to_numeric(combined_dfc['Age_Percentage'], errors='coerce').fillna(0).astype(int)


In [40]:
combined_dfc['Combined_percentage'] = (combined_dfc['Gender_Percentage'] / 100) * (combined_dfc['Age_Percentage'] / 100) * 100
combined_dfc.head()

Unnamed: 0,HealthTopic,Population,Distribution_gender,Unit,Time,RegionCode,RegionName,CategoryIndex_gender,Category_gender,Gender_Percentage,Distribution_age,CategoryIndex_age,Category_age,Age_Percentage,Combined_percentage
0,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0,Age-specific rate,1,0-4,0,0.0
1,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0,Age-specific rate,2,5-14,0,0.0
2,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0,Age-specific rate,3,15-19,0,0.0
3,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0,Age-specific rate,4,20-24,0,0.0
4,Hepatitis B,Chronic cases,Gender-specific rate,N/100000,2006,DE,Germany,1,Male,0,Age-specific rate,5,25-34,0,0.0


In [43]:
print(combined_dfc[['RegionName', 'Time', 'Category_gender', 'Category_age', 'Combined_percentage']])

     RegionName  Time Category_gender Category_age  Combined_percentage
0       Germany  2006            Male          0-4                 0.00
1       Germany  2006            Male         5-14                 0.00
2       Germany  2006            Male        15-19                 0.00
3       Germany  2006            Male        20-24                 0.00
4       Germany  2006            Male        25-34                 0.00
...         ...   ...             ...          ...                  ...
7735   Slovakia  2023          Female        25-34                 0.12
7736   Slovakia  2023          Female        35-44                 0.36
7737   Slovakia  2023          Female        45-54                 0.44
7738   Slovakia  2023          Female        55-64                 0.28
7739   Slovakia  2023          Female          65+                 0.20

[7740 rows x 5 columns]


Now we create the final df that merge acute and chronic cases, all divided by gender and age

In [23]:
df_chronic = combined_dfc.rename(columns={'Combined_percentage': 'Chronic_percentage'})
df_acute = combined_df.rename(columns={'Combined_Percentage': 'Acute_percentage'})


In [44]:
df_combined = pd.merge(df_chronic, df_acute, 
                       on=[ 'RegionName', 'Time', 'Category_gender', 'Category_age'], 
                       how='outer')

In [45]:
df_hepatitis = df_combined[['RegionName', 'Time', 'Category_gender', 'Category_age', 'Chronic_percentage', 'Acute_percentage']]


In [47]:
df_hepatitis.head()

Unnamed: 0,RegionName,Time,Category_gender,Category_age,Chronic_percentage,Acute_percentage
0,Austria,2008,Female,0-4,,0.0
1,Austria,2008,Female,15-19,,10.92
2,Austria,2008,Female,20-24,,2.08
3,Austria,2008,Female,25-34,,11.96
4,Austria,2008,Female,35-44,,7.28


In [53]:
print(df_hepatitis.isna().sum())

# We found some missing values due to a lack of data measurements

RegionName               0
Time                     0
Category_gender          0
Category_age             0
Chronic_percentage    1764
Acute_percentage       468
dtype: int64
