# Dengue Cleaning

In [184]:
import pandas as pd

# Read the raw dengue extract into `df`.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = "Downloads/Dengue_Global_Data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [186]:
# Preview the first five raw rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,adm_0_name,adm_1_name,adm_2_name,full_name,ISO_A0,FAO_GAUL_code,RNE_iso_code,IBGE_code,calendar_start_date,calendar_end_date,Year,dengue_total,case_definition_standardised,S_res,T_res,UUID
0,AFGHANISTAN,,,AFGHANISTAN,AFG,1011446,AFG,,2021-01-03,2021-01-09,2021,101.0,Suspected,Admin0,Week,WHOEMRO-ALL-2021-Y01-05
1,AFGHANISTAN,,,AFGHANISTAN,AFG,1011446,AFG,,2021-01-10,2021-01-16,2021,151.0,Suspected,Admin0,Week,WHOEMRO-ALL-2021-Y01-05
2,AFGHANISTAN,,,AFGHANISTAN,AFG,1011446,AFG,,2021-01-17,2021-01-23,2021,201.0,Suspected,Admin0,Week,WHOEMRO-ALL-2021-Y01-05
3,AFGHANISTAN,,,AFGHANISTAN,AFG,1011446,AFG,,2021-01-24,2021-01-30,2021,202.0,Suspected,Admin0,Week,WHOEMRO-ALL-2021-Y01-05
4,AFGHANISTAN,,,AFGHANISTAN,AFG,1011446,AFG,,2021-01-31,2021-02-06,2021,100.0,Suspected,Admin0,Week,WHOEMRO-ALL-2021-Y01-05


In [188]:
# Build `dengue_yearly`: one row per (full_name, Year) holding the summed case count.
#
# The raw frame `df` is left untouched so this cell can be re-run on its own; every
# step below derives a new object instead of overwriting its input. The calendar_*
# date columns are not parsed because they are not used by a per-year total.
#
# NOTE(review): every row sharing a (full_name, Year) pair is summed. If the extract
# holds the same country-year at more than one S_res / T_res resolution, those rows
# would be double counted -- confirm against the data source.
dengue_cases = (
    df[['full_name', 'Year', 'dengue_total']]
    # Unparsable counts become NaN instead of raising.
    .assign(dengue_total=lambda frame: pd.to_numeric(frame['dengue_total'], errors='coerce'))
)

dengue_yearly = (
    dengue_cases
    .groupby(['full_name', 'Year'], as_index=False)['dengue_total']
    # min_count=1: a country-year with no valid count at all stays NaN rather than
    # being reported as 0 cases, so the later null check can actually catch it.
    .sum(min_count=1)
    .rename(columns={'dengue_total': 'Annual_dengue_total'})
)

In [190]:
# Normalise the casing of the name column to Title Case.
# NOTE(review): str.title() also capitalises the letter after an apostrophe and short
# words such as "of"/"and" -- confirm the resulting spellings match whatever these
# names are later joined against.
title_cased_names = dengue_yearly['full_name'].str.title()
dengue_yearly['full_name'] = title_cased_names

In [192]:
# Give the key column its final name.
# Only `full_name` is renamed: the aggregated frame has no `ISO_A0` column (it is not
# among the columns kept before aggregation), so an 'ISO_A0' -> 'Country Code' entry
# would be silently ignored by `rename`. To export a country code, carry `ISO_A0`
# through the column selection and the groupby keys first.
dengue_yearly = dengue_yearly.rename(columns={'full_name': 'Country'})

In [194]:
# Preview the first five aggregated rows after renaming.
dengue_yearly.head(n=5)

Unnamed: 0,Country,Year,Annual_dengue_total
0,Afghanistan,2021,4176.0
1,American Samoa,1955,0.0
2,American Samoa,1979,0.0
3,American Samoa,1980,1.0
4,American Samoa,1981,1.0


In [196]:
# Summary statistics of the numeric columns across all years, before any year filtering.
dengue_yearly.describe()

Unnamed: 0,Year,Annual_dengue_total
count,3368.0,3368.0
mean,2000.501781,16867.67
std,14.901339,98589.09
min,1924.0,0.0
25%,1991.0,1.0
50%,2002.0,183.0
75%,2013.0,4297.75
max,2023.0,2253883.0


In [218]:
# Restrict to the modern reporting period: keep 2000 onward, where reporting is more
# reliable, and drop 2023 because its reporting is incomplete.
# The lower bound is expressed as ">= 2000" rather than by excluding a fixed range of
# old years, so years earlier than any hard-coded range start cannot slip through.
FIRST_RELIABLE_YEAR = 2000
INCOMPLETE_YEAR = 2023

is_recent = dengue_yearly['Year'] >= FIRST_RELIABLE_YEAR
is_complete = dengue_yearly['Year'] != INCOMPLETE_YEAR

# .copy() makes df_recent an independent frame, so later edits to it neither warn
# about writing to a slice nor touch dengue_yearly.
df_recent = dengue_yearly[is_recent & is_complete].copy()

## Check summary statistics, null values, data types, and duplicates in the final DataFrame before export

In [221]:
# Summary statistics of the filtered frame; the Year min/max confirm the year filter applied.
df_recent.describe()

Unnamed: 0,Year,Annual_dengue_total
count,1907.0,1907.0
mean,2010.939171,25037.31
std,6.723865,127912.5
min,2000.0,0.0
25%,2005.0,4.0
50%,2011.0,370.0
75%,2017.0,8370.0
max,2022.0,2253883.0


In [223]:
# Missing-value count per column.
df_recent.isna().sum()

Country                0
Year                   0
Annual_dengue_total    0
dtype: int64

In [225]:
# Column dtypes of the frame that will be exported.
df_recent.dtypes

Country                 object
Year                     int64
Annual_dengue_total    float64
dtype: object

In [227]:
# Count rows that are exact repeats of an earlier row (all columns compared).
duplicate_row_count = df_recent.duplicated().sum()
print(duplicate_row_count)

0


In [229]:
# Write the cleaned yearly table to the working directory, without the index column.
df_recent.to_csv(path_or_buf="Yearly_Dengue_Data.csv", index=False)