## 📊 Project: Cleaning and Preparing COVID-19 Data from an API

"""
🔍 Objective: Retrieve COVID-19 data, clean and prepare it for analysis.
🛠️ Tools: Python, requests, pandas, datetime
📚 Skills: API consumption, data cleaning, time-series transformation
"""

# Import Libraries


In [86]:
import pandas as pd
import requests
from datetime import datetime

# Step 1: Fetch COVID-19 Historical Data

In [89]:
# Fetch COVID-19 Historical Data
# Request the last 30 days of historical figures from the disease.sh API.
url = "https://disease.sh/v3/covid-19/historical?lastdays=30"

# requests applies no timeout by default, so without one this cell could
# block forever on an unresponsive server.
response = requests.get(url, timeout=30)

# Any status other than 200 is treated as a failed request.
if response.status_code != 200:
    raise Exception(f"API request failed with status code {response.status_code}")

# Decode the JSON body; later cells read it through the name `data`.
data = response.json()


# Step 2: Normalize JSON into a DataFrame

In [91]:
# Flatten the nested payload into one record per (entry, date) pair
# NOTE(review): if the payload holds more than one entry for the same country,
# this yields repeated country/date rows - confirm against the API response.
records = []

for entry in data:
    name = entry.get('country')
    series = entry.get('timeline', {})
    case_counts = series.get('cases', {})
    death_counts = series.get('deaths', {})
    recovered_counts = series.get('recovered', {})

    # The dates present in the cases series drive the rows; deaths and
    # recovered fall back to None when a date is missing from them.
    records.extend(
        {
            'country': name,
            'date': day,
            'cases': total,
            'deaths': death_counts.get(day),
            'recovered': recovered_counts.get(day),
        }
        for day, total in case_counts.items()
    )

df = pd.DataFrame(records)
df.head()

Unnamed: 0,country,date,cases,deaths,recovered
0,Afghanistan,2/8/23,208771,7896,0
1,Afghanistan,2/9/23,208771,7896,0
2,Afghanistan,2/10/23,208943,7896,0
3,Afghanistan,2/11/23,208971,7896,0
4,Afghanistan,2/12/23,208982,7896,0


In [94]:
# Write the frame as built so far to CSV (index column omitted), then confirm
df.to_csv(path_or_buf="raw_covid_data.csv", index=False)
print("Raw data saved successfully!")

Raw data saved successfully!


# Step 3: Clean and Prepare Data

In [97]:
# Parse the month/day/two-digit-year strings into pandas datetimes
parsed_dates = pd.to_datetime(df['date'], format="%m/%d/%y")
df['date'] = parsed_dates

In [99]:
# Report how many missing values each column contains
missing_per_column = df.isna().sum()
print(missing_per_column)

country      0
date         0
cases        0
deaths       0
recovered    0
dtype: int64


In [101]:
# Replace any missing cumulative counts with zero
count_cols = ['cases', 'deaths', 'recovered']
df[count_cols] = df[count_cols].fillna(0)

In [103]:
# Sort by country and date
# Reassigning (rather than inplace=True) keeps each step an explicit new
# frame and matches the reassignment style used by the filter below.
df = df.sort_values(by=['country', 'date'])
# Drop rows where cases, deaths and recovered are all zero. This is a
# row-level filter: a country is only removed entirely if every one of its
# rows is zero.
# .copy() makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = df[(df['cases'] > 0) | (df['deaths'] > 0) | (df['recovered'] > 0)].copy()

# Step 4: Add Daily Change Columns

In [106]:
# Derive per-row changes from the cumulative totals, computed within each country
# diff() has no predecessor for the first row of a group and yields NaN there;
# those gaps are filled with 0.
daily_delta = df.groupby('country')[['cases', 'deaths', 'recovered']].diff()
df[['new_cases', 'new_deaths', 'new_recovered']] = daily_delta.fillna(0)
df.head()

Unnamed: 0,country,date,cases,deaths,recovered,new_cases,new_deaths,new_recovered
0,Afghanistan,2023-02-08,208771,7896,0,0.0,0.0,0.0
1,Afghanistan,2023-02-09,208771,7896,0,0.0,0.0,0.0
2,Afghanistan,2023-02-10,208943,7896,0,172.0,0.0,0.0
3,Afghanistan,2023-02-11,208971,7896,0,28.0,0.0,0.0
4,Afghanistan,2023-02-12,208982,7896,0,11.0,0.0,0.0


# Step 5: Clean Column Names and Export

In [109]:
# Normalize every column label to lowercase, then inspect the column dtypes
df.columns = list(map(str.lower, df.columns))
df.dtypes

country                  object
date             datetime64[ns]
cases                     int64
deaths                    int64
recovered                 int64
new_cases               float64
new_deaths              float64
new_recovered           float64
dtype: object

#  Export the Cleaned Data

In [112]:
# Write the cleaned frame to CSV, leaving out the index column
df.to_csv(path_or_buf="cleaned_covid_data.csv", index=False)