In [1]:
import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
# Base directory that process_ts() and process_sentiment() read their input
# CSVs from and write their output CSVs to.
folder_path = "/content/gdrive/My Drive"

Mounted at /content/gdrive


# Data Preprocessing

In [None]:
# covid time series preprocessing
def process_ts():
    """Build the merged US COVID-19 time series and save it as a CSV.

    Reads ``full_data.csv``, ``vaccinations.csv``,
    ``covid-testing-all-observations.csv`` and ``covid-hospitalizations.csv``
    from the module-level ``folder_path``, keeps only the United States rows,
    inner-joins the four tables on ``date``, replaces positive
    ``new_cases`` / ``new_deaths`` values (treated as weekly totals) with
    per-day averages, and writes the result to ``covid_us_timeseries.csv`` in
    the same folder.

    Returns:
        None. The result is written to disk and a confirmation is printed.
    """
    days_per_week = 7
    weekly_columns = ['new_cases', 'new_deaths']

    # read data
    case_death_df = pd.read_csv(folder_path + "/full_data.csv")
    vaccination_df = pd.read_csv(folder_path + "/vaccinations.csv")
    testing_df = pd.read_csv(folder_path + "/covid-testing-all-observations.csv")
    hospitalization_df = pd.read_csv(folder_path + "/covid-hospitalizations.csv")

    # only keep US data
    case_death_df = case_death_df[case_death_df["location"] == "United States"]
    vaccination_df = vaccination_df[vaccination_df["location"] == "United States"]
    testing_df = testing_df[testing_df["Entity"] == "United States - tests performed"]
    hospitalization_df = hospitalization_df[hospitalization_df["entity"] == "United States"]

    # data cleaning
    case_death_df = case_death_df.drop(columns=["location"])
    vaccination_df = vaccination_df.drop(columns=["location", "iso_code"])
    # Non-inplace rename: testing_df is a filtered slice at this point, and an
    # inplace rename on a slice can emit SettingWithCopyWarning.
    testing_df = (
        testing_df.rename(columns={"Date": "date"})
          .drop(columns=["Entity", "ISO code", "Source URL", "Source label", "Notes"])
    )

    # Long -> wide: one row per date, one column per indicator
    # (pivot_table's default aggregation averages duplicate date/indicator pairs).
    hospitalization_df = (
        hospitalization_df.pivot_table(index="date", columns="indicator", values="value")
          .reset_index()
          .sort_values("date")
    )

    # join dataframes (inner joins: only dates present in all four sources survive)
    merged = pd.merge(case_death_df, vaccination_df, on="date", how="inner")
    merged = pd.merge(merged, testing_df, on="date", how="inner")
    timeseries_df = pd.merge(merged, hospitalization_df, on="date", how="inner")

    # Average out new cases and new deaths data (previously collected weekly).
    # Each positive value is spread evenly over the 7 rows ending at its own row.
    # The label arithmetic below relies on the fresh 0..n-1 index that pd.merge
    # produces. The loop reads from a snapshot so that values it has already
    # averaged are never mistaken for new weekly totals.
    reported = timeseries_df[weekly_columns].copy()
    for index, row in tqdm(reported.iterrows(), total=len(reported)):
        if index < days_per_week - 1:
            # Not enough preceding rows for a full window.
            continue
        for column in weekly_columns:
            if row[column] > 0:
                # .loc label slices are inclusive on both ends: rows index-6 .. index.
                window_start = index - days_per_week + 1
                timeseries_df.loc[window_start:index, column] = row[column] / days_per_week

    timeseries_df = timeseries_df.drop(columns=['total_boosters_per_hundred'])

    # Row 0 never gets its own window in the loop above, so it is scaled separately.
    # NOTE(review): this presumes row 0 holds a weekly total and that no window
    # from the loop has already overwritten it -- confirm against the data.
    for column in weekly_columns:
        timeseries_df.loc[0, column] = timeseries_df.loc[0, column] / days_per_week

    # Save the processed time series
    timeseries_df.to_csv(folder_path + "/covid_us_timeseries.csv", index=False)

    print(f"Processed covid timeseries dataset saved to: {folder_path}")

Processed covid timeseries dataset saved to: /content/gdrive/My Drive


In [2]:
# sentiment preprocessing
def process_sentiment():
    """Extract the United States rows from the full COVID-19 Twitter dataset.

    Streams ``COVID19_twitter_full_dataset.csv`` from the module-level
    ``folder_path`` in 1000-row chunks, keeps the rows whose
    ``country/region`` column equals ``'United States'``, and writes them to
    ``covid_us_sentiment.csv`` in the same folder.

    Returns:
        None. The filtered rows are written to disk.
    """
    # Filter each chunk as it is read so only the US rows are kept in memory,
    # instead of materialising the whole dataset before filtering. The `with`
    # block closes the underlying file once the last chunk has been consumed.
    with pd.read_csv(folder_path + "/COVID19_twitter_full_dataset.csv", chunksize=1000) as reader:
        us_chunks = [
            chunk[chunk['country/region'] == 'United States']
            for chunk in tqdm(reader, desc='Loading data')
        ]
    sentiment_df_us = pd.concat(us_chunks)
    sentiment_df_us.to_csv(folder_path + "/covid_us_sentiment.csv", index=False)

Loading data: 198379it [09:27, 349.64it/s]


In [None]:
# Run both preprocessing steps; each one writes its output CSV under folder_path.
process_ts()
process_sentiment()