In [24]:
import pandas as pd

In [25]:
# Identifier columns shared by the confirmed-cases and deaths CSVs.
# Later cells treat every column NOT listed here as a date column.
ID_COLUMNS = [
    'Province/State',
    'Country/Region',
    'Lat',
    'Long',
]

# Directory prefix for all input/output files. File names are appended to it
# directly, so keep the trailing slash.
DATA_FOLDER = './'

In [26]:
# --- 1. Load Confirmed Cases Data ---
# Reads the wide-format confirmed-cases CSV into `df_confirmed`.
# Build the path once so the log message, the read and the error all agree.
confirmed_csv_path = f'{DATA_FOLDER}time_series_covid19_confirmed_global.csv'
print(f"Loading confirmed cases data from {confirmed_csv_path}...")
try:
    df_confirmed = pd.read_csv(confirmed_csv_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # shell and would tear down the notebook session rather than report a
    # normal, catchable error that stops "Run All" at this cell.
    raise FileNotFoundError(
        f"Error: {confirmed_csv_path} not found. Check path."
    ) from err
else:
    print("Confirmed cases data loaded.")

Loading confirmed cases data from ./time_series_covid19_confirmed_global.csv...
Confirmed cases data loaded.


In [27]:
# --- 2. Unpivot (Melt) Confirmed Cases Data ---
# Reshape wide -> long: each date column becomes a row, with the column label
# stored in 'Date' and the cell value stored in 'ConfirmedCases'.
print("Unpivoting confirmed cases data...")
# Everything that is not an identifier column is treated as a date column.
confirmed_date_columns = df_confirmed.columns[
    ~df_confirmed.columns.isin(ID_COLUMNS)
].tolist()
df_confirmed_long = pd.melt(
    df_confirmed,
    id_vars=ID_COLUMNS,
    value_vars=confirmed_date_columns,
    var_name='Date',
    value_name='ConfirmedCases',
)
print("Confirmed cases data unpivoted.")

Unpivoting confirmed cases data...
Confirmed cases data unpivoted.


In [28]:
# --- 3. Load Deaths Data ---
# Reads the wide-format deaths CSV into `df_deaths`.
# Build the path once so the log message, the read and the error all agree.
deaths_csv_path = f'{DATA_FOLDER}time_series_covid19_deaths_global.csv'
print(f"Loading deaths data from {deaths_csv_path}...")
try:
    df_deaths = pd.read_csv(deaths_csv_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # shell and would tear down the notebook session rather than report a
    # normal, catchable error that stops "Run All" at this cell.
    raise FileNotFoundError(
        f"Error: {deaths_csv_path} not found. Check path."
    ) from err
else:
    print("Deaths data loaded.")

Loading deaths data from ./time_series_covid19_deaths_global.csv...
Deaths data loaded.


In [29]:
# --- 4. Unpivot (Melt) Deaths Data ---
# Reshape wide -> long: each date column becomes a row, with the column label
# stored in 'Date' and the cell value stored in 'Deaths'.
print("Unpivoting deaths data...")
# Everything that is not an identifier column is treated as a date column.
deaths_date_columns = df_deaths.columns[
    ~df_deaths.columns.isin(ID_COLUMNS)
].tolist()
df_deaths_long = pd.melt(
    df_deaths,
    id_vars=ID_COLUMNS,
    value_vars=deaths_date_columns,
    var_name='Date',
    value_name='Deaths',
)
print("Deaths data unpivoted.")

Unpivoting deaths data...
Deaths data unpivoted.


In [30]:
# --- 5. Convert 'Date' Column to Datetime Objects ---
# The melted 'Date' values are the original column labels (strings parsed here
# as month/day/two-digit-year); converting them to datetimes gives proper
# chronological sorting and date arithmetic.
# This cell only converts dates. The confirmed/deaths merge is step 6 and
# lives in its own cell, so it is not repeated here.
print("Converting 'Date' column to datetime...")
df_confirmed_long['Date'] = pd.to_datetime(df_confirmed_long['Date'], format='%m/%d/%y')
df_deaths_long['Date'] = pd.to_datetime(df_deaths_long['Date'], format='%m/%d/%y')
print("Date column conversion complete.")

Converting 'Date' column to datetime...
Date column conversion complete.
Combining confirmed and deaths data...
Data combined.


In [31]:
# --- 6. Combine Confirmed and Deaths Data ---
# Join the two long-format frames on the identifier columns plus 'Date'.
# An outer join keeps rows that appear in only one of the two frames.
print("Combining confirmed and deaths data...")
df_combined_long = df_confirmed_long.merge(
    df_deaths_long,
    how='outer',
    on=[*ID_COLUMNS, 'Date'],
)
print("Data combined.")

Combining confirmed and deaths data...
Data combined.


In [32]:
# --- 7. Clean Combined Data (Basic) ---
# Replace missing values column by column:
#   * 'Province/State' -> 'Unknown' placeholder, for consistency
#   * 'ConfirmedCases' / 'Deaths' -> 0, in case the outer merge left gaps
for column_name, placeholder in (
    ('Province/State', 'Unknown'),
    ('ConfirmedCases', 0),
    ('Deaths', 0),
):
    df_combined_long[column_name] = df_combined_long[column_name].fillna(placeholder)

In [33]:
# --- 8. Save the Combined, Long-Format Data to a New CSV ---
# The file is written into DATA_FOLDER; the DataFrame index is omitted from
# the output.
output_filename = f'{DATA_FOLDER}covid_global_long_format.csv'

print(f"Saving combined long-format data to {output_filename}...")
df_combined_long.to_csv(
    path_or_buf=output_filename,
    index=False,
)
print("Combined long-format data saved successfully!")

Saving combined long-format data to ./covid_global_long_format.csv...
Combined long-format data saved successfully!


In [35]:
# Sanity check: number of non-null values in each column of the combined frame.
# The cleaning step fills only 'Province/State', 'ConfirmedCases' and 'Deaths',
# so 'Lat'/'Long' can still report fewer non-null rows than the other columns.
df_combined_long.count()

Province/State    330327
Country/Region    330327
Lat               328041
Long              328041
Date              330327
ConfirmedCases    330327
Deaths            330327
dtype: int64