In [21]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Show every row when a pandas object is displayed (no truncation).
pd.options.display.max_rows = None


In [22]:
# Create new floating-point tensors as float64 by default.
torch.set_default_dtype(torch.float64)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend)
print(device)

# Autograd anomaly detection is a debugging aid; the PyTorch docs note that it
# slows execution, so consider disabling it once debugging is finished.
torch.autograd.set_detect_anomaly(True)

cuda


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fa4392568e0>

In [24]:
# Load the raw dataset from a path relative to the notebook's working directory.
raw_data_path = 'data/origindata.csv'
df = pd.read_csv(raw_data_path)


In [25]:
# Parse the 'date' column into pandas datetimes.
df = df.assign(date=pd.to_datetime(df['date']))

In [26]:
# Order rows by location, then by date, so the per-location shifts used
# later in the notebook move along each location's time axis.
sort_keys = ['location', 'date']
df = df.sort_values(by=sort_keys)

In [27]:
# Keep only columns that are at least 70% populated. `thresh` is the minimum
# number of non-NA values a column needs in order to be kept, so this drops
# columns with more than 30% missing values (arbitrary threshold, can be adjusted).
threshold = 0.70 * len(df)
df = df.dropna(thresh=threshold, axis=1)

# Impute missing numerical values with the mean of the column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df[col]` is chained assignment on a temporary object, which raises a
# FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the mean is taken over all locations and dates, and before any
# train/test split -- confirm this is intended rather than a per-location fill.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].mean())

# Impute missing categorical values with the mode of the column.
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # A column with no non-missing values has no mode; leave it untouched
    # instead of failing on an empty result.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

In [28]:
# Target column: each row receives the 'new_cases' value of the following row
# within the same location; the last row of every location becomes NaN.
cases_by_location = df.groupby('location')['new_cases']
df['new_cases_next_day'] = cases_by_location.shift(-1)

In [29]:
# Numeric (float64/int64) columns, excluding the prediction target and
# 'population' ('population' is a static value and not a daily feature).
non_feature_cols = {'new_cases_next_day', 'population'}
numerical_cols = [
    name
    for name in df.select_dtypes(include=['float64', 'int64']).columns
    if name not in non_feature_cols
]

In [30]:
# Columns for which lagged copies are requested; names that are absent from
# the DataFrame are filtered out in a later cell.
# NOTE(review): the second group looks like per-location attributes rather than
# daily series (their lag columns are identical in the df.head() output) --
# confirm that lagging them is intended.
features_to_lag = [
    'new_cases',
    'new_cases_smoothed',
    'total_deaths',
    'new_deaths',
    'reproduction_rate',
    'stringency_index',
    'total_vaccinations',
    'new_vaccinations_smoothed',
    'positive_rate',
    'new_tests_smoothed',
    'icu_patients',
    'hosp_patients',

    'population_density',
    'median_age',
    'aged_65_older',
    'gdp_per_capita',
    'hospital_beds_per_thousand',
    'life_expectancy',
]

In [31]:
# Restrict the requested features to columns that are present in df,
# preserving the order of features_to_lag.
available_columns = set(df.columns)
actual_features_to_lag = [name for name in features_to_lag if name in available_columns]


In [32]:
# Add 1-, 2- and 3-row lagged copies of each selected feature, shifted within
# each location. Only columns listed in numerical_cols are lagged.
lag_source_cols = [name for name in actual_features_to_lag if name in numerical_cols]
for col in lag_source_cols:
    for i in (1, 2, 3):
        lagged_name = f'{col}_lag_{i}'
        df[lagged_name] = df.groupby('location')[col].shift(i)

In [33]:
# Count the missing values in each column.
df.isna().sum()

iso_code                             0
continent                            0
location                             0
date                                 0
total_cases                          0
new_cases                            0
new_cases_smoothed                   0
total_deaths                         0
new_deaths                           0
new_deaths_smoothed                  0
total_cases_per_million              0
new_cases_per_million                0
new_cases_smoothed_per_million       0
total_deaths_per_million             0
new_deaths_per_million               0
new_deaths_smoothed_per_million      0
population_density                   0
median_age                           0
aged_65_older                        0
aged_70_older                        0
gdp_per_capita                       0
cardiovasc_death_rate                0
diabetes_prevalence                  0
life_expectancy                      0
human_development_index              0
population               

In [35]:
# Drop any remaining rows with NaN values: the per-location shifts leave NaN in
# the first rows (no lag history) and the last row (no next-day target) of
# each location.
df = df.dropna()

# Remove the original (un-lagged) columns listed in features_to_lag, but only
# those whose lag columns were actually created. The lag loop skips columns that
# are not in numerical_cols, so dropping every entry of actual_features_to_lag
# could delete a feature that has no lagged replacement.
columns_to_drop = [
    col for col in actual_features_to_lag
    if col in df.columns and f'{col}_lag_1' in df.columns
]
df = df.drop(columns=columns_to_drop)

In [37]:
# Count the missing values in each column of the processed frame.
df.isna().sum()

iso_code                           0
continent                          0
location                           0
date                               0
total_cases                        0
new_deaths_smoothed                0
total_cases_per_million            0
new_cases_per_million              0
new_cases_smoothed_per_million     0
total_deaths_per_million           0
new_deaths_per_million             0
new_deaths_smoothed_per_million    0
aged_70_older                      0
cardiovasc_death_rate              0
diabetes_prevalence                0
human_development_index            0
population                         0
new_cases_next_day                 0
new_cases_lag_1                    0
new_cases_lag_2                    0
new_cases_lag_3                    0
new_cases_smoothed_lag_1           0
new_cases_smoothed_lag_2           0
new_cases_smoothed_lag_3           0
total_deaths_lag_1                 0
total_deaths_lag_2                 0
total_deaths_lag_3                 0
n

In [38]:
# Print the index range, column dtypes and non-null counts of the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 428418 entries, 3 to 429433
Data columns (total 45 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   iso_code                         428418 non-null  object        
 1   continent                        428418 non-null  object        
 2   location                         428418 non-null  object        
 3   date                             428418 non-null  datetime64[ns]
 4   total_cases                      428418 non-null  float64       
 5   new_deaths_smoothed              428418 non-null  float64       
 6   total_cases_per_million          428418 non-null  float64       
 7   new_cases_per_million            428418 non-null  float64       
 8   new_cases_smoothed_per_million   428418 non-null  float64       
 9   total_deaths_per_million         428418 non-null  float64       
 10  new_deaths_per_million           428418 non-null 

In [39]:
# Preview the first five rows of the processed frame.
df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,median_age_lag_3,aged_65_older_lag_1,aged_65_older_lag_2,aged_65_older_lag_3,gdp_per_capita_lag_1,gdp_per_capita_lag_2,gdp_per_capita_lag_3,life_expectancy_lag_1,life_expectancy_lag_2,life_expectancy_lag_3
3,AFG,Asia,Afghanistan,2020-01-08,0.0,72.060828,0.0,0.0,122.713852,0.0,...,18.6,2.58,2.58,2.58,1803.99,1803.99,1803.99,64.83,64.83,64.83
4,AFG,Asia,Afghanistan,2020-01-09,0.0,72.060828,0.0,0.0,122.713852,0.0,...,18.6,2.58,2.58,2.58,1803.99,1803.99,1803.99,64.83,64.83,64.83
5,AFG,Asia,Afghanistan,2020-01-10,0.0,0.0,0.0,0.0,0.0,0.0,...,18.6,2.58,2.58,2.58,1803.99,1803.99,1803.99,64.83,64.83,64.83
6,AFG,Asia,Afghanistan,2020-01-11,0.0,0.0,0.0,0.0,0.0,0.0,...,18.6,2.58,2.58,2.58,1803.99,1803.99,1803.99,64.83,64.83,64.83
7,AFG,Asia,Afghanistan,2020-01-12,0.0,0.0,0.0,0.0,0.0,0.0,...,18.6,2.58,2.58,2.58,1803.99,1803.99,1803.99,64.83,64.83,64.83


In [41]:
# Write the processed frame to an Excel workbook without the index column.
# The context manager saves and closes the workbook on exit.
output_filename = 'processed_country_data.xlsx'

with pd.ExcelWriter(output_filename, engine='xlsxwriter') as excel_writer:
    df.to_excel(excel_writer, index=False)