# Data preparation

### Weekly consolidation of cases, vaccinations and deaths

In [2]:
import pandas as pd
import sqlite3

# Open the SQLite database that holds the raw COVID tables
conn = sqlite3.connect('../data/covid_switzerland.db')

# Load the three source tables into data frames. The try/finally guarantees
# that the connection is closed even if one of the queries raises (for
# example because a table is missing), so no handle is left open in the kernel.
try:
    df_cases = pd.read_sql_query('SELECT * FROM covid_weekly_data', conn)
    df_deaths = pd.read_sql_query('SELECT * FROM daily_deaths_data', conn)
    df_vacc = pd.read_sql_query('SELECT * FROM vaccinated_data', conn)
finally:
    conn.close()

In [3]:
# Parse the date columns and derive a 'week_str' key (ISO year followed by
# ISO week number, '%G%V') that is later used to join against the weekly
# case table.

# Deaths: the date format is inferred by pandas
death_dates = pd.to_datetime(df_deaths['date'])
df_deaths['date'] = death_dates
df_deaths['week_str'] = death_dates.dt.strftime('%G%V')

# Vaccinations: dates are parsed as day/month/year
vacc_dates = pd.to_datetime(df_vacc['date'], format='%d/%m/%Y')
df_vacc['date'] = vacc_dates
df_vacc['week_str'] = vacc_dates.dt.strftime('%G%V')

In [4]:
# Sum the daily death counts within each week key
weekly_deaths = (
    df_deaths.groupby('week_str')['daily_deaths']
    .sum()
    .rename('weekly_deaths')
    .reset_index()
)

# Sum the vaccination entries within each week key
# ('enteries' is the column name as it appears in the source table)
weekly_vacc = (
    df_vacc.groupby('week_str')['enteries']
    .sum()
    .rename('weekly_vaccinations')
    .reset_index()
)

In [5]:
# Give the case table the same string week key as the aggregated tables
df_cases['week_str'] = df_cases['week'].astype(str)

# Left-join the weekly vaccination and death totals onto the case rows.
# Every case row is kept; weeks without a match get NaN in the new columns.
df_merged = (
    df_cases
    .merge(weekly_vacc, how='left', on='week_str')
    .merge(weekly_deaths, how='left', on='week_str')
)

# Preview the merged table
df_merged.head()

Unnamed: 0,week,new_cases,total_cases,population,incidence_weekly,incidence_total,week_str,weekly_vaccinations,weekly_deaths
0,202009,57,57,8738791,0.65,0.65,202009,,0.0
1,202010,377,434,8738791,4.31,4.97,202010,,3.0
2,202011,2265,2699,8738791,25.92,30.89,202011,,25.0
3,202012,6558,9257,8738791,75.04,105.93,202012,,99.0
4,202013,7345,16602,8738791,84.05,189.98,202013,,253.0


### Handling missing values

In [6]:
# Report the number of missing values per column
print(df_merged.isnull().sum())

# Rows whose week has no match in the vaccination/death tables come out of
# the left merge as NaN; treat those as zero and store the counts as integers.
# Only these two columns are filled: zero-filling the whole frame would
# silently hide unexpected gaps in any other column (e.g. population).
count_cols = ['weekly_vaccinations', 'weekly_deaths']
for col in count_cols:
    df_merged[col] = df_merged[col].fillna(0).astype(int)

# Display the first few rows to confirm the changes
print(df_merged[count_cols].head())

week                      0
new_cases                 0
total_cases               0
population                0
incidence_weekly          0
incidence_total           0
week_str                  0
weekly_vaccinations    2592
weekly_deaths          1692
dtype: int64
   weekly_vaccinations  weekly_deaths
0                    0              0
1                    0              3
2                    0             25
3                    0             99
4                    0            253


### Converting the week number into a real date

In [7]:
# Convert the week key into a calendar date.
# Appending '1' supplies the ISO weekday (%u, 1 = Monday), so each row gets
# the Monday of its ISO week. The whole column is parsed in one vectorized
# call instead of a Python-level lambda per row, using the `pd` alias that is
# already imported at the top of the notebook.
df_merged['date'] = pd.to_datetime(df_merged['week_str'] + '1', format='%G%V%u')

# print the first few rows of the column 'date'
print(df_merged[['date']].head())

        date
0 2020-02-24
1 2020-03-02
2 2020-03-09
3 2020-03-16
4 2020-03-23


### Calculations per 100k residents

In [8]:
# Express weekly cases, deaths and vaccinations per 100,000 residents.
# The arithmetic order (count / population * 100000) is kept as-is so the
# resulting floats are unchanged.
rate_columns = [
    ('new_cases', 'weekly_cases_per_100k'),
    ('weekly_deaths', 'weekly_deaths_per_100k'),
    ('weekly_vaccinations', 'weekly_vacc_per_100k'),
]
for count_col, rate_col in rate_columns:
    df_merged[rate_col] = df_merged[count_col] / df_merged['population'] * 100000

# Death rate: deaths divided by new cases of the same week (no time lag).
# Rows with zero new cases would otherwise produce inf (or 0/0), so the
# denominator is masked there and the rate becomes NaN (undefined) instead.
cases_nonzero = df_merged['new_cases'].where(df_merged['new_cases'] != 0)
df_merged['death_rate'] = df_merged['weekly_deaths'] / cases_nonzero

# Display the first few rows to confirm the changes
print(df_merged[['weekly_cases_per_100k', 'weekly_deaths_per_100k', 'weekly_vacc_per_100k', 'death_rate']].head())

   weekly_cases_per_100k  weekly_deaths_per_100k  weekly_vacc_per_100k  \
0               0.652264                0.000000                   0.0   
1               4.314098                0.034330                   0.0   
2              25.918917                0.286081                   0.0   
3              75.044706                1.132880                   0.0   
4              84.050528                2.895137                   0.0   

   death_rate  
0    0.000000  
1    0.007958  
2    0.011038  
3    0.015096  
4    0.034445  


### Weekly summary of cases, vaccinations and deaths

In [9]:
# Weekly summary for the first 10 rows (loop demonstration)

print("📊 Weekly overview: new cases, vaccinations, and deaths\n")

# head(10) limits the output by row count, so the cut-off no longer depends
# on the index labels happening to run 0, 1, 2, ... (the previous
# `if i == 9: break` compared against the index label, not the position).
for _, row in df_merged.head(10).iterrows():
    week = row["week_str"]
    new_cases = row["new_cases"]
    vacc = row["weekly_vaccinations"]
    deaths = row["weekly_deaths"]

    print(f"Week {week}: {new_cases} new cases | {vacc} vaccinations | {deaths} deaths")


📊 Weekly overview: new cases, vaccinations, and deaths

Week 202009: 57 new cases | 0 vaccinations | 0 deaths
Week 202010: 377 new cases | 0 vaccinations | 3 deaths
Week 202011: 2265 new cases | 0 vaccinations | 25 deaths
Week 202012: 6558 new cases | 0 vaccinations | 99 deaths
Week 202013: 7345 new cases | 0 vaccinations | 253 deaths
Week 202014: 6024 new cases | 0 vaccinations | 385 deaths
Week 202015: 3600 new cases | 0 vaccinations | 343 deaths
Week 202016: 1837 new cases | 0 vaccinations | 242 deaths
Week 202017: 1167 new cases | 0 vaccinations | 194 deaths
Week 202018: 700 new cases | 0 vaccinations | 90 deaths


### 3-week rolling mean for the visualization

In [10]:
# Smooth the weekly counts with a 3-row rolling mean (current row plus the
# two preceding rows). The first two rows have no complete window and are NaN.
smoothed_from = {
    'cases_smooth': 'new_cases',
    'vacc_smooth': 'weekly_vaccinations',
    'deaths_smooth': 'weekly_deaths',
}
for smooth_col, raw_col in smoothed_from.items():
    df_merged[smooth_col] = df_merged[raw_col].rolling(window=3).mean()

# Preview the key prepared columns.
# NOTE(review): the preview does not include the *_smooth columns added above.
preview_cols = [
    'week_str', 'date', 'new_cases', 'weekly_deaths', 'weekly_vaccinations',
    'weekly_cases_per_100k', 'weekly_deaths_per_100k', 'weekly_vacc_per_100k',
    'death_rate',
]
df_merged[preview_cols].head()

Unnamed: 0,week_str,date,new_cases,weekly_deaths,weekly_vaccinations,weekly_cases_per_100k,weekly_deaths_per_100k,weekly_vacc_per_100k,death_rate
0,202009,2020-02-24,57,0,0,0.652264,0.0,0.0,0.0
1,202010,2020-03-02,377,3,0,4.314098,0.03433,0.0,0.007958
2,202011,2020-03-09,2265,25,0,25.918917,0.286081,0.0,0.011038
3,202012,2020-03-16,6558,99,0,75.044706,1.13288,0.0,0.015096
4,202013,2020-03-23,7345,253,0,84.050528,2.895137,0.0,0.034445


### Saving the data frame to a CSV file

In [11]:
# Write the prepared weekly table to disk; the row index is not written
df_merged.to_csv(
    path_or_buf='../data/weekly_combined_prepared.csv',
    index=False,
)