# Data Analysis

Here we analyse the processed data to find an answer to the research question.
First we need to import the dependencies, then we load the data one country at a time and perform the actual analysis on that data.
The goal is to generate a graph that shows the connection between testing and deaths/cases.

## Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

## Define the "generate_plot" function

In [None]:
def generate_plot(df, country: str, indicator: str, *, close_figure: bool = True):
    """Plot the 14-day rate against the testing rate and save the chart as PNG.

    ``df['rate_14_day']`` is drawn as bars and ``df['testing_rate']`` as a line
    on a twin y-axis, both over ``df['year_week']``. The figure is written to
    ``../data/graphs/<country>/<indicator>_rate_testing_over_time_<country>.png``;
    the directory is created if it does not exist.

    Args:
        df: DataFrame with the columns ``year_week``, ``rate_14_day`` and
            ``testing_rate``. ``year_week`` values must be strings, because
            they are matched with ``str.endswith``.
        country: Country name, used in the title and in the output path.
        indicator: Indicator name (e.g. "cases"), used in the title and in the
            output file name.
        close_figure: Close the figure once it has been saved (default). Pass
            ``False`` to keep it open, e.g. to display it inline in a notebook.

    Returns:
        None. Nothing is drawn or written when ``df`` has no rows.
    """
    if df.empty:
        # Nothing to plot (e.g. a country without rows for this indicator).
        return

    # Largest value over both series, ignoring NaN maxima. Python's max()
    # would otherwise propagate a NaN that happens to come first.
    column_maxima = (df['rate_14_day'].max(), df['testing_rate'].max())
    known_maxima = [value for value in column_maxima if pd.notna(value)]
    max_value = max(known_maxima, default=0)

    fig, ax1 = plt.subplots()
    try:
        # Bar chart for 'rate_14_day' on the left y-axis
        ax1.bar(df['year_week'], df['rate_14_day'], color='b', alpha=0.7, label='Rate 14 Day')
        ax1.set_xlabel('Year Week')
        ax1.set_ylabel('Rate 14 Day', color='b')
        ax1.tick_params(axis='y', labelcolor='b')

        # Second y-axis for 'testing_rate'
        ax2 = ax1.twinx()
        ax2.plot(df['year_week'], df['testing_rate'], color='r', label='Testing Rate')
        ax2.set_ylabel('Testing Rate', color='r')
        ax2.tick_params(axis='y', labelcolor='r')

        # Give both axes the same scale so bars and line are directly
        # comparable; without a positive maximum keep matplotlib's autoscaling.
        if max_value > 0:
            ax1.set_ylim(0, max_value)
            ax2.set_ylim(0, max_value)

        # Thin out the x-axis: only label entries ending in '-01' or '-26'.
        # Positions are row indices, matching the order the bars were drawn in.
        labelled_ticks = [
            (position, label)
            for position, label in enumerate(df['year_week'])
            if label.endswith(('-01', '-26'))
        ]
        ax1.set_xticks([position for position, _ in labelled_ticks])
        ax1.set_xticklabels([label for _, label in labelled_ticks], rotation=45)

        # Title and legend
        ax1.set_title('COVID-19 ' + indicator + ' Rate 14 Day and Testing Rate over Time in ' + country)
        fig.tight_layout()
        fig.legend(loc='upper left', bbox_to_anchor=(0.14, 0.9))

        # Create the output folder if it does not exist yet
        output_dir = '../data/graphs/' + country
        os.makedirs(output_dir, exist_ok=True)

        # Save the plot as PNG
        fig.savefig(os.path.join(output_dir, indicator + '_rate_testing_over_time_' + country + '.png'))
    finally:
        if close_figure:
            # Figures stay registered with pyplot until closed; release this
            # one so that plotting many countries does not pile them up.
            plt.close(fig)

## Load the dataset into Pandas Dataframe

After loading the data, split it between normal cases and deaths and generate the plots.

In [None]:
# Folder holding one CSV export per country
directory = '../data/per_country'

# Walk through every entry in that folder and plot each CSV file
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue

    # The country name is whatever follows the last underscore of the
    # file name once the ".csv" extension has been stripped
    country_name = os.path.splitext(filename)[0].rsplit('_', 1)[-1]

    # Read the country's data
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # One chart per indicator, built from the rows belonging to it
    for indicator in ("cases", "deaths"):
        generate_plot(df[df['indicator'] == indicator], country_name, indicator)