We fetched COVID-19 historical data for Italy and France, parsed the JSON responses, and converted their timelines into pandas DataFrames.

In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import DateFormatter, AutoDateLocator


# Endpoints queried for the full historical series ("lastdays=all") of each country.
url_fr = "https://disease.sh/v3/covid-19/historical/France?lastdays=all"
url_it = "https://disease.sh/v3/covid-19/historical/Italy?lastdays=all"

# requests has no default timeout: without one, an unresponsive server would
# block this cell forever.
REQUEST_TIMEOUT_S = 30

resp_fr = requests.get(url_fr, timeout=REQUEST_TIMEOUT_S)
resp_it = requests.get(url_it, timeout=REQUEST_TIMEOUT_S)

# Stop on HTTP errors instead of trying to parse an error payload below.
resp_fr.raise_for_status()
resp_it.raise_for_status()

data_fr_js = resp_fr.json()
data_it_js = resp_it.json()

# Only the "timeline" entry of each response is turned into a DataFrame; the
# later cells read its "cases", "deaths" and "recovered" columns and parse its
# index as dates.
data_it = pd.DataFrame(data_it_js["timeline"])
data_fr = pd.DataFrame(data_fr_js["timeline"])


In [None]:
# Print, for France and then Italy, whether the frame holds any missing cell.
for frame in (data_fr, data_it):
    has_missing = frame.isnull().values.any()
    print(has_missing)

In [None]:
# Print how many index entries (dates) are repeats of an earlier entry,
# for Italy and then France.
for frame in (data_it, data_fr):
    n_repeated = frame.index.duplicated().sum()
    print(n_repeated)

By performing a general analysis of the dataset using the `describe` method, we encountered an issue. Specifically, for the "recovered" variable in both France and Italy, the median is significantly different from the mean. In particular, a value of zero for a cumulative variable—such as the number of recovered individuals—does not make logical sense once the count has been positive. Further investigation of the dataset revealed the presence of zero (effectively missing) values: after 08/05/2021 (MM/DD/YYYY), the recovered data for both countries consist only of zeros.

In [None]:
# Summary statistics of every column, Italy first and then France.
summary_it = data_it.describe()
summary_fr = data_fr.describe()

print(summary_it)
print(summary_fr)

In [None]:
# Parse the month/day/two-digit-year index labels of both frames into datetimes.
for frame in (data_fr, data_it):
    frame.index = pd.to_datetime(frame.index, format="%m/%d/%y")

# One stacked panel per metric for Italy and France, all sharing the date axis.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15), sharex=True)

# Tick placement and label format for the x axis (a later cell reads these
# two names as well).
date_locator = AutoDateLocator()
date_formatter = DateFormatter("%m/%d/%y")

# Deaths, cases and recovered: Italy in green against France in blue.
for ax, metric in zip(axes, ("deaths", "cases", "recovered")):
    heading = metric.capitalize()
    ax.plot(data_it.index, data_it[metric], label=f"Italy {heading}", color="green")
    ax.plot(data_fr.index, data_fr[metric], label=f"France {heading}", color="blue")
    ax.set_title(f"{heading} (Italy vs France)")
    ax.legend()

# Configure the x axis and the grid of every subplot.
for ax in axes:
    ax.xaxis.set_major_locator(date_locator)
    ax.xaxis.set_major_formatter(date_formatter)
    ax.grid(True)

plt.xticks(rotation=45)  # Rotate the x-axis labels to keep them readable
plt.tight_layout()
plt.show()

We can see that the number of recovered, both in Italy and France, suddenly drops to zero around August 2021 and remains zero thereafter. We will have to handle this.
Moreover, apart from the sudden drop in recovered, all the curves are increasing, so it is fair to assume that the data is cumulative.
Note: we also looked online for graphs of cumulative cases, deaths, and recovered, and they are very similar to the ones we are working with (see the disease.sh docs).

In [None]:
# Find, per country, the first date after 31 May 2021 on which the cumulative
# "recovered" count is zero.
# The threshold is an explicit Timestamp so that pandas does not have to guess
# the day/month order of a string.
search_start = pd.Timestamp("2021-05-31")

# Each mask is built from the same frame it filters.
zero_mask_it = (data_it["recovered"] == 0) & (data_it.index > search_start)
zero_mask_fr = (data_fr["recovered"] == 0) & (data_fr.index > search_start)

zero_recovered_it = data_it[zero_mask_it].index.min()
zero_recovered_fr = data_fr[zero_mask_fr].index.min()

print(zero_recovered_it)
print(zero_recovered_fr)

We start the data cleaning process by removing, from both datasets, the zero-valued recovered data that begin in summer 2021.

In [None]:
# Keep only the rows dated on or before the cut-off (month/day/two-digit year,
# i.e. 5 July 2021).
# NOTE(review): the accompanying text reports the zeros as starting on
# 08/05/2021 - confirm that this earlier cut-off is the intended one.
cutoff_date = "7/5/21"
# Parse the string with the same explicit format used for the index, so the
# comparison does not depend on pandas guessing the day/month order.
cutoff_timestamp = pd.to_datetime(cutoff_date, format="%m/%d/%y")

# .copy() makes the truncated frames independent objects: later cells assign
# to their columns, which would otherwise trigger SettingWithCopyWarning.
data_fr_cut = data_fr[data_fr.index <= cutoff_timestamp].copy()
data_it_cut = data_it[data_it.index <= cutoff_timestamp].copy()

We continue by checking the consistency of the data, in particular, we must:
- Check that there are no missing or duplicated values
- Check that the data is actually cumulative, i.e. they are non-decreasing
- Check that the cumulative data is consistent, i.e. neither the number of deaths nor the number of recovered can be greater than the number of cases

In [None]:
# Cumulative recovered counts of both countries, restricted to the truncated frames.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(data_it_cut.index, data_it_cut["recovered"], label="Italy Recovered", color="green")
ax.plot(data_fr_cut.index, data_fr_cut["recovered"], label="France Recovered", color="blue")
ax.set_title("Recovered (Italy vs France)")
ax.legend()

# Configure the x axis.
# A locator/formatter instance is bound to the axis it is set on, so this
# figure gets its own instances instead of reusing ones already attached to
# another figure's axes.
ax.xaxis.set_major_locator(AutoDateLocator())
ax.xaxis.set_major_formatter(DateFormatter("%m/%d/%y"))
ax.grid(True)

plt.xticks(rotation=45)  # Rotate the x-axis labels to keep them readable
plt.tight_layout()
plt.show()

The three variables used above are cumulative. We use the `.diff` command to observe the daily trend of the values.

In [None]:
# Consistency check: list the days on which cumulative deaths exceed
# cumulative cases, for Italy and then France.
deaths_above_cases_it = data_it["deaths"].gt(data_it["cases"])
anomalous_days_it = data_it.loc[deaths_above_cases_it]
print(anomalous_days_it.loc[:, ["deaths", "cases"]])

deaths_above_cases_fr = data_fr["deaths"].gt(data_fr["cases"])
anomalous_days_fr = data_fr.loc[deaths_above_cases_fr]
print(anomalous_days_fr.loc[:, ["deaths", "cases"]])

In [None]:
# Consistency check: list the days on which cumulative recovered exceed
# cumulative cases, for Italy and then France.
shown_columns = ["recovered", "cases"]

anomalous_days_it = data_it.loc[lambda frame: frame["recovered"] > frame["cases"]]
print(anomalous_days_it[shown_columns])

anomalous_days_fr = data_fr.loc[lambda frame: frame["recovered"] > frame["cases"]]
print(anomalous_days_fr[shown_columns])

In [None]:
# here substitute recovered data only before 
#here we go through each observation for each data set 
def substitute_missing(df, column_name):
    dates_errors = []
    for i in range(1, len(df)):
        if df.loc[df.index[i], column_name] < df.loc[df.index[i-1], column_name]:
            df.loc[df.index[i], column_name] = df.loc[df.index[i-1], column_name]
            dates_errors.append(i)
    return df[column_name] , dates_errors


# Force every cumulative series to be non-decreasing and keep, per series, the
# list of corrections returned by substitute_missing.
data_it.loc[:, "cases"], mistakes_cases_it = substitute_missing(data_it, "cases")
data_it.loc[:, "deaths"], mistakes_death_it = substitute_missing(data_it, "deaths")

# "recovered" is only analysed on the truncated frames, so it is cleaned there.
# Cleaning it on the full frame would also overwrite the trailing zeros and
# report every one of them as a correction.
data_it_cut = data_it_cut.copy()  # independent frame, safe to assign into
data_it_cut.loc[:, "recovered"], mistakes_recovered_it = substitute_missing(data_it_cut, "recovered")

data_fr.loc[:, "cases"], mistakes_cases_fr = substitute_missing(data_fr, "cases")
data_fr.loc[:, "deaths"], mistakes_death_fr = substitute_missing(data_fr, "deaths")

data_fr_cut = data_fr_cut.copy()  # independent frame, safe to assign into
# France's corrections go into their own variable; it is displayed below.
data_fr_cut.loc[:, "recovered"], mistakes_recovered_fr = substitute_missing(data_fr_cut, "recovered")

mistakes_cases_it, mistakes_death_it, mistakes_recovered_it, mistakes_cases_fr, mistakes_death_fr, mistakes_recovered_fr

In [None]:
# Day-over-day increments of the cumulative death counts.
data_it["daily_deaths"] = data_it["deaths"].diff()
data_fr["daily_deaths"] = data_fr["deaths"].diff()

# One figure per country: Italy in green, then France in blue.
for frame, country, line_color in ((data_it, "Italy", "green"), (data_fr, "France", "blue")):
    plt.figure(figsize=(10, 5))
    plt.plot(frame.index, frame["daily_deaths"], label=f"{country} Daily Deaths", color=line_color)
    plt.title(f"Daily Deaths in {country}")
    plt.xlabel("Date")
    plt.ylabel("Deaths per Day")
    plt.legend()
    plt.grid()
    plt.show()

The jagged lines in our data are likely a result of our data cleaning process, which naively replaces erroneous values with those from the previous day. This approach is problematic because it causes daily case, death, and recovery counts to appear as zero, distorting the actual trends. To get a clearer picture of what is going on, we will also provide a smoothed curve that better approximates the trends present in our data.

In [None]:
# Exponentially weighted mean of the daily deaths (alpha = 0.01), computed
# separately for each country from that country's own series.
smoothed_data_it = data_it["daily_deaths"].ewm(alpha=0.01, adjust=False).mean()
smoothed_data_fr = data_fr["daily_deaths"].ewm(alpha=0.01, adjust=False).mean()

plt.figure(figsize=(10, 5))
plt.plot(data_it.index, smoothed_data_it, label="Italy Daily Deaths", color="green")
plt.title("Smoothed Daily Deaths in Italy ")
plt.xlabel("Date")
plt.ylabel("Deaths per Day")
plt.legend()
plt.grid()
plt.show()

# The France figure uses France's dates, label and colour (blue, as in the
# other France plots).
plt.figure(figsize=(10, 5))
plt.plot(data_fr.index, smoothed_data_fr, label="France Daily Deaths", color="blue")
plt.title("Smoothed Daily Deaths in France ")
plt.xlabel("Date")
plt.ylabel("Deaths per Day")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Day-over-day increments of the cumulative case counts, Italy then France.
for frame in (data_it, data_fr):
    frame["daily_cases"] = frame["cases"].diff()

# What to draw in each figure; the figures are produced in list order.
plot_specs = [
    {"frame": data_it, "label": "Italy Daily Cases", "color": "green", "title": "Daily Cases in Italy"},
    {"frame": data_fr, "label": "France Daily Cases", "color": "blue", "title": "Daily Cases in France"},
]

for spec in plot_specs:
    source = spec["frame"]
    plt.figure(figsize=(10, 5))
    plt.plot(source.index, source["daily_cases"], label=spec["label"], color=spec["color"])
    plt.title(spec["title"])
    plt.xlabel("Date")
    plt.ylabel("Cases per Day")
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Day-over-day increments of the cumulative recovered counts, computed on the
# truncated frames. The frames are copied first so that the new column is
# added to independent objects.
data_it_cut = data_it_cut.copy()
data_it_cut["daily_recovered"] = data_it_cut["recovered"].diff()
data_fr_cut = data_fr_cut.copy()
data_fr_cut["daily_recovered"] = data_fr_cut["recovered"].diff()

# Each figure shows a single country, so its title names that country only,
# and the y axis is labelled with the quantity plotted (recovered, not cases).
plt.figure(figsize=(10, 5))
plt.plot(data_it_cut.index, data_it_cut["daily_recovered"], label="Italy Daily Recovered", color="green")
plt.title("Daily Recovered in Italy")
plt.xlabel("Date")
plt.ylabel("Recovered per Day")
plt.legend()
plt.grid()
plt.show()

plt.figure(figsize=(10, 5))
plt.plot(data_fr_cut.index, data_fr_cut["daily_recovered"], label="France Daily Recovered", color="blue")
plt.title("Daily Recovered in France")
plt.xlabel("Date")
plt.ylabel("Recovered per Day")
plt.legend()
plt.grid()
plt.show()

Let's now focus on analyzing the weekly data.

In [None]:
# Deaths accumulated over the preceding 7 rows: difference between the
# cumulative count and its value 7 rows earlier. assign() returns a new frame
# holding the extra column, so each name is rebound to a fresh copy.
data_it = data_it.assign(weekly_deaths=data_it["deaths"].diff(periods=7))
data_fr = data_fr.assign(weekly_deaths=data_fr["deaths"].diff(periods=7))

# One figure per country: Italy in green, then France in blue.
for frame, country, line_color in ((data_it, "Italy", "green"), (data_fr, "France", "blue")):
    plt.figure(figsize=(10, 5))
    plt.plot(frame.index, frame["weekly_deaths"], label=f"{country} Weekly Deaths", color=line_color)
    plt.title(f"Weekly Deaths in {country}")
    plt.xlabel("Date")
    plt.ylabel("Deaths per Week")
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Cases accumulated over the preceding 7 rows: difference between the
# cumulative count and its value 7 rows earlier.
data_it["weekly_cases"] = data_it["cases"].diff(periods=7)
data_fr["weekly_cases"] = data_fr["cases"].diff(periods=7)

# Each figure shows a single country, so its title names that country only.
plt.figure(figsize=(10, 5))
plt.plot(data_it.index, data_it["weekly_cases"], label="Italy Weekly Cases", color="green")
plt.title("Weekly Cases in Italy")
plt.xlabel("Date")
plt.ylabel("Cases per Week")
plt.legend()
plt.grid()
plt.show()

plt.figure(figsize=(10, 5))
plt.plot(data_fr.index, data_fr["weekly_cases"], label="France Weekly Cases", color="blue")
plt.title("Weekly Cases in France")
plt.xlabel("Date")
plt.ylabel("Cases per Week")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Recovered accumulated over the preceding 7 rows, computed on the truncated
# frames. The frames are copied first so that the new column is added to
# independent objects.
data_it_cut = data_it_cut.copy()
data_fr_cut = data_fr_cut.copy()
data_it_cut["weekly_recovered"] = data_it_cut["recovered"].diff(periods=7)
data_fr_cut["weekly_recovered"] = data_fr_cut["recovered"].diff(periods=7)

plt.figure(figsize=(10, 5))
plt.plot(data_it_cut.index, data_it_cut["weekly_recovered"], label="Italy Weekly Recovered", color="green")
plt.title("Weekly Recovered in Italy ")
plt.xlabel("Date")
plt.ylabel("Recovered per Week")
plt.legend()
plt.grid()
plt.show()

# The legend label names the plotted quantity (recovered, not cases).
plt.figure(figsize=(10, 5))
plt.plot(data_fr_cut.index, data_fr_cut["weekly_recovered"], label="France Weekly Recovered", color="blue")
plt.title("Weekly Recovered in France")
plt.xlabel("Date")
plt.ylabel("Recovered per Week")
plt.legend()
plt.grid()
plt.show()

It can be observed that the graphs exhibit similar patterns. Special attention can be given to the relationship between "cases" and "recovered": the latter resembles the former shifted by a few weeks. This can be explained by recalling that the time between testing positive and recovering (returning to negative) is typically a few weeks.


In general, the behaviors between France and Italy are comparable, except for the "recovered" variable.

In [None]:
sns.set_theme(style="ticks", palette="pastel")

# Long-format frame with one row per observation. The country label is
# attached to each country's own series before stacking, so values and labels
# cannot get out of step.
df = pd.concat(
    [
        pd.DataFrame({"deaths": data_it["daily_deaths"], "country": "Italy"}),
        pd.DataFrame({"deaths": data_fr["daily_deaths"], "country": "France"}),
    ],
    ignore_index=True,
)

plt.figure(figsize=(10, 6))
# The palette is keyed by country, so colours do not depend on category order.
sns.boxplot(
    x="country",
    y="deaths",
    data=df,
    palette={"Italy": "green", "France": "lightblue"},
    hue="country",
)

# Tidy up the look of the chart
sns.despine(offset=10, trim=True)
plt.title("Boxplots of Daily Deaths in Italy and France")
plt.xlabel("Country")
plt.ylabel("Number of Daily Deaths")

# Show the chart
plt.show()

In [None]:
sns.set_theme(style="ticks", palette="pastel")

# Long-format frame with one row per observation. The country label is
# attached to each country's own series before stacking, so values and labels
# cannot get out of step.
df = pd.concat(
    [
        pd.DataFrame({"cases": data_it["daily_cases"], "country": "Italy"}),
        pd.DataFrame({"cases": data_fr["daily_cases"], "country": "France"}),
    ],
    ignore_index=True,
)

plt.figure(figsize=(10, 6))
# The palette is keyed by country, so colours do not depend on category order.
sns.boxplot(
    x="country",
    y="cases",
    data=df,
    palette={"Italy": "green", "France": "lightblue"},
    hue="country",
)

# Tidy up the look of the chart
sns.despine(offset=10, trim=True)
plt.title("Boxplots of Daily Cases in Italy and France")
plt.xlabel("Country")
plt.ylabel("Number of Daily Cases")

# Show the chart
plt.show()

In [None]:
sns.set_theme(style="ticks", palette="pastel")

# Long-format frame with one row per observation, built from the truncated
# frames. The country label is attached to each country's own series before
# stacking, so values and labels cannot get out of step.
df = pd.concat(
    [
        pd.DataFrame({"recovered": data_it_cut["daily_recovered"], "country": "Italy"}),
        pd.DataFrame({"recovered": data_fr_cut["daily_recovered"], "country": "France"}),
    ],
    ignore_index=True,
)

plt.figure(figsize=(10, 6))
# The palette is keyed by country, so colours do not depend on category order.
sns.boxplot(
    x="country",
    y="recovered",
    data=df,
    palette={"Italy": "green", "France": "lightblue"},
    hue="country",
)

# Tidy up the look of the chart
sns.despine(offset=10, trim=True)
plt.title("Boxplots of Daily Recovered in Italy and France")
plt.xlabel("Country")
plt.ylabel("Number of Daily Recovered")

# Show the chart
plt.show()

We use the cut dataset for the recovered counts.