In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots

from pathlib import Path
data_dir = Path('../input/')

import os
os.listdir(data_dir)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
os.listdir('../input/covid19-global-forecasting-week-3/')

In [None]:
# Load the week-3 training data and expose the country column under the
# shorter name `country`, which the later merges use as their join key.
df = pd.read_csv('../input/covid19-global-forecasting-week-3/train.csv').rename(
    columns={'Country_Region': 'country'}
)

In [None]:
# Load the week-3 test data. The training frame renames 'Country_Region' to
# 'country'; apply the same rename here so that, once both frames are
# concatenated, test rows carry their country in the `country` column instead
# of NaN there plus a separate leftover 'Country_Region' column.
# (rename ignores keys that are absent, so this is a no-op if the column is
# already named differently.)
test = pd.read_csv('../input/covid19-global-forecasting-week-3/test.csv').rename(
    columns={'Country_Region': 'country'}
)

In [None]:
df = pd.concat([df , test])
df

Load the cleaned data from https://www.kaggle.com/imdevskp/corona-virus-report.

In [None]:
# NOTE(review): this merge is disabled and `countries_df` is not defined in
# any cell shown here, yet later cells read columns such as 'population',
# 'density', 'fertility', 'age' and 'urban_percentage' and also reference
# `countries_df` directly. Presumably those columns were meant to come from
# this merge -- confirm and restore the data load before running the notebook.
# df = pd.merge(df, countries_df, on='country')
df

In [None]:
# Per-country yearly values from the hospital-beds file (one column per year).
icu_df = pd.read_csv("../input/hospital-beds-by-country/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv")

# Align the country spellings of this file with the ones used in `df`.
icu_df['Country Name'] = icu_df['Country Name'].replace({
    'United States': 'US',
    'Russian Federation': 'Russia',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Venezuela, RB': 'Venezuela',
})
df['country'] = df['country'].replace('Czechia', 'Czech Republic')


# We wish to have the most recent values, thus we go through every year in
# ascending order and overwrite with that year's value whenever it exists;
# the value left at the end is the most recent one available.
icu_cleaned = pd.DataFrame()
icu_cleaned["country"] = icu_df["Country Name"]
icu_cleaned["icu"] = np.nan

for year in range(1960, 2020):
    year_df = icu_df[str(year)].dropna()
    # Assign with a single .loc call on the frame. The chained form
    # `icu_cleaned["icu"].loc[...] = ...` writes through an intermediate
    # Series, which pandas does not guarantee to propagate to the frame.
    icu_cleaned.loc[year_df.index, "icu"] = year_df.values

df = pd.merge(df, icu_cleaned, on='country', how='left')
df

## 4. Temperature Data
In our next step, we wish to analyze the weather and temperature data of the respective countries since the outbreak of the virus. We have composed a dataset here: https://www.kaggle.com/winterpierre91/covid19-global-weather-data

We hope to find some correlation between certain weather metrics and the growth rate of infections/deaths.

In [None]:
# Load the weather data and bring its country spellings in line with `df`.
df_temperature = pd.read_csv("../input/covid19-global-weather-data/temperature_dataframe.csv")
df_temperature['country'] = df_temperature['country'].replace(
    {'USA': 'US', 'UK': 'United Kingdom'}
)

# Keep only the join keys and the weather measurements. reset_index() adds the
# old row labels as an 'index' column, which later cells select by name.
weather_columns = ["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]
df_temperature = (
    df_temperature[weather_columns]
    .reset_index()
    .rename(columns={'province': 'state'})
)

# Parsed copy of the date (the raw 'date' column is kept as read from the CSV).
df_temperature["Date"] = pd.to_datetime(df_temperature['date'])
# Missing province becomes the empty string.
df_temperature['state'] = df_temperature['state'].fillna('')


df_temperature.head()

In [None]:
df["Date"] = pd.to_datetime(df['Date'])


In [None]:
# Attach the weather measurements to every case row.
# The weather frame has one row per (country, state, Date). Joining on
# country and Date alone pairs every province of a country with every weather
# row of that country, multiplying the rows of multi-province countries.
# The province is therefore part of the join key as well.
# Assumes the competition frame stores the province in 'Province_State'
# (same naming scheme as 'Country_Region') -- TODO confirm.
# Missing provinces become '' to match how `df_temperature['state']` is filled.
df = (
    df.assign(state=df['Province_State'].fillna(''))
      .merge(df_temperature, on=['country', 'state', 'Date'], how='left')
)
df.to_csv("countries_icu_temp.csv")

In [None]:
df.tail()

# Regression Model
By implementing a regression model which tries to use the country input variables to predict the most recent number of infections and deaths as target, we can extract the relative feature importance. This can be done pretty well with a Random Forest Regressor.

In [None]:
# `train_data` is bound to the same DataFrame object as `df` (plain assignment,
# no copy), so columns added to `train_data` before it is re-bound also appear
# on `df`.
train_data = df
print(train_data.shape)
train_data.tail()

(We only wish to have a look at countries which already have an infection ratio higher than 0, because the ones that aren't infected yet, might bias the feature importance)

In [None]:
# Confirmed cases as a percentage of the population, rounded to 5 decimals.
threshold = 0
cases_per_capita = train_data['ConfirmedCases'] / train_data['population']
train_data['infectionRate'] = (cases_per_capita * 100).round(5)

# `>=` with threshold 0 keeps rows whose rate is exactly 0; rows with a NaN
# rate fail the comparison and are removed.
keep_rows = train_data['infectionRate'] >= threshold
train_data = train_data[keep_rows]
print(train_data.shape)

In [None]:
train_data.head()

In [None]:
train_data.columns

In [None]:
# Columns carried into the modelling frame: identifiers, both targets, the
# country-level attributes, the weather measurements and the infection rate.
selected_columns = [
    'Id', 'country', 'Date',
    'ConfirmedCases', 'Fatalities',
    'population', 'density', 'fertility', 'age', 'urban_percentage', 'icu',
    'index', 'state',
    'humidity', 'sunHour', 'tempC', 'windspeedKmph',
    'infectionRate',
]
train_data = train_data[selected_columns]
train_data

In [None]:
# Split the frame into the two regression targets (y) and everything else (X).
target_columns = ["ConfirmedCases", "Fatalities"]
y = train_data[target_columns]
X = train_data.drop(columns=target_columns)

display(X.head())
print(X.shape, y.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the numeric columns only. `train_data` also holds
# string and datetime columns ('country', 'state', 'Date'), for which a
# correlation is undefined; selecting the numeric dtypes explicitly keeps the
# call from depending on how the installed pandas treats such columns.
cm = train_data.select_dtypes(include="number").corr()
plt.figure(figsize=(20,10))
sns.heatmap(cm, annot=True)

## Train and Evaluate Model (Random Forest)

In [None]:
# Country-level attribute columns.
fcols = [
    'population',
    'density',
    'fertility',
    'age',
    'urban_percentage',
    'icu',
]
# Weather measurement columns.
mcols = [
    'humidity',
    'sunHour',
    'tempC',
    'windspeedKmph',
]

In [None]:

# Fill missing values country by country:
#   * country-level attributes (fcols) are forward-filled,
#   * weather measurements (mcols) are filled with that country's median.
filled_frames = []
for country in df['country'].unique():
    # Work on an explicit copy so the assignments below never write into a
    # slice of `df`.
    cdf = df.loc[df['country'] == country].copy()
    cdf[fcols] = cdf[fcols].ffill()
    # Fill each weather column with its OWN median. Passing the median Series
    # to DataFrame.fillna maps values by column name; the previous
    # `cdf[col].fillna(cdf.median())` aligned that Series with the row labels
    # instead, so nothing was filled.
    cdf[mcols] = cdf[mcols].fillna(cdf[mcols].median())
    filled_frames.append(cdf)

# Concatenate once at the end instead of growing a frame inside the loop.
new = pd.concat(filled_frames) if filled_frames else df.copy()

# Persist the combined result (previously only the last country's frame,
# `cdf`, was written).
new.to_csv('./nitesh.csv')

In [None]:
new.columns

In [None]:
# Re-bind `df` to the filled frame, restricted to the columns used from here on.
kept_columns = [
    'ConfirmedCases', 'Date', 'Fatalities', 'ForecastId', 'Id', 'country',
    'population', 'density', 'fertility', 'age', 'urban_percentage', 'icu',
    'index', 'state',
    'humidity', 'sunHour', 'tempC', 'windspeedKmph',
]
df = new[kept_columns]

df

In [None]:
def show_missing(df1):
    """
    Print a per-column summary of missing values.
    The table lists, for each column, the number of missing entries ('Total')
    and their share of all rows in percent ('Percent'), ordered by the number
    of missing entries (highest first). Only the first 20 rows are printed.
    params:
        df1: pandas DataFrame to inspect
    """
    null_mask = df1.isnull()
    missing_counts = null_mask.sum()

    # Start from the sorted counts; the percentage column is aligned to that
    # order when it is attached.
    missing_data = missing_counts.sort_values(ascending=False).to_frame('Total')
    missing_data['Percent'] = (missing_counts / null_mask.count()) * 100

    print(missing_data.head(20))



show_missing(df)

### TODO :
* OHE
* Model create

In [None]:
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)

In [None]:
# Split into training and evaluation data:
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_log_error, make_scorer
def rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error between targets and predictions.
    Takes the square root of `mean_squared_log_error(y_true, y_pred)`.
    params:
        y_true: numpy array of ground truth
        y_pred: numpy array of predictions
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Wrap rmsle so it can be passed as `scoring=` to cross_val_score.
# NOTE(review): make_scorer is called with its defaults while RMSLE is an
# error measure (lower is better) -- confirm the default score direction is
# acceptable before using this scorer for model selection.
rmsle_scorer = make_scorer(rmsle)

# Hold out 20% of the rows for validation; the shuffle is seeded with 42.
X_train, X_val, y_train, y_val = tts(X, y, test_size= 0.2, random_state=42, shuffle=True)

In [None]:
# Decision tree for the confirmed-cases target.
# NOTE(review): `criterion="mae"` is kept as in the original; check that the
# installed scikit-learn still accepts this spelling.
model_infected = DecisionTreeRegressor(random_state=42, criterion="mae")

# `y` is built from the "ConfirmedCases" and "Fatalities" columns, so the
# target must be selected by that name ("confirmed" is not a column of
# y_train / y_val and raised a KeyError).
target_infected = "ConfirmedCases"

scores = cross_val_score(model_infected,
                      X_train,
                      y_train[target_infected],
                      cv=5, scoring=rmsle_scorer)

print("Cross Validation of Confirmed Cases: Mean = {}, std = {}".format(scores.mean(), scores.std()))

# Fit on the full training split and score once on the held-out validation split.
model_infected.fit(X_train, y_train[target_infected])
result_infected = rmsle(y_val[target_infected], model_infected.predict(X_val))
print("Validation Infected set RMSLE: {}".format(result_infected))

In [None]:
# Decision tree for the fatalities target.
# NOTE(review): `criterion="mae"` is kept as in the original; check that the
# installed scikit-learn still accepts this spelling.
model_deaths = DecisionTreeRegressor(random_state=42, criterion="mae")

# `y` is built from the "ConfirmedCases" and "Fatalities" columns, so the
# target must be selected by that name ("deaths" is not a column of
# y_train / y_val and raised a KeyError).
target_deaths = "Fatalities"

scores = cross_val_score(model_deaths,
                      X_train,
                      y_train[target_deaths],
                      cv=5, scoring=rmsle_scorer)

print("Cross Validation of Fatal Cases: Mean = {}, std = {}".format(scores.mean(), scores.std()))

# Fit on the full training split and score once on the held-out validation split.
model_deaths.fit(X_train, y_train[target_deaths])
result_deaths = rmsle(y_val[target_deaths], model_deaths.predict(X_val))
print("Validation Death set RMSLE: {}".format(result_deaths))

In [None]:
# Final evaluation: mean of the two validation RMSLE values computed above.
print("Final Validatio score: {}".format(np.mean([result_infected, result_deaths])))

## Extract Features for Infections

In [None]:
# Refit both trees on all rows (training and validation splits together)
# before reading their feature importances. The targets are selected by the
# column names `y` actually has ("ConfirmedCases" / "Fatalities"); the former
# "confirmed" / "deaths" keys do not exist in `y`.
model_infected = model_infected.fit(X, y["ConfirmedCases"])
model_deaths = model_deaths.fit(X, y["Fatalities"])

In [None]:
def show_feature_importance(forest, feature_names=None):
    """
    Prints the features of a fitted tree-based model ranked by importance
    (most important first) and plots the same ranking as a bar chart.
    params:
        forest: fitted estimator exposing `feature_importances_`
        feature_names: optional sequence of feature names, in the column
            order the model was fitted on. Defaults to the columns of the
            global feature frame `X`.
    raises:
        ValueError: if the number of names differs from the number of
            importances (the names would not describe the model's inputs).
    """
    importances = np.asarray(forest.feature_importances_)
    if feature_names is None:
        # Backward-compatible default: the notebook-level feature frame.
        feature_names = X.columns
    feature_names = np.asarray(feature_names)

    n_features = len(importances)
    if len(feature_names) != n_features:
        raise ValueError(
            "Got {} feature names for {} importances".format(len(feature_names), n_features)
        )

    # Feature positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for rank, idx in enumerate(indices, start=1):
        print("{}, Feature: {}, Importance: {}".format(rank, feature_names[idx], importances[idx]))

    # Plot the feature importances of the forest
    plt.figure(figsize=(20,10))
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices], color="r", align="center")
    plt.xticks(range(n_features), feature_names[indices], rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()

In [None]:
show_feature_importance(model_infected)

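From the plot above we can see that several variables carry a high importance for predicting the number of COVID19 infections, such as: temperature, hours of sunlight, population, wind speed, humidity, and age. Note that feature importance only measures how much a variable contributes to the prediction; it does not tell us whether the relationship is positive or negative.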

These variables should be analyzed carefully as they are not necessarily causal. In terms of population, for example, the more people there are in a country, the more confirmed infections we can expect in absolute numbers.
Also, is it possible, that older people are more likely to be infected? Maybe they are also more likely to be tested, and hence confirmed.
Weather conditions can help the virus to spread faster, such as temperature and humidity. It could be that the more hours of sunlight in a country, the more that people will want to be out and interact with social groups.
The percentage of people living in an urban area also has some importance because it signifies a higher density of people, making it easier to transmit the virus.

## Extract Features for Deaths

In [None]:
show_feature_importance(model_deaths)

When inspecting the mortality, it appears as if weather conditions are more important than factors such as population, age, and urban percentage. Of course the standard deviation of prediction error should be taken into account, but from this data we can conclude that temperature and humidity are important features for predicting COVID19 mortality.

Furthermore, with the current regression model, it does not seem that ICU beds per 1000 people are as important as expected.

## Create Submission
The test set for this week is from the 12th of March until the 23rd of April.

In [None]:
# Load the week-1 test file and rename its columns to the lower-case names
# used by the weather frame.
column_names = {
    'Date': 'date',
    'Province/State': 'state',
    'Country/Region': 'country',
}
test_df = pd.read_csv("../input/covid19-global-forecasting-week-1/test.csv").rename(columns=column_names)

# Parse the dates and replace a missing province with the empty string.
test_df["date"] = pd.to_datetime(test_df['date'])
test_df['state'] = test_df['state'].fillna('')
test_df.info()

In [None]:
# `test_df['date']` is a parsed datetime, whereas `df_temperature['date']`
# still holds the raw strings from the CSV (its parsed copy lives in 'Date').
# Joining the two 'date' columns directly mixes datetime and string keys, so
# the parsed column is used as the join key under the name 'date' instead.
weather_for_test = df_temperature.drop(columns=['date']).rename(columns={'Date': 'date'})
test_df = test_df.merge(weather_for_test, on=['country','date', 'state'], how='left')

# NOTE(review): `countries_df` is not defined in any cell shown here -- confirm
# where it is loaded before running this cell.
test_df = test_df.merge(countries_df, on=['country'], how='left')
test_df = test_df.merge(icu_cleaned, on=['country'], how='left')
test_df.shape

In [None]:
# Build the test feature matrix: index by ForecastId, remove the columns that
# are not passed to the models, and replace remaining missing values with 0.
non_feature_columns = ["Lat", "Long", "date", "state", "country", "index"]
X_test = (
    test_df
    .set_index("ForecastId")
    .drop(columns=non_feature_columns)
    .fillna(0)
)
#X_test = scaler.fit_transform(X_test)

# One prediction per test row from each of the two fitted models.
y_pred_confirmed = model_infected.predict(X_test)
y_pred_deaths = model_deaths.predict(X_test)

In [None]:
# Assemble the submission: one row per ForecastId (used as the index) with the
# two predictions cast to integers, then write it to disk.
forecast_ids = pd.Index(test_df["ForecastId"], name="ForecastId")
submission = pd.DataFrame(
    {
        "ConfirmedCases": y_pred_confirmed.astype(int),
        "Fatalities": y_pred_deaths.astype(int),
    },
    index=forecast_ids,
)
submission.to_csv("submission.csv")
submission.head()

# Time Series Analysis
Let's now look into a time series analysis of the issue using Prophet and using the log of the confirmed cases in Italy as an example.

In [None]:
# NOTE(review): `data` is not defined in any cell shown here; this cell expects
# a frame with 'country', 'date' and 'confirmed' columns -- confirm where it
# is loaded.
from fbprophet import Prophet
m = Prophet()
# Restrict to the rows for Italy.
italy_data = data[data['country']=='Italy']
# Two-column frame for the model: 'ds' holds the dates and 'y' the natural log
# of (confirmed + 1).
ts_df = pd.concat([italy_data['date'], np.log(italy_data['confirmed']+1)], axis=1, keys=['ds', 'y'])
# Not the last expression of the cell, so this preview is not displayed.
ts_df.head()
m.fit(ts_df)

In [None]:
# Ask the fitted model for a future frame with periods=14 and predict on it.
future = m.make_future_dataframe(periods=14)
forecast = m.predict(future)
# Last rows of the forecast: date, point estimate and its lower/upper bounds.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
fig1 = m.plot(forecast)

In [None]:
# Interactive version of the forecast plot, rendered with Plotly.
from fbprophet.plot import plot_plotly
import plotly.offline as py
# Enable Plotly's offline rendering inside the notebook.
py.init_notebook_mode()

fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)

In [None]:
# Re-bind `ts_df` (previously the Italy-only frame) to the full `data` frame.
# This is a plain assignment, not a copy: columns added to `ts_df` before it
# is re-bound also appear on `data`.
# NOTE(review): `data` is not defined in any cell shown here.
ts_df = data
ts_df.info()

In [None]:
# Confirmed cases as a percentage of the population, rounded to 5 decimals.
confirmed_share = ts_df['confirmed'] / ts_df['population']
ts_df['infectionRate'] = (confirmed_share * 100).round(5)

# Keep the rows at or above the threshold, then index them by their date.
at_or_above_threshold = ts_df['infectionRate'] >= threshold
ts_df = ts_df[at_or_above_threshold]
ts_df.index = ts_df.date

In [None]:
# Remove the identifier, location and country-level columns, then discard
# every row that still has a missing value.
columns_to_remove = [
    "country",
    "active",
    "recovered",
    "infectionRate",
    "state",
    "date",
    "Lat",
    "Long",
    "population",
    "density",
    "fertility",
    "age",
    "urban_percentage",
    "icu",
    "index",
]
ts_df = ts_df.drop(columns=columns_to_remove).dropna()

In [None]:
# Keep only the first 60 rows (positional slice).
# NOTE(review): 60 is an unexplained constant -- confirm what it is meant to
# select.
ts_df = ts_df[:60]
ts_df.head()

In [None]:
train_percentage = 0.75

In [None]:
# Ordered split: the first `train_percentage` of the rows are used for
# fitting and the remaining rows for validation. Both slices share the same
# cut point so they neither overlap nor leave a gap (the validation slice
# previously started at (1 - train_percentage) of the rows and therefore
# contained rows that were also in the training slice).
split_at = int(train_percentage * len(ts_df))
train = ts_df[:split_at]
valid = ts_df[split_at:]

In [None]:
# Fit a Vector Auto Regression on the training slice, using all of its columns
# as endogenous variables and the default fit settings.
from statsmodels.tsa.vector_ar.var_model import VAR
model = VAR(endog=train)
model_fit = model.fit()

In [None]:
# Forecast one step per validation row, starting from the data the model was
# fitted on. `endog` is used instead of the deprecated alias `y`.
prediction = model_fit.forecast(model_fit.endog, steps=len(valid))

In [None]:
# Wrap the forecast array in a frame with the same column names as `ts_df`
# (rows are numbered 0..n-1). Building the frame directly from the array
# replaces the element-by-element `pred.iloc[i][j] = ...` loop, whose chained
# indexing is not guaranteed to write into `pred`, and keeps a numeric dtype.
pred = pd.DataFrame(prediction, columns=ts_df.columns)

In [None]:
# Root mean squared error between forecast and validation values, per column.
for column in ts_df.columns:
    rmse = np.sqrt(mean_squared_error(pred[column], valid[column]))
    print('rmse value for', column, 'is : ', rmse)

In [None]:
days_to_predict = 14

# The first forecast step describes the day AFTER the last observation, so the
# forecast dates start one day past the last index entry (they previously
# started on the last observed day itself, labelling every forecast one day
# too early). to_datetime makes this work for string and datetime indexes.
last_observed = pd.to_datetime(ts_df.last_valid_index())
future_dt = pd.date_range(last_observed + pd.Timedelta(days=1), periods=days_to_predict)

# Refit on all available rows and forecast the next `days_to_predict` steps.
# `endog` is used instead of the deprecated alias `y`.
model = VAR(endog=ts_df)
model_fit = model.fit()
yhat = model_fit.forecast(model_fit.endog, steps=days_to_predict)

# Keep only the case counts; the weather columns were inputs to the model but
# are not part of the reported forecast.
pred_df = pd.DataFrame(yhat, columns=ts_df.columns)
pred_df = pred_df.drop(columns=[
                     "humidity",
                     "sunHour",
                     "tempC",
                     "windspeedKmph"
                    ])
pred_df['confirmed'] = pred_df['confirmed'].astype(int)
pred_df['deaths'] = pred_df['deaths'].astype(int)
pred_df.index = future_dt
pred_df.head()

We want to see if we can actually use the time series temperature data to find time-related insights. To do this we are looking into multivariate time series regression tools such as Vector Auto Regression (VAR)...

# Thanks!
If you like this kernel, give us an upvote :)
Stay healthy, you beautiful people!