In [12]:
import pandas as pd 
from statistics import mean
import numpy as np
import statsmodels.api as sm 
import json 
import folium 
from statsmodels import stats
import math 
from sklearn import linear_model
from sklearn import model_selection

In [13]:
def merge_data(attr_file): 
    with open(attr_file, "r") as infile:
        attributes = json.load(infile)
    weather_data = pd.read_csv(f'{attributes["Files"]["Base_path"]}{attributes["Files"]["Weather"]}', sep=",")
    if weather_data.isnull().values.any(): 
        return "Missing data"
    weather_data[["country","region"]]= weather_data["iso3166-2"].str.split("-",expand=True)
    weather_data = weather_data[weather_data["country"] == attributes["Country"]]
    corona_data = pd.read_csv(f'{attributes["Files"]["Base_path"]}{attributes["Files"]["Corona"]}', sep="\t")
    with open(f'{attributes["Files"]["Base_path"]}{attributes["Files"]["Metadata"]}') as f:
        country_metadata = json.load(f)
    with open(f'{attributes["Files"]["Base_path"]}{attributes["Files"]["Shapefile"]}') as f: 
        geojson = json.load(f)
    covid_region_map = {int(country_metadata["country_metadata"][i]["covid_region_code"]): country_metadata["country_metadata"][i]["iso3166-2_code"] for i in range(len(country_metadata["country_metadata"]))}
    corona_data["iso3166-2"] = corona_data["region_code"].map(covid_region_map)
    for i in attributes["Replacements"]:
        corona_data["region_name"] = corona_data["region_name"].str.replace(i,attributes["Replacements"][i])
    # print(corona_data.describe)
    merged = weather_data.merge(corona_data, on = attributes["Merge"])
    # print(merged.describe)
    for column in merged.columns: 
        try: 
            merged[column] = merged[column].astype("float")
        except ValueError:
            try: 
                merged[column] = pd.to_datetime(merged[column])
            except ValueError:
                pass 
    merged.replace([np.inf,-np.inf],np.nan,inplace=True)
    merged.dropna(inplace=True)
    
    return merged

In [14]:
corona_and_weather = merge_data("../Scripts/attributes.json")
# nl_coronadata = "../Data/Raw/corona/nl_corona.csv"
nl_metadata = "../Data/Raw/metadata/nl_metadata.json"
nl_geojson = "../Data/Raw/shapefiles/nl.geojson"

# corona_df = pd.read_csv(nl_coronadata, sep = "\t")

with open(nl_metadata, "r") as f:
    country_metadata = json.load(f)

with open(nl_geojson, "r") as f:
    geojson = json.load(f)

rm = {int(country_metadata["country_metadata"][i]["covid_region_code"]): country_metadata["country_metadata"][i]["iso3166-2_code"] for i in range(len(country_metadata["country_metadata"]))}
corona_and_weather["region"] = corona_and_weather["region_code"].map(rm)

pm = {country_metadata["country_metadata"][i]["iso3166-2_name_en"]: country_metadata["country_metadata"][i]["population"]for i in range(len(country_metadata["country_metadata"]))}
corona_and_weather["population"] = corona_and_weather["region_name"].map(pm)

corona_and_weather["cases_pc"] = corona_and_weather["hospitalized_addition"] / corona_and_weather["population"]

# df = corona_and_weather.merge(nl_data, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
# df = df.drop(["region_code", "region_y"], axis = 1)

corona_and_weather = corona_and_weather.dropna()
corona_and_weather.replace([np.inf,-np.inf],np.nan,inplace=True)
corona_and_weather.dropna(inplace=True)
corona_and_weather.isna().any()
corona_and_weather.isin([np.inf,-np.inf]).values.sum()
corona_and_weather

Unnamed: 0,date,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed,country,...,region_name,region_code,deceased_addition,confirmed_addition,hospitalized_addition,deceased_cumulative,confirmed_cumulative,hospitalized_cumulative,population,cases_pc
192,2020-03-14,NL-DR,76.175131,5.677015e+06,2.444319e+06,278.645045,0.000246,10.722513,3.870691,NL,...,Drenthe,22.0,0.0,2.0,0.0,0.0,15.0,0.0,493657,0.000000
193,2020-03-14,NL-FL,80.302209,7.426985e+06,2.443688e+06,279.671005,0.000252,9.391304,5.326425,NL,...,Flevoland,24.0,0.0,7.0,0.0,0.0,14.0,3.0,422979,0.000000
194,2020-03-14,NL-FR,77.900978,6.994390e+06,2.444151e+06,279.227731,0.000179,10.532446,5.612008,NL,...,Friesland,21.0,0.0,1.0,0.0,0.0,8.0,1.0,649944,0.000000
195,2020-03-14,NL-GE,77.892281,7.956310e+06,2.441177e+06,279.821025,0.000212,10.868673,3.875319,NL,...,Gelderland,25.0,1.0,12.0,5.0,1.0,76.0,14.0,2086142,0.000002
196,2020-03-14,NL-GR,74.633632,5.360761e+06,2.446000e+06,278.622180,0.000100,10.408840,4.368198,NL,...,Groningen,20.0,0.0,0.0,0.0,0.0,4.0,1.0,586061,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4327,2021-02-21,NL-NH,75.253727,5.696371e+06,2.429628e+06,283.445914,0.000000,5.766284,4.694117,NL,...,North Holland,27.0,2.0,959.0,11.0,2200.0,179755.0,3654.0,2879611,0.000004
4328,2021-02-21,NL-OV,69.729526,6.547145e+06,2.428964e+06,284.413133,0.000000,5.921844,3.537233,NL,...,Overijssel,23.0,0.0,172.0,2.0,1017.0,74162.0,1123.0,1162500,0.000002
4329,2021-02-21,NL-UT,70.287774,6.386869e+06,2.429348e+06,284.654022,0.000000,5.903930,3.833734,NL,...,Utrecht,26.0,3.0,324.0,5.0,1110.0,84158.0,1967.0,1354979,0.000004
4330,2021-02-21,NL-ZE,67.850837,6.807321e+06,2.428021e+06,284.711623,0.000000,5.975684,4.871302,NL,...,Zeeland,29.0,0.0,79.0,0.0,170.0,15910.0,455.0,383519,0.000000


In [15]:
corona_and_weather.dtypes

date                       datetime64[ns]
iso3166-2                          object
RelativeHumiditySurface           float64
SolarRadiation                    float64
Surfacepressure                   float64
TemperatureAboveGround            float64
Totalprecipitation                float64
UVIndex                           float64
WindSpeed                         float64
country                            object
region                             object
region_name                        object
region_code                       float64
deceased_addition                 float64
confirmed_addition                float64
hospitalized_addition             float64
deceased_cumulative               float64
confirmed_cumulative              float64
hospitalized_cumulative           float64
population                          int64
cases_pc                          float64
dtype: object

In [16]:
stringency = "../Data/Raw/Stringency_Index/stringency.csv"

str_data = pd.read_csv(stringency, sep = ",")
str_data = str_data[1:]
# str_data = str_data[str_data["CountryCode"] == "NL"]
# str_data
# str_data["Date"] = pd.to_datetime(str_data["Date"])
nl_str = str_data[str_data["CountryCode"] == "NLD"]


# nl_str['Date_formatted']=nl_str['Date']

# nl_str['Date_formatted']=nl_str['Date_formatted'].apply(str)


# nl_str['Date_formatted'] = nl_str.Date_formatted.apply(lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:8])
nl_str["date"] = pd.to_datetime(nl_str.Date.apply(str).apply(lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:8]))
# nl_str.pop(nl_str["Date_formatted"])
# nl_str
# nl_str["region_name"] = nl_str["RegionName"]
# nl_str = nl_str.dropna()

# merged = nl_str.merge(corona_and_weather,on=["date"])


  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nl_str["date"] = pd.to_datetime(nl_str.Date.apply(str).apply(lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:8]))


In [17]:
merge_df = corona_and_weather.merge(nl_str,on=["date"])
merge_df = sm.add_constant(merge_df)
weather = corona_and_weather[["RelativeHumiditySurface",
                            "SolarRadiation",
                            "Surfacepressure",
                            "TemperatureAboveGround",
                            "Totalprecipitation",
                            "UVIndex",
                            "WindSpeed"]]
corona = corona_and_weather["cases_pc"]

  result = getattr(ufunc, method)(*inputs, **kwargs)
  x = pd.concat(x[::order], 1)


In [18]:
weather_vars = ["RelativeHumiditySurface",
                            "SolarRadiation",
                            "Surfacepressure",
                            "TemperatureAboveGround",
                            "Totalprecipitation",
                            "UVIndex",
                            "WindSpeed",
                            "const"]
regions = ["const",]


In [19]:
# x_train, x_test, y_train, y_test = model_selection.train_test_split()
regr = linear_model.LinearRegression()
regr.fit(weather,corona)
print(regr.coef_)



[-1.97259605e-07  2.14498641e-13  7.19676664e-12 -1.45907582e-07
  1.33030347e-04 -2.45588924e-07 -1.71165884e-07]


In [20]:
merge_df["cases_pc"]

0       0.000000
1       0.000000
2       0.000000
3       0.000002
4       0.000000
          ...   
4135    0.000004
4136    0.000002
4137    0.000004
4138    0.000000
4139    0.000004
Name: cases_pc, Length: 4140, dtype: float64

In [22]:
merge_df.drop("Date",inplace=True,axis=1)
    

In [32]:
columns = list(merge_df.columns)[2:]

for i in range(18): 
    columns.pop(7)
columns = columns[:49]
columns.append("const")
# print(columns)
# for i in range(len(columns)):
#     if columns[i] == "ConfirmedCases":
#         print(i)
# merge_df = sm.add_constant(merge_df)

merge_df["const"] = [1]*len(merge_df)
columns.pop(41)
columns.pop(41)
print(merge_df['const'])


0       1
1       1
2       1
3       1
4       1
       ..
4135    1
4136    1
4137    1
4138    1
4139    1
Name: const, Length: 4140, dtype: int64


In [37]:
tmp = merge_df[columns]
tmp.replace(np.nan,0,inplace=True)
for i in tmp.dtypes:
    print(i)
# tmp = np.asarray(tmp)
# type(tmp)

float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [42]:
tmp_two = np.asarray(merge_df["cases_pc"])


In [43]:
est = sm.OLS(tmp_two, tmp, hasconst = True).fit(cov_type = "cluster", cov_kwds = {"groups": merge_df["iso3166-2"]}, use_t = True)
# est = est
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.254
Model:                            OLS   Adj. R-squared:                  0.249
Method:                 Least Squares   F-statistic:                 3.227e+09
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           8.12e-51
Time:                        13:00:43   Log-Likelihood:                 43708.
No. Observations:                4140   AIC:                        -8.736e+04
Df Residuals:                    4110   BIC:                        -8.717e+04
Df Model:                          29                                         
Covariance Type:              cluster                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------



In [44]:
regr = linear_model.LinearRegression()
regr.fit(tmp,tmp_two)
# print(regr.coef_)
coef_list = list(regr.coef_)
for i in range(len(columns)):
    print(f"{columns[i]} - {coef_list[i]}")

RelativeHumiditySurface - -1.1431584059305806e-07
SolarRadiation - 1.7337695786941807e-13
Surfacepressure - 3.6223221144697245e-12
TemperatureAboveGround - -1.5557525020273595e-07
Totalprecipitation - 1.907185912297377e-06
UVIndex - -3.7193558318390065e-07
WindSpeed - -1.0377658078815683e-07
C1_Flag - 9.416470972452911e-15
C2_Workplace closing - 8.543301247274753e-07
C2_Flag - -3.2953552115132538e-18
C3_Cancel public events - 9.028163646823833e-07
C3_Flag - 1.1280823403994778e-06
C4_Restrictions on gatherings - -2.714987656662192e-06
C4_Flag - 7.157428404298838e-20
C5_Close public transport - 1.4307111699476502e-06
C5_Flag - 1.4307111699466303e-06
C6_Stay at home requirements - 3.789017457579329e-06
C6_Flag - 6.776263578034403e-20
C7_Restrictions on internal movement - -3.845249699954067e-08
C7_Flag - -3.845249700004075e-08
C8_International travel controls - 9.783230540787169e-20
E1_Income support - 2.2616542336512512e-06
E1_Flag - 1.083143062971807e-06
E2_Debt/contract relief - 8.9751