# Final Project | Prediction possibilities in a pandemic 
by Kevin Spurk

Text

# Table of contents

1. Setup
2. Data cleaning/wrangling
3. EDA
4. Data preprocessing
...

# 1 | Setup

### 1.1 Library imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import random
import datetime
import warnings
warnings.filterwarnings('ignore')

# Show all columns and rows when a frame is displayed.
# The fully-qualified option keys are used on purpose: set_option treats its
# first argument as a pattern that must match exactly one option, and the
# abbreviated 'max_row' also matches 'styler.render.max_rows' on newer pandas
# releases, which makes the call fail as ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 1.2 Data imports

### 1.2.1 google mobility data

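- movement of people in different categories of places (retail & recreation, grocery & pharmacy, parks, transit stations, workplaces, residential), given as percent change from a baseline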
- the baseline is the median value, for the corresponding day of the week, during the 5-week period Jan 3–Feb 6, 2020.

In [2]:
# csv import
data_mob_global = pd.read_csv('data/mobility_google/Global_Mobility_Report.csv')

data_mob_global.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0


In [3]:
data_mob_global.shape

(7697738, 15)

In [4]:
data_mob_global.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7697738 entries, 0 to 7697737
Data columns (total 15 columns):
 #   Column                                              Dtype  
---  ------                                              -----  
 0   country_region_code                                 object 
 1   country_region                                      object 
 2   sub_region_1                                        object 
 3   sub_region_2                                        object 
 4   metro_area                                          object 
 5   iso_3166_2_code                                     object 
 6   census_fips_code                                    float64
 7   place_id                                            object 
 8   date                                                object 
 9   retail_and_recreation_percent_change_from_baseline  float64
 10  grocery_and_pharmacy_percent_change_from_baseline   float64
 11  parks_percent_change_from_baseline   

**results**

data contains:

features to keep:

### 1.2.2 facebook mobility data

In [5]:
# csv import
data_mob_fb = pd.read_csv('data/mobility_fb/movement-range-2021-11-07.txt', sep='\t')

In [6]:
data_mob_fb.head()

Unnamed: 0,ds,country,polygon_source,polygon_id,polygon_name,all_day_bing_tiles_visited_relative_change,all_day_ratio_single_tile_users,baseline_name,baseline_type
0,2021-01-01,AGO,GADM,AGO.10.10_1,Lubango,-0.35291,0.25398,full_february,DAY_OF_WEEK
1,2021-01-02,AGO,GADM,AGO.10.10_1,Lubango,-0.06131,0.1733,full_february,DAY_OF_WEEK
2,2021-01-03,AGO,GADM,AGO.10.10_1,Lubango,-0.00392,0.21932,full_february,DAY_OF_WEEK
3,2021-01-04,AGO,GADM,AGO.10.10_1,Lubango,0.15114,0.11662,full_february,DAY_OF_WEEK
4,2021-01-05,AGO,GADM,AGO.10.10_1,Lubango,0.12696,0.10832,full_february,DAY_OF_WEEK


In [7]:
data_mob_fb.shape

(4568692, 9)

In [8]:
data_mob_fb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4568692 entries, 0 to 4568691
Data columns (total 9 columns):
 #   Column                                      Dtype  
---  ------                                      -----  
 0   ds                                          object 
 1   country                                     object 
 2   polygon_source                              object 
 3   polygon_id                                  object 
 4   polygon_name                                object 
 5   all_day_bing_tiles_visited_relative_change  float64
 6   all_day_ratio_single_tile_users             float64
 7   baseline_name                               object 
 8   baseline_type                               object 
dtypes: float64(2), object(7)
memory usage: 313.7+ MB


### 1.2.3 google trends data

In [9]:
# connect to google and deepL APIs 
from pytrends.request import TrendReq
from pytrends import dailydata
from translate import Translator 

# get authentication key for the DeepL API
# strip() removes surrounding whitespace so that a trailing newline in the key
# file does not end up as part of the key
with open('auth/deepl_auth_key.txt') as f:
    deepl_auth_key = f.read().strip()

# google trends API access request
pytrends = TrendReq(hl='en-US', tz=0)

In [10]:
from pyf.apis import deepl_translate_list
from pyf.imex import df_to_csv_format

# keyword list 
keywords_list = ['covid', 'corona', 'SARS-CoV-2', 'virus', 'symptoms', 'fever', 'cough', 'tiredness', 'loss of smell', 'throat', 'shortness of breath', 'breathing issues', 'headache', 'diarrhea', 'chest pain', 'PCR', 'test', 'sick', 'infection']

# dict of countries and their languages to search in
countries_lang = {'GB': 'en', 'DE': 'de', 'FR': 'fr', 'NL': 'nl',
                  'ES': 'es', 'PT': 'pt', 'PL': 'pl', 'IT': 'it',
                  'AT': 'de', 'DK': 'da', 'SE': 'sv', 'GR': 'el',
                  'CZ': 'cs'}

# translating keyword list into target languages
c19_kw_dict, c19_kw_df = deepl_translate_list(auth=deepl_auth_key, keywords=keywords_list, lang_out=countries_lang)

# export translations to csv
# df_to_csv_format(c19_kw_df, 'keywords_translations', '2')

# c19_kw_df = pd.read_csv('data/multilanguage/keywords_translations_2.csv')


In [11]:
c19_kw_df

Unnamed: 0,GB,DE,FR,NL,ES,PT,PL,IT,AT,DK,SE,GR,CZ
0,covid,covid,covide,covid,covid,covid,covid,covid,covid,covid,covid,covid,covid
1,corona,corona,corona,corona,corona,corona,korona,corona,corona,corona,corona,corona,corona
2,SARS-CoV-2,SARS-CoV-2,SRAS-CoV-2,SARS-CoV-2,SARS-CoV-2,SRA-CoV-2,SARS-CoV-2,SARS-CoV-2,SARS-CoV-2,SARS-CoV-2,SARS-CoV-2,SARS-CoV-2,SARS-CoV-2
3,virus,Virus,virus,virus,virus,vírus,wirus,virus,Virus,virus,virus,ιός,virus
4,symptoms,Symptome,symptômes,symptomen,síntomas,sintomas,objawy,sintomi,Symptome,symptomer,symtom,συμπτώματα,příznaky
5,fever,Fieber,fièvre,koorts,fiebre,febre,gorączka,febbre,Fieber,feber,feber,πυρετός,horečka
6,cough,Husten,toux,hoest,toser,tosse,kaszel,tosse,Husten,hoste,hosta,βήχας,kašel
7,tiredness,Müdigkeit,fatigue,vermoeidheid,cansancio,cansaço,zmęczenie,stanchezza,Müdigkeit,træthed,trötthet,κούραση,únava
8,loss of smell,Geruchsverlust,la perte de l'odorat,verlies van reukzin,pérdida del olfato,perda do olfacto,utrata węchu,perdita dell'olfatto,Geruchsverlust,tab af lugt,förlust av lukt,απώλεια οσμής,ztráta čichu
9,throat,Kehle,gorge,keel,garganta,garganta,gardło,gola,Kehle,hals,hals,λαιμός,krk


In [12]:
import time
from datetime import date, timedelta
from pyf.apis import g_trends_ml

In [13]:

# country codes that are left out of this request batch
geo_excluded = {'GB', 'DE', 'FR', 'ES', 'NL', 'PL', 'PT', 'IT', 'AT', 'DK', 'GR', 'SE'}

# Build a filtered copy instead of deleting keys from an alias of c19_kw_dict:
# the full translation dict stays intact and the cell can be re-run without a
# KeyError on keys that were already removed.
c19_kw_p2 = {geo: kws for geo, kws in c19_kw_dict.items() if geo not in geo_excluded}

# get google trends data for the keywords of the remaining countries
c19_g_trends = g_trends_ml(kw_dict=c19_kw_p2, dt_start='2020-03', dt_end='2021-10', sleep_time=800)


covid:2020-03-01 2020-03-31
covid:2020-04-01 2020-04-30
covid:2020-05-01 2020-05-31
covid:2020-06-01 2020-06-30
covid:2020-07-01 2020-07-31
covid:2020-08-01 2020-08-31
covid:2020-09-01 2020-09-30
covid:2020-10-01 2020-10-31
covid:2020-11-01 2020-11-30
covid:2020-12-01 2020-12-31
covid:2021-01-01 2021-01-31
covid:2021-02-01 2021-02-28
covid:2021-03-01 2021-03-31
covid:2021-04-01 2021-04-30
covid:2021-05-01 2021-05-31
covid:2021-06-01 2021-06-30
covid:2021-07-01 2021-07-31
covid:2021-08-01 2021-08-31
covid:2021-09-01 2021-09-30
covid:2021-10-01 2021-10-31
corona:2020-03-01 2020-03-31
corona:2020-04-01 2020-04-30
corona:2020-05-01 2020-05-31
corona:2020-06-01 2020-06-30
corona:2020-07-01 2020-07-31
corona:2020-08-01 2020-08-31
corona:2020-09-01 2020-09-30
corona:2020-10-01 2020-10-31
corona:2020-11-01 2020-11-30
corona:2020-12-01 2020-12-31
corona:2021-01-01 2021-01-31
corona:2021-02-01 2021-02-28
corona:2021-03-01 2021-03-31
corona:2021-04-01 2021-04-30
corona:2021-05-01 2021-05-31
coron

ReadTimeout: HTTPSConnectionPool(host='trends.google.com', port=443): Read timed out. (read timeout=5)

In [None]:
import glob
from pyf.imex import import_concat_csv

# import google trends data from csv files
data_g_trends = import_concat_csv('data/keywords_google/', 'g_trend_2020-03_2021-10_GB')
     

In [None]:
data_g_trends.head()

In [None]:
len(data_g_trends)

In [None]:
'''
def trends_complete_timeframe(df, dt_start, dt_end, kw_listed):
    
    # convert input to datetime
    dt_s = datetime.datetime.strptime(dt_start, '%Y-%m-%d').date()
    dt_e = datetime.datetime.strptime(dt_end, '%Y-%m-%d').date()
    # lists for missing days
    dates_missing = []
    trend_missing = []
    
    # loop to find missing dates in df
    while dt_s <= dt_e:
        date_check = False
        for d in df['date']:
            if d == dt_s:
                date_check = True
        # append missing date to lists
        if date_check == False:
            dates_missing.append(dt_s)
            trend_missing.append(np.NaN)
        dt_s += timedelta(days=1)
    
    # df for missing dates
    df_dates_missing = pd.DataFrame({'date': dates_missing, kw_listed[0]: trend_missing})
    df_completed = pd.concat([df, df_dates_missing], axis=0)
    df_completed['date'] = pd.to_datetime(df_completed['date'], errors='coerce')
    df_completed = df_completed.sort_values(by=['date'], ignore_index=True)
    
    return df_completed   


def trends_df_transform(df):
    # drop unwanted col
    for col in df.columns:
        if col == 'isPartial':
            df = df.drop(['isPartial'], axis=1)
            
    # get date as col
    df.reset_index(inplace=True)
    df['date'] = df['date'].dt.date
    
    # group rows by date
    df = df.groupby(['date']).mean()
    df.reset_index(inplace=True)
    df['date'] = df['date'].dt.date
    
    return df

def df_to_csv_format(df, name_cst, name_var, index=False):
    file_name = name_cst + '_' + name_var
    df.to_csv(f'{file_name}.csv', sep=',', index=index)
    return df


def trends_multilang(kw_dict, dt_start, dt_end, main_lang='GB', cat=0, sleep=60):
    # df for all results
    trends_all = pd.DataFrame({})
    
    # build timeframe str for api
    timeframe = dt_start + ' ' + dt_end
    
    # loop through lang
    for geo in kw_dict.keys():
        # df for results of one loc
        search_trend = pd.DataFrame({})
        # reformat kw list for sub func
        kw_group = list(zip(*[iter(kw_dict[geo])]*1))
        kw_listlist = [list(kw) for kw in kw_group]
        
        # loop through keywords
        for k in kw_listlist:
            # build payload
            print(k)
            try:
                pytrends.build_payload(kw_list=k, cat=cat, timeframe=timeframe, geo=geo)
                # get df with search trend for kw
                k_trend = pytrends.get_historical_interest(k, year_start=int(dt_start.split('-')[0]), month_start=int(dt_start.split('-')[1]), day_start=int(dt_start.split('-')[2]), hour_start=0, year_end=int(dt_end.split('-')[0]), month_end=int(dt_end.split('-')[1]), day_end=int(dt_end.split('-')[2]), hour_end=23, sleep=sleep)
                time.sleep(10)
                print(k_trend.head())
            except requests.exceptions.Timeout:
                print('Timeout occured.')
            
            # transform df
            k_trend = trends_df_transform(k_trend)
            
            # add missing dates
            k_trend_completed = trends_complete_timeframe(df=k_trend, dt_start=dt_start, dt_end=dt_end, kw_listed=k)
            
            # add df with search trend for kw to df for results of one loc
            search_trend['date'] = k_trend_completed['date']
            search_trend[k[0]] = k_trend_completed[k[0]]
        
        # add loc to df 
        search_trend['loc'] = geo
        # export df to csv
        search_details = dt_start + '_' + dt_end + geo
        df_to_csv_format(df=search_trend, name_cst='g_trend', name_var=search_details, index=False)
        
        # concat df with df holding all results
        if len(trends_all) > 0:
            search_details.columns = trends_all.columns
        trends_all = pd.concat([trends_all, search_details], axis=0)
        trends_all.reset_index(inplace=True)
        trends_all = trends_all.sort_values(by=['date'], ignore_index=True)
    
    return trends_all
'''

In [None]:
'''
date_t1 = datetime.datetime.strptime('2020-07-06', '%Y-%m-%d').date()
exists = date_t1 in data4['date']
exists
'''

In [None]:
'''
while test_date_start <= test_date_end:
    if test_date_start not in data4['date']:
        print(test_date_start.strftime('%Y-%m-%d'))
        test_date_start += timedelta(days=1)
'''

'''
for i in data4['date']:
    if i == test_date:
        date_present = True
    else:
        pass
        
print(date_present)
'''

### 1.2.4 twitter trends data

In [None]:
import tweepy as tw 
from pyf.apis import tweepy_auth

# get authentication for twitter API
tw_api, tw_client = tweepy_auth('auth/tw_auth.txt')

In [None]:
tweets = tw_client.get_recent_tweets_count(query='winter', granularity='day')
tweets

In [None]:
# import requests
# import json

### 1.2.5 Covid-19 data

In [None]:
# csv import
#data_c19_hospitalization = pd.read_csv('data/covid19/Covid19-hospital_and_ICU_admission_rates.csv')
#data_c19_cases = pd.read_csv('data/covid19/time_series_covid19_confirmed_global.csv')
#data_c19_deaths = pd.read_csv('data/covid19/time_series_covid19_deaths_global.csv')
#cases_daily_historical = pd.read_csv('data/covid19/cases_daily_historical.csv')
owid_covid_data = pd.read_csv('data/covid19/owid-covid-data.csv')

In [None]:
owid_covid_data[owid_covid_data['location'] == 'Germany'].head()

In [None]:
owid_covid_data.shape

In [None]:
owid_covid_data.info()

# 2 | Data cleaning/wrangling

### 2.1 google mobility data 

### 2.1.1 Feature elimination

In [None]:
# dict of countries, country codes and their languages
tgt_countries = {'GB': ['en', 'united kingdom', 'GBR'], 'DE': ['de', 'germany', 'DEU'], 'FR': ['fr', 'france', 'FRA'], 'NL': ['nl', 'netherlands', 'NLD'],
                 'ES': ['es', 'spain', 'ESP'], 'PT': ['pt', 'portugal', 'PRT'], 'PL': ['pl', 'poland', 'POL'], 'IT': ['it', 'italy', 'ITA'],
                 'AT': ['de', 'austria', 'AUT'], 'DK': ['da', 'denmark', 'DNK'], 'SE': ['sv', 'sweden', 'SWE'], 'GR': ['el', 'greece', 'GRC'],
                 'CZ': ['cs', 'czechia', 'CZE']}


In [None]:
# rows that belong to one of the target countries (ISO2 codes are the dict keys)
countries_iso2 = list(tgt_countries)
is_target_country = data_mob_global['country_region_code'].isin(countries_iso2)

# national rows are the ones without a first-level sub-region
is_national = data_mob_global['sub_region_1'].isna()

# drop the columns at positions 1-7 (country_region ... place_id in the
# .info() listing above); only the country code, the date and the mobility
# measures remain
data_g_mob = (
    data_mob_global[is_target_country & is_national]
    .drop(columns=data_mob_global.columns[1:8])
)


### 2.1.2. data wrangling

In [None]:
# parse the 'date' strings (unparseable values become NaT) and keep only the
# calendar date
data_g_mob['date'] = pd.to_datetime(data_g_mob['date'], errors='coerce').dt.date

# number of dates that could not be parsed
data_g_mob['date'].isna().sum()

In [None]:
# clean column names: strip the common suffix (literal match, no regex needed)
data_g_mob.columns = data_g_mob.columns.str.replace('_percent_change_from_baseline', '', regex=False)

# rename country_region_code column
g_mob_columns = list(data_g_mob.columns.values)

g_mob_new_columns = {col:col for col in g_mob_columns}
g_mob_new_columns['country_region_code'] = 'location'

data_g_mob = data_g_mob.rename(columns=g_mob_new_columns)

# rename locations into iso3 format
# One vectorised replace instead of a row-by-row loop with chained assignment
# (df['location'].iloc[i] = ...): the loop is O(rows x countries) in Python and
# the chained write is not guaranteed to reach the frame (it is a no-op under
# pandas copy-on-write). Values without a mapping are left unchanged.
g_mob_iso2_to_iso3 = {iso2: info[2] for iso2, info in tgt_countries.items()}
data_g_mob['location'] = data_g_mob['location'].replace(g_mob_iso2_to_iso3)
            

In [None]:
from pyf.preprocessing import timeframe_check_by_group, value_overview, df_timeframe_limit, timeseries_interpolation_clustered

# check if range of dates is complete for all countries
g_mob_timeframe = timeframe_check_by_group(df=data_g_mob, groupby='location', timeframe='date')

In [None]:
g_mob_timeframe

In [None]:
# picking only df entries between Mar 2020 and Oct 2021
data_g_mob = df_timeframe_limit(data_g_mob, 'date', '2020-03-01', '2021-10-31')


In [None]:
g_mob_timeframe = timeframe_check_by_group(df=data_g_mob, groupby='location', timeframe='date')


### 2.1.3. null values

In [None]:
g_mob_ov, g_mob_columns = value_overview(df=data_g_mob, neg_allowed=True)

In [None]:
g_mob_ov

In [None]:
data_g_mob[data_g_mob['grocery_and_pharmacy'].isna()]

In [None]:
# interpolating null values
g_mob_full = timeseries_interpolation_clustered(df=data_g_mob, timeframe='date', cluster_by='location', method='polynomial', order=2)


In [None]:
g_mob_full.isna().sum()

### 2.2 facebook mobility data

### 2.2.1 feature elimination

In [None]:
countries_iso3 = [cn[2] for cn in tgt_countries.values()]
data_mob_fb = data_mob_fb[data_mob_fb['country'].isin(countries_iso3)]


In [None]:
data_mob_fb[data_mob_fb['country'] == 'DEU']['ds'].nunique()

### 2.3 google trends data

### 2.3.1 Feature elimination

No feature elimination.


### 2.3.2. data wrangling

In [None]:
# parse the 'date' strings (unparseable values become NaT) and keep only the
# calendar date
data_g_trends['date'] = pd.to_datetime(data_g_trends['date'], errors='coerce').dt.date

# number of dates that could not be parsed
data_g_trends['date'].isna().sum()

In [None]:
from pyf.preprocessing import clean_headers

# clean column names 
data_g_trends = clean_headers(data_g_trends)

# rename locations into iso3 format
# One vectorised replace instead of a row-by-row loop with chained assignment
# (df['location'].iloc[i] = ...): the chained write is not guaranteed to reach
# the frame (it is a no-op under pandas copy-on-write) and the loop is
# O(rows x countries). Values without a mapping are left unchanged.
g_trends_iso2_to_iso3 = {iso2: info[2] for iso2, info in tgt_countries.items()}
data_g_trends['location'] = data_g_trends['location'].replace(g_trends_iso2_to_iso3)
            

In [None]:
# check if range of dates is complete for all countries
g_trends_timeframe = timeframe_check_by_group(df=data_g_trends, groupby='location', timeframe='date')

### 2.3.3. null values

In [None]:
g_trends_ov, g_trends_columns = value_overview(df=data_g_trends, neg_allowed=False)

In [None]:
g_trends_ov

In [None]:
# data_g_trends[data_g_trends['shortness_of_breath'].isna()]

In [None]:


# TODO
# complete data
# change kw translations manually and get data
# interpolation




### 2.4 twitter trends data

### 2.5 Covid-19 data

### 2.5.1 feature elimination

In [None]:
# getting the subset of rows with data of the target countries
countries_iso3 = [cn[2] for cn in tgt_countries.values()]

# columns to keep; everything else is dropped as unnecessary
c19_keep_columns = ['iso_code', 'date', 'new_cases_smoothed',
                    'new_deaths_smoothed', 'new_cases_smoothed_per_million', 'new_deaths_smoothed_per_million',
                    'icu_patients', 'icu_patients_per_million', 'hosp_patients',
                    'hosp_patients_per_million', 'positive_rate']

# Single .loc selection plus an explicit copy: the 'date' assignment below then
# writes to an independent frame instead of to a slice of owid_covid_data
# (which would only be flagged by a SettingWithCopyWarning that the global
# warnings filter hides).
data_c19 = owid_covid_data.loc[owid_covid_data['iso_code'].isin(countries_iso3), c19_keep_columns].copy()

# convert 'date' column to datetime (unparseable values become NaT) and keep
# only the calendar date
data_c19['date'] = pd.to_datetime(data_c19['date'], errors='coerce').dt.date

# number of dates that could not be parsed
data_c19['date'].isna().sum()


### 2.5.2. data wrangling

In [None]:
# rename column iso_code to 'location'; every other column keeps its name
c19_columns = data_c19.columns.tolist()
c19_new_columns = {**dict(zip(c19_columns, c19_columns)), 'iso_code': 'location'}

data_c19 = data_c19.rename(columns=c19_new_columns)

In [None]:
# check if range of dates is complete for all countries
c19_timeframe = timeframe_check_by_group(df=data_c19, groupby='location', timeframe='date')


In [None]:
c19_timeframe

In [None]:
# picking only df entries between Mar 2020 and Oct 2021
data_c19 = df_timeframe_limit(data_c19, 'date', '2020-03-01', '2021-10-31')

c19_timeframe = timeframe_check_by_group(df=data_c19, groupby='location', timeframe='date')


In [None]:
c19_timeframe

In [None]:
c19_ov, c19_columns = value_overview(df=data_c19, neg_allowed=True)

In [None]:
c19_ov

In [None]:
from pyf.preprocessing import complete_timeseries

# adding dates to data from loc 'POL' to have the same dates for all loc
data_pol = data_c19[data_c19['location'] == 'POL']

data_pol = complete_timeseries(df=data_pol, timeframe='date', start_date='2020 03 01', end_date='2021 10 31', constant_col=['location'])

# concating original df and new data for loc 'POL'
data_c19 = data_c19[data_c19['location'] != 'POL']
data_c19 = pd.concat([data_c19, data_pol], axis=0)

In [None]:
data_c19.info()

### 2.5.3 handling null values

In [None]:
data_c19[data_c19['new_cases_smoothed'].isna()]

**Conclusion**

Interpolation is the most valid method of dealing with the null values. It is not going to produce good estimates, though, since the null values are consecutive and occur at the beginning of the time series.

In [None]:
# interpolating null values for all loc
c19_full = timeseries_interpolation_clustered(df=data_c19, timeframe='date', cluster_by='location', method='backfill', limit_direction='backward')




# TODO: insert first NaN manually, change interpolation method to polynomial and check for differences


In [None]:
c19_full.head()

In [None]:
c19_full.isna().sum()

In [None]:
#

In [None]:
# from pyf.preprocessing import timeseries_plot

# plot_pol = timeseries_plot(data_c19, 'date', 'location', 'DEU')

### 2.6. data merging for modeling

In [None]:
# merging google mobility data with target variables

g_mob_full['date'] = pd.to_datetime(g_mob_full['date'], errors='coerce')
c19_full['date'] = pd.to_datetime(c19_full['date'], errors='coerce')

g_mob_covid = pd.merge(g_mob_full, c19_full, how='inner', on=['date', 'location'])

In [None]:
g_mob_covid.head()

In [None]:

# temporary subset google trends
# (.copy() so the 'date' conversion below writes to an independent frame and
# not to a slice of data_g_trends)
g_trends_temp = data_g_trends[['date', 'location', 'covid', 'corona', 'virus', 'symptoms', 'fever', 'headache', 'test', 'sick', 'infection']].copy()

countries_temp = list(g_trends_temp['location'].unique())

# temporary subset covid 19 to work with g trends
# (.copy() for the same reason: c19_full itself must stay untouched)
c19_temp = c19_full[c19_full['location'].isin(countries_temp)].copy()

# merge: both 'date' columns need the same datetime dtype to be usable as join keys
g_trends_temp['date'] = pd.to_datetime(g_trends_temp['date'], errors='coerce')
c19_temp['date'] = pd.to_datetime(c19_temp['date'], errors='coerce')

g_trends_covid = pd.merge(g_trends_temp, c19_temp, how='inner', on=['date', 'location'])


In [None]:
g_trends_covid.head()

In [None]:
# TODO

# merging other data with target variables


# 3 | EDA

### 3.1 overview of numerical features

### 3.1.1 google mobility data

In [None]:
g_mob_full.describe().T

### 3.1.2 facebook mobility data

### 3.1.3 google trends data

In [None]:
g_trends_temp.describe().T

### 3.1.4 twitter trends data

### 3.1.5 covid 19 data

In [None]:
c19_full.describe().T

### 3.2 correlations

### 3.2.1 google mobility data

In [None]:
from pyf.eda import show_heatmap

# correlation heatmap
show_heatmap(g_mob_covid, 14, 12, title='correlations: google mobility & covid 19 data')


In [None]:
# Paired density and scatterplot matrix

# pgrid_1 = sns.PairGrid(g_mob_covid, diag_sharey=False)
# pgrid_1.map_upper(sns.scatterplot, s=15)
# pgrid_1.map_lower(sns.kdeplot)
# pgrid_1.map_diag(sns.kdeplot, lw=2)

### 3.2.2 facebook mobility data

### 3.2.3 google trends data

In [None]:
# correlation heatmap
show_heatmap(g_trends_covid, 14, 12, title='correlations: google trends & covid 19 data')

### 3.2.4 twitter trends data

### 3.2.5 covid 19 data

In [None]:
# correlation heatmap
show_heatmap(c19_full, 10, 8, title='correlations: covid 19 data')


### 3.3 visual exploration

### 3.3.1 google mobility data

In [None]:


def plot_timeseries(df, timeframe, w, h, title=''):
    """Draw one line per column of ``df`` against the ``timeframe`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding the ``timeframe`` column plus the columns to plot.
    timeframe : str
        Name of the column used for the x-axis.
    w, h : float
        Width and height of the figure in inches.
    title : str, optional
        Title shown above the plot; empty by default.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # transform df from wide to long format to get a line for each col
    long_df = df.melt(timeframe, var_name='columns', value_name='vals')

    # the theme has to be set before the figure is created so the new axes pick it up
    custom_params = {"axes.spines.right": False, "axes.spines.top": False, "axes.spines.left": False}
    sns.set_theme(style="whitegrid", rc=custom_params)

    # size this figure explicitly instead of changing the global default
    # figure size, which would also affect every figure created afterwards
    fig, ax = plt.subplots(figsize=(w, h))
    sns.lineplot(x=timeframe, y="vals", hue='columns', data=long_df, ax=ax)
    ax.set_title('\n' + title + '\n', fontsize=16)

    # the legend names the lines, so the axis labels are left empty
    ax.set_xlabel('')
    ax.set_ylabel('')
    plt.xticks(rotation=45, ha='right')
    plt.show()


# TODO
# aggregate daily data to weekly data for plots
# remove vertical grid lines
# plot in different ways (by country, c19 data only, features with similar scales together, ...)
    
    
# Plot the responses for different events and regions




In [None]:
g_mob_plotting = g_mob_full[g_mob_full['location'] == 'DEU']
g_mob_plotting = g_mob_plotting.drop(['location'], axis=1)

plot_timeseries(g_mob_plotting, 'date', 18, 6, 'google mobility data over time')

### 3.3.2 facebook mobility data

### 3.3.3 google trends data

In [None]:
g_trends_plotting = g_trends_temp[g_trends_temp['location'] == 'DEU']
g_trends_plotting = g_trends_plotting.drop(['location'], axis=1)

plot_timeseries(g_trends_plotting, 'date', 18, 6)

### 3.3.4 twitter trends data

### 3.3.5 covid 19 data

In [None]:
c19_plotting = c19_full[c19_full['location'] == 'DEU']
c19_plotting = c19_plotting.drop(['location'], axis=1)

plot_timeseries(c19_plotting, 'date', 18, 6, 'covid 19 data over time')

# 4 | data preprocessing

### 4.1 mobility data

### 4.1.1 feature selection

In [None]:
from pyf.preprocessing import timeseries_clustered_sma, df_date_to_season

# add columns with simple moving averages (5 days) of features
sma_columns = list(g_mob_full.select_dtypes(np.number).columns)
g_mob_covid = timeseries_clustered_sma(df=g_mob_covid, columns=sma_columns, cluster_by='location', timeframe='date', nod=5)

# add season column
g_mob_covid = df_date_to_season(g_mob_covid, 'date')


In [None]:
# TODO

# feature additions: add rolling averages of daily features to smooth out fluctuations (5d, 10d ?), season 
# feature selection: corr, p-value, VIF, chi2 ?, ... ?

# f engineering
# f encoding (season)

In [None]:
c19_full.select_dtypes(np.number).columns