In [26]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
from selenium import webdriver
import time
from zipfile import ZipFile
import json
import urllib
import io
from datetime import date, timedelta
from io import BytesIO
import os

In [30]:
# Helper for get_trips_data: filters and trims the raw trips table
def clean_trips(df_trips):
    """Trim the raw trips table to rows dated 2020-01-01 or later.

    The frame is modified in place (rows and columns are dropped, the index
    is rebuilt) and the same object is returned.

    Parameters
    ----------
    df_trips : pandas.DataFrame
        Must contain a 'date' column plus the 'level', 'state_fips',
        'state_code', 'county_fips' and 'county' columns, which are removed.

    Returns
    -------
    pandas.DataFrame
        ``df_trips`` itself, with 'date' converted to datetime, rows before
        2020-01-01 removed, and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If 'date' or any of the columns to be removed is missing.
    """
    # Parse the dates BEFORE filtering. Comparing a string column against
    # '2020/01/01' is a lexical comparison, and ISO dates such as
    # '2020-06-01' sort below '2020/...' ('-' < '/'), which would silently
    # drop every 2020 row.
    df_trips['date'] = pd.to_datetime(df_trips['date'])
    before_2020 = df_trips['date'] < pd.Timestamp('2020-01-01')
    df_trips.drop(df_trips[before_2020].index, inplace=True)
    df_trips.drop(['level', 'state_fips', 'state_code', 'county_fips', 'county'], axis=1, inplace=True)
    df_trips.reset_index(inplace=True, drop=True)

    return df_trips

def get_trips_data():
    """Download the trips table from data.bts.gov and return it cleaned.

    The JSON endpoint is requested with a ``State Postal Code=CA`` query
    parameter, read with pandas, passed through ``clean_trips``, and its
    'date' column is ensured to be datetime.

    Side effect: sets ``pd.options.mode.chained_assignment`` to ``None``,
    which changes a process-wide pandas option.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``clean_trips``.
    """
    print('Load trips data')
    # Silence pandas' chained-assignment check before any frame is touched.
    pd.options.mode.chained_assignment = None
    trips_url = 'https://data.bts.gov/resource/w96p-f2qv.json?State%20Postal%20Code=CA'
    raw_trips = pd.DataFrame(pd.read_json(trips_url))
    trips_clean = clean_trips(raw_trips)
    trips_clean['date'] = pd.to_datetime(trips_clean['date'])
    return trips_clean

def get_apple_mobility():
    """Scrape the Apple mobility page for its CSV link and load that CSV.

    A Chrome WebDriver session opens the page, the rendered HTML is searched
    for the element with class 'download-button-container', and the href of
    its first link is downloaded and parsed with pandas.

    Returns
    -------
    pandas.DataFrame
        The downloaded CSV, unmodified.

    Raises
    ------
    RuntimeError
        If the rendered page contains no download link.
    requests.HTTPError
        If the CSV download returns an error status.
    """
    print('Load apple mobility data')
    url = 'https://covid19.apple.com/mobility'
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Fixed wait before reading the page source -- presumably so the
        # page's scripts can render the download button; TODO confirm.
        time.sleep(10)
        page_html = driver.page_source
    finally:
        # quit() ends the whole WebDriver session even when loading fails;
        # close() alone only closes the window and was skipped on errors.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    div = soup.find(class_='download-button-container')
    if div is None or div.a is None or not div.a.get('href'):
        raise RuntimeError('No CSV download link found on ' + url)
    csv_url = div.a['href']

    response = requests.get(csv_url)
    # Fail loudly instead of parsing an HTTP error page as CSV.
    response.raise_for_status()
    apple = pd.read_csv(io.StringIO(response.text))

    return apple
    
# Helper for get_google_mobility: keeps one day of California county rows
def clean_google(df, date):
    """Extract one day of California county rows from the mobility table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'sub_region_1' and 'sub_region_2' columns and
        have at least seven columns; the columns at positions 0, 1, 2, 4, 5
        and 6 are removed.
    date : str
        Value compared for equality against the 'date' column.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is left untouched) with the matching rows, rows
        lacking 'sub_region_2' removed, 'sub_region_2' renamed to 'county',
        and a fresh 0..n-1 index.
    """
    wanted_rows = (df['date'] == date) & (df['sub_region_1'] == 'California')
    selected = df[wanted_rows]
    # Build the result with non-mutating calls: in-place edits on a filtered
    # slice trigger pandas' chained-assignment warning and only ran quietly
    # when that check had been disabled elsewhere.
    return (
        selected
        .drop(columns=selected.columns[[0, 1, 2, 4, 5, 6]])
        .dropna(subset=['sub_region_2'])
        .rename(columns={'sub_region_2': 'county'})
        .reset_index(drop=True)
    )

def get_google_mobility():
    """Download Google's regional mobility archive and return one day of it.

    The target day is today minus five days. The US report for that day's
    year is read from the zip archive and reduced by ``clean_google``; the
    'date' column of the result is then converted to datetime.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``clean_google`` with 'date' as datetime.

    Raises
    ------
    KeyError
        If the archive has no US report for the target day's year.
    """
    google_url = 'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'
    d = date.today() - timedelta(days=5)

    print('load google mobility data')
    content = requests.get(google_url)
    # Pick the report by the target day's year. The file name used to be
    # fixed to 2020, so for any later target day no row matched and the
    # result was silently empty.
    us_mobility = '{}_US_Region_Mobility_Report.csv'.format(d.year)
    with ZipFile(BytesIO(content.content)) as zf:
        with zf.open(us_mobility) as report_file:
            df = pd.read_csv(report_file, low_memory=False)
    df_google = clean_google(df, str(d))
    df_google['date'] = pd.to_datetime(df_google['date'])

    return df_google


def get_hospital():
    """Download the hospitals-by-county CSV from data.ca.gov.

    The 'todays_date' column is renamed to 'date' and converted to datetime.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the renamed, parsed 'date' column.
    """
    print('load hospital data')
    url = 'https://data.ca.gov/dataset/529ac907-6ba1-4cb7-9aae-8966fc96aeef/resource/42d33765-20fd-44b8-a978-b083b7542225/download/hospitals_by_county.csv'
    csv_text = requests.get(url).text
    hospital = pd.read_csv(io.StringIO(csv_text)).rename(columns={'todays_date': 'date'})
    hospital['date'] = pd.to_datetime(hospital['date'])

    return hospital

def get_ppe():
    """Download the PPE logistics CSV from data.ca.gov, summed per day and county.

    'quantity_filled' and 'shipping_zip_postal_code' are dropped,
    'product_family' is expanded into indicator columns, 'as_of_date' is
    renamed to 'date', and the remaining columns are summed per
    (date, county) pair.

    Returns
    -------
    pandas.DataFrame
        One row per (date, county) with 'date' converted to datetime.
    """
    # Progress message used to say 'hospital' (copy-paste from get_hospital).
    print('load ppe data')
    url = 'https://data.ca.gov/dataset/da1978f2-068c-472f-be2d-04cdec48c3d9/resource/7d2f11a4-cc0f-4189-8ba4-8bee05493af1/download/logistics_ppe.csv'
    s = requests.get(url).text
    ppe = pd.read_csv(io.StringIO(s))
    ppe = ppe.drop(columns=['quantity_filled', 'shipping_zip_postal_code'])
    # Indicator columns let the groupby-sum below count rows per product family.
    ppe = pd.get_dummies(ppe, columns=['product_family'])
    ppe = ppe.rename(columns={'as_of_date': 'date'})
    ppe = ppe.groupby(by=['date', 'county']).sum().reset_index()
    ppe['date'] = pd.to_datetime(ppe.date)

    return ppe

def get_cases():
    """Download the statewide cases CSV from data.ca.gov.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the 'date' column converted to datetime.
    """
    url = 'https://data.ca.gov/dataset/590188d5-8545-4c93-a9a0-e230f0db7290/resource/926fd08f-cc91-4828-af38-bd45de97f8c3/download/statewide_cases.csv'
    csv_text = requests.get(url).text
    cases = pd.read_csv(io.StringIO(csv_text))
    cases['date'] = pd.to_datetime(cases['date'])

    return cases

In [31]:
# Fetch every source table (each call downloads fresh data).
cases = get_cases()
ppe = get_ppe()
hospital = get_hospital()
mobility = get_google_mobility()
trips = get_trips_data()

# Left-join hospital, PPE and mobility onto the cases table by day and
# county, so every cases row is kept. `trips` is loaded but not merged.
temp = (
    cases
    .merge(hospital, on=['date', 'county'], how='left')
    .merge(ppe, on=['date', 'county'], how='left')
    .merge(mobility, on=['date', 'county'], how='left')
)

# Persist the combined table without the index column.
temp.to_csv('../../data/combined.csv', index=False)



load hospital data
load hospital data
load google mobility data
Load trips data


In [29]:
# Display the trips frame assigned from get_trips_data() above; it is not
# one of the tables merged into combined.csv.
trips

Unnamed: 0,date,pop_stay_at_home,pop_not_stay_at_home,trips,trips_1,trips_1_3,trips_3_5,trips_5_10,trips_10_25,trips_25_50,trips_50_100,trips_100_250,trips_250_500,trips_500,row_id
0,2021-01-16,4846,13958,25264,4540,8624,2352,2151,2649,1454,1212,356,27,1899,06-06063-20210116
1,2020-01-01,9219703,30337342,112606885,31144582,29578049,13125991,15443353,14439973,5553845,2217442,839553,174752,89345,06-00000-20200101
2,2020-01-02,7403357,32153688,140419232,38456444,35683211,16386822,19907224,19541250,7108309,2337243,776777,157347,64605,06-00000-20200102
3,2020-01-03,7322168,32234877,144822552,39324925,36983665,17075874,20792489,20103437,7167063,2346209,793404,164548,70938,06-00000-20200103
4,2020-01-04,8077306,31479739,137327489,37470641,36331753,16574863,19376978,17687365,6409478,2378738,852115,171189,74369,06-00000-20200104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2021-01-19,13881106,25675939,76377435,19765969,18560168,8937540,11150252,11290746,4416911,1465905,529319,145066,115559,06-00000-20210119
386,2021-01-20,13514049,26042996,79369599,20931301,19087726,9295309,11549423,11661906,4532785,1515033,523615,154280,118221,06-00000-20210120
387,2021-01-21,12819363,26737682,89050467,24259283,21304157,10288433,12715962,12827087,5038620,1686871,613186,185866,131002,06-00000-20210121
388,2021-01-22,12828525,26728520,85832876,23016633,20972844,10049534,12218238,12034906,4626435,1836832,742911,209486,125057,06-00000-20210122
