In [2]:
import json
import os
import pprint
import urllib
import urllib.error
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Get Covid data from api.covidtracking.com
1. Suppose `data` is the retrieved JSON; it has the keys ['links', 'meta', 'data'].<br>
2. For us/daily, data['data'] stores the daily numbers, with keys ['date', 'states', 'cases', 'testing', 'outcomes']:<br>
['date'] is the date in YYYY-MM-DD format<br>
['states'] is the number of states<br>
['cases'] has a single key ['total'] with ['value'] (raw number) and ['calculated'] (some statistics)<br>
['testing'] is structured the same way as ['cases']<br>
['outcomes']<br>
3. For state/daily, data['data'] has the keys ['date', 'state', 'meta', 'cases', 'tests', 'outcomes'].

In [106]:
def get_state_codes():
    """Download the state list from the COVID Tracking API and save it as CSV.

    Fetches ``states.json`` from the v2 API, loads its ``'data'`` entry into a
    DataFrame and writes it to ``./data/states.csv`` (the DataFrame index is
    written as the first, unnamed column). The ``./data`` directory is created
    if it does not exist yet. Returns nothing.
    """
    states_url = 'https://api.covidtracking.com/v2/states.json'
    with urllib.request.urlopen(states_url) as response:
        states_json = json.load(response)
    # Make sure the output directory exists so the write below does not fail
    # on a fresh checkout that has no ./data folder yet.
    os.makedirs('./data', exist_ok=True)
    pd.DataFrame(states_json['data']).to_csv('./data/states.csv')

In [128]:
def retrieve_json(link):
    """Download one COVID Tracking v2 endpoint and cache it as a JSON file.

    The file name is the part of ``link`` after ``'v2/'`` with slashes
    replaced by underscores, written under ``./data/``. Every saved file
    except ``./data/us_daily.json`` is recorded once in the module-level list
    ``covid_files``, which must already exist when this function is called.

    Failures are reported with a printed message instead of being raised, so
    a caller looping over many endpoints keeps going after one bad link.

    Parameters
    ----------
    link : str
        Full URL of a v2 API endpoint; it must contain ``'v2/'``.
    """
    try:
        with urllib.request.urlopen(link) as response:
            payload = json.load(response)
        filename = './data/' + link.split('v2/')[1].replace('/', '_')
        with open(filename, 'w') as f:
            json.dump(payload, f)
        print('saved ' + filename)
        # Skip names that are already recorded so re-running the download
        # does not make later processing handle the same file twice.
        if filename != './data/us_daily.json' and filename not in covid_files:
            covid_files.append(filename)
    except Exception as exc:
        # `except Exception` keeps the best-effort behaviour but, unlike a
        # bare `except`, still lets KeyboardInterrupt/SystemExit stop a run.
        print('retrieval failed ' + link + ' (' + repr(exc) + ')')

In [113]:
def get_all_json():
    """Download the national and the per-state daily COVID data.

    Fetches ``us/daily.json`` first, then reads the ``state_code`` column of
    ``./data/states.csv``, lower-cases each code and fetches
    ``states/<code>/daily/simple.json`` for it. Downloading and saving are
    delegated to ``retrieve_json``. Returns nothing.
    """
    covidtracking_api = 'https://api.covidtracking.com/v2/'

    retrieve_json(covidtracking_api + 'us/daily.json')

    state_codes = pd.read_csv('./data/states.csv')['state_code'].str.lower()
    for code in state_codes:
        retrieve_json(covidtracking_api + 'states/' + code + '/daily/simple.json')

In [169]:
def clean_df_from_json(filename):
    """Flatten one cached COVID Tracking JSON file and save it as CSV.

    The file's ``'data'`` entry is loaded into a DataFrame and its nested
    dict columns are expanded into flat columns.

    * For ``'./data/us_daily.json'`` the ``cases`` and ``testing`` columns
      are expanded into ``<source>_total``, ``<source>_percent``,
      ``<source>_increase`` and ``<source>_7d_change`` and the result is
      written to ``./data/us_daily.csv``.
    * For any other file (per-state data) the ``cases`` and ``tests`` columns
      are expanded into ``cases_total``, ``cases_confirmed``,
      ``cases_probable``, ``tests_total`` and ``tests_positive`` and the
      result is written next to the input with a ``.csv`` extension.

    Parameters
    ----------
    filename : str
        Path of the JSON file to convert.
    """
    with open(filename, 'r') as f:
        payload = json.load(f)
    frame = pd.DataFrame(payload['data'])

    if filename != './data/us_daily.json':
        frame['cases_total'] = frame['cases'].apply(lambda x: x['total'])
        frame['cases_confirmed'] = frame['cases'].apply(lambda x: x['confirmed'])
        frame['cases_probable'] = frame['cases'].apply(lambda x: x['probable'])
        frame['tests_total'] = frame['tests'].apply(lambda x: x['pcr']['specimens']['total'])
        frame['tests_positive'] = frame['tests'].apply(lambda x: x['pcr']['specimens']['positive'])
        frame = frame.drop(['cases', 'tests'], axis=1)
        # Swap only the extension: str.replace('json', 'csv') would also
        # rewrite any "json" that occurs in a directory or file name.
        csv_name = os.path.splitext(filename)[0] + '.csv'
        frame.to_csv(csv_name)
        print('saved ' + csv_name)
    else:
        # Output column suffix -> key inside the nested 'calculated' dict.
        calculated_fields = {
            'percent': 'population_percent',
            'increase': 'change_from_prior_day',
            '7d_change': 'seven_day_change_percent',
        }
        for source in ('cases', 'testing'):
            frame[source + '_total'] = frame[source].apply(lambda x: x['total']['value'])
            for suffix, key in calculated_fields.items():
                # `key=key` binds the current key so each lambda reads its own field.
                frame[f'{source}_{suffix}'] = frame[source].apply(
                    lambda x, key=key: x['total']['calculated'][key])
        frame = frame.drop(['cases', 'testing'], axis=1)
        frame.to_csv('./data/us_daily.csv')
        print('saved us_daily.csv')

In [170]:
def clean_all_df():
    """Convert every cached JSON file to CSV.

    Processes ``./data/us_daily.json`` first and then each path in the
    module-level list ``covid_files``, in order, by calling
    ``clean_df_from_json`` on it.
    """
    clean_df_from_json('./data/us_daily.json')
    for json_path in covid_files:
        clean_df_from_json(json_path)

In [107]:
# Download the state list and write it to ./data/states.csv.
get_state_codes()

In [93]:
# retrieve_json appends the name of each saved per-state file to this list,
# so start from an empty list before downloading.
covid_files = []
get_all_json()

saved states_al_daily_simple.json
saved states_ak_daily_simple.json
saved states_az_daily_simple.json
saved states_ar_daily_simple.json
saved states_ca_daily_simple.json
saved states_co_daily_simple.json
saved states_ct_daily_simple.json
saved states_de_daily_simple.json
saved states_dc_daily_simple.json
saved states_fl_daily_simple.json
saved states_ga_daily_simple.json
saved states_hi_daily_simple.json
saved states_id_daily_simple.json
saved states_il_daily_simple.json
saved states_in_daily_simple.json
saved states_ia_daily_simple.json
saved states_ks_daily_simple.json
saved states_ky_daily_simple.json
saved states_la_daily_simple.json
saved states_me_daily_simple.json
saved states_md_daily_simple.json
saved states_ma_daily_simple.json
saved states_mi_daily_simple.json
saved states_mn_daily_simple.json
saved states_ms_daily_simple.json
saved states_mo_daily_simple.json
saved states_mt_daily_simple.json
saved states_ne_daily_simple.json
saved states_nv_daily_simple.json
saved states_n

In [171]:
# Convert ./data/us_daily.json and every file listed in covid_files to CSV.
clean_all_df()

saved us_daily.csv
saved states_al_daily_simple.csv
saved states_ak_daily_simple.csv
saved states_az_daily_simple.csv
saved states_ar_daily_simple.csv
saved states_ca_daily_simple.csv
saved states_co_daily_simple.csv
saved states_ct_daily_simple.csv
saved states_de_daily_simple.csv
saved states_dc_daily_simple.csv
saved states_fl_daily_simple.csv
saved states_ga_daily_simple.csv
saved states_hi_daily_simple.csv
saved states_id_daily_simple.csv
saved states_il_daily_simple.csv
saved states_in_daily_simple.csv
saved states_ia_daily_simple.csv
saved states_ks_daily_simple.csv
saved states_ky_daily_simple.csv
saved states_la_daily_simple.csv
saved states_me_daily_simple.csv
saved states_md_daily_simple.csv
saved states_ma_daily_simple.csv
saved states_mi_daily_simple.csv
saved states_mn_daily_simple.csv
saved states_ms_daily_simple.csv
saved states_mo_daily_simple.csv
saved states_mt_daily_simple.csv
saved states_ne_daily_simple.csv
saved states_nv_daily_simple.csv
saved states_nh_daily_si

# Get Covid data (of March 10) from worldometers

In [6]:
wdm = 'https://www.worldometers.info/coronavirus/country/us/'
# The timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status surfaces an HTTP error here instead of as a confusing
# table-parsing failure when the page content is used.
wdm_page = requests.get(wdm, timeout=30)
wdm_page.raise_for_status()

In [12]:
# read_html parses the tables found in the page into a list of DataFrames;
# the one at index 1 is used here.
df = pd.read_html(wdm_page.content)[1]
df.columns.to_list() # check weird column names

['#',
 'USAState',
 'TotalCases',
 'NewCases',
 'TotalDeaths',
 'NewDeaths',
 'TotalRecovered',
 'ActiveCases',
 'Tot\xa0Cases/1M pop',
 'Deaths/1M pop',
 'TotalTests',
 'Tests/ 1M pop',
 'Population',
 'Projections']

In [16]:
# Give the headers that contain a non-breaking space or a slash plain,
# identifier-style names, then save the table.
df = df.rename({'Tot\xa0Cases/1M pop': 'TotCasesPer1MPop',
                'Deaths/1M pop': 'DeathsPer1MPop',
                'Tests/ 1M pop': 'TestsPer1MPop'}, axis='columns')
df.to_csv('./data/us_0310.csv')

# Get weather data

In [100]:
# county_stations is always empty as returned by the API, so a separate function,
# get_state_stations, is used to find the stations
def get_state_counties(state):
    """Fetch the county zones of one state from api.weather.gov.

    Parameters
    ----------
    state : str
        State name; it is looked up in the ``name`` column of the
        module-level ``states`` DataFrame to obtain its ``state_code``.

    Returns
    -------
    pandas.DataFrame
        One row per returned zone feature, with the columns ``link``, ``id``,
        ``name``, ``state_code``, ``stations`` and ``state`` (the ``state``
        argument repeated on every row).
    """
    matching_rows = states[states['name'] == state]
    state_abbr = matching_rows['state_code'].values[0]
    t = f'https://api.weather.gov/zones?area={state_abbr}&type=county'
    with urllib.request.urlopen(t) as response:
        raw = response.read()
    print(t + ' api call is successful')
    features = json.loads(raw)['features']

    links = [feature['id'] for feature in features]
    properties = [feature['properties'] for feature in features]
    columns = {
        'link': links,
        'id': [props['id'] for props in properties],
        'name': [props['name'] for props in properties],
        'state_code': [props['state'] for props in properties],
        'stations': [props['observationStations'] for props in properties],  # empty
    }

    counties = pd.DataFrame(columns)
    counties['state'] = [state] * len(features)
    return counties

In [113]:
# the api does not show any stations for a given county, so we have to get all stations in
# a state and match them to their counties
def get_state_stations(state):
    """Fetch the weather stations of one state as forecast/county pairs.

    Parameters
    ----------
    state : str
        State name; it is looked up in the ``name`` column of the
        module-level ``states`` DataFrame to obtain its ``state_code``.

    Returns
    -------
    pandas.DataFrame
        Columns ``forecast`` (the station's ``forecast`` property) and
        ``county`` (the last path segment of the station's ``county``
        property). Stations missing either property are dropped, and
        duplicate rows are removed.
    """
    state_abbr = states.loc[states['name'] == state, 'state_code'].values[0]
    t = f'https://api.weather.gov/stations?state={state_abbr}'
    with urllib.request.urlopen(t) as response:
        raw = response.read()
    print(t + ' api call is successful')
    features = json.loads(raw)['features']

    station_forecasts = []
    station_counties = []
    for feature in features:
        props = feature['properties']
        # A missing property becomes NaN so the row is removed by dropna below.
        station_forecasts.append(props.get('forecast', np.nan))
        if 'county' in props:
            station_counties.append(props['county'].split('/')[-1])
        else:
            station_counties.append(np.nan)

    pairs = pd.DataFrame({'forecast': station_forecasts, 'county': station_counties})
    return pairs.dropna(axis=0).drop_duplicates()

In [122]:
def get_station_county_pair(state):
    """Match a state's weather stations to its counties and save the result.

    Calls ``get_state_counties`` and ``get_state_stations`` for ``state``,
    inner-joins them on the county zone id (``id`` in the county table,
    ``county`` in the station table), drops the now redundant ``county``
    column and writes ``./data/{state}_station_county_pair.csv``.
    Returns nothing.

    Parameters
    ----------
    state : str
        State name, passed to both helper functions and used in the output
        file name.
    """
    counties_of_state = get_state_counties(state)
    stations_of_state = get_state_stations(state)
    # Merge the frames fetched for *this* state rather than notebook-level
    # variables, so the output matches the requested state.
    df = pd.merge(counties_of_state, stations_of_state,
                  left_on='id', right_on='county',
                  suffixes=('_county', '_station'))
    df = df.drop('county', axis=1)
    df.to_csv(f'./data/{state}_station_county_pair.csv')

In [46]:
# State lookup table used by the weather functions (they read its 'name' and
# 'state_code' columns); the first CSV column is used as the index.
states = pd.read_csv('./data/states.csv', index_col=0)

In [58]:
# County zones of California, one row per county.
cali = get_state_counties('California')

In [114]:
# Forecast/county pairs for the California weather stations.
cali_station = get_state_stations('California')

https://api.weather.gov/stations?state=CA api call is successful


In [115]:
# (rows, columns) of the station table.
cali_station.shape

(215, 2)

In [123]:
# Writes ./data/California_station_county_pair.csv.
get_station_county_pair('California')

https://api.weather.gov/zones?area=CA&type=county api call is successful
https://api.weather.gov/stations?state=CA api call is successful


In [68]:
# Exploration: fetch the station list for one state (California).
t = 'https://api.weather.gov/stations?state=CA'
# Keep the raw response body in `k`; it is parsed with json.loads afterwards.
with urllib.request.urlopen(t) as url:
        k = url.read()

In [69]:
# The station records are stored under the 'features' key of the response.
j = json.loads(k)['features']

In [90]:
# Inspect the 'properties' dict of a single station record (index 3517).
j[3517]['properties']

{'@id': 'https://api.weather.gov/stations/KNUC',
 '@type': 'wx:ObservationStation',
 'elevation': {'value': 50.9016, 'unitCode': 'unit:m'},
 'stationIdentifier': 'KNUC',
 'name': 'San Clemente Island NALF',
 'timeZone': 'America/Los_Angeles',
 'county': 'https://api.weather.gov/zones/county/CAC037'}

In [71]:
# Counts the station records whose selected property is truthy.
# NOTE(review): the property key is the empty string '' here, which is not a
# key of the sample record inspected in this notebook, so this lookup would
# raise KeyError for such records; the intended property name appears to
# have been lost - confirm and restore it.
len([item for item in j if item['properties']['']])

3581

In [73]:
# Display the California county table.
cali

Unnamed: 0,link,id,name,state_code,stations,state
0,https://api.weather.gov/zones/county/CAC001,CAC001,Alameda,CA,[],California
1,https://api.weather.gov/zones/county/CAC003,CAC003,Alpine,CA,[],California
2,https://api.weather.gov/zones/county/CAC005,CAC005,Amador,CA,[],California
3,https://api.weather.gov/zones/county/CAC007,CAC007,Butte,CA,[],California
4,https://api.weather.gov/zones/county/CAC009,CAC009,Calaveras,CA,[],California
5,https://api.weather.gov/zones/county/CAC011,CAC011,Colusa,CA,[],California
6,https://api.weather.gov/zones/county/CAC013,CAC013,Contra Costa,CA,[],California
7,https://api.weather.gov/zones/county/CAC015,CAC015,Del Norte,CA,[],California
8,https://api.weather.gov/zones/county/CAC017,CAC017,El Dorado,CA,[],California
9,https://api.weather.gov/zones/county/CAC019,CAC019,Fresno,CA,[],California


In [72]:
# Use this to get the weather observations of a station: requests the
# observations of station C1231 with limit=10 and keeps the raw body in `k3`.
with urllib.request.urlopen('https://api.weather.gov/stations/C1231/observations?limit=10') as url:
        k3 = url.read()

In [73]:
# Parse the observations response and list its top-level keys.
j3 = json.loads(k3)
j3.keys()

dict_keys(['@context', 'type', 'features'])

In [91]:
# Show the 'properties' dict of one observation (row label 8).
pd.DataFrame(j3['features'])['properties'][8]

{'@id': 'https://api.weather.gov/stations/C1231/observations/2021-03-09T15:06:00+00:00',
 '@type': 'wx:ObservationStation',
 'elevation': {'value': 12.81, 'unitCode': 'unit:m'},
 'station': 'https://api.weather.gov/stations/C1231',
 'timestamp': '2021-03-09T15:06:00+00:00',
 'rawMessage': '',
 'textDescription': '',
 'icon': None,
 'presentWeather': [],
 'temperature': {'value': 17.77776,
  'unitCode': 'unit:degC',
  'qualityControl': 'qc:V'},
 'dewpoint': {'value': 12.22778,
  'unitCode': 'unit:degC',
  'qualityControl': 'qc:V'},
 'windDirection': {'value': 84,
  'unitCode': 'unit:degree_(angle)',
  'qualityControl': 'qc:V'},
 'windSpeed': {'value': 1.609344,
  'unitCode': 'unit:km_h-1',
  'qualityControl': 'qc:V'},
 'windGust': {'value': 8.04672,
  'unitCode': 'unit:km_h-1',
  'qualityControl': 'qc:S'},
 'barometricPressure': {'value': 103310,
  'unitCode': 'unit:Pa',
  'qualityControl': 'qc:V'},
 'seaLevelPressure': {'value': None,
  'unitCode': 'unit:Pa',
  'qualityControl': 'qc:Z'