In [1]:
# library loading
import pandas as pd
from google.cloud import bigquery
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import os

%matplotlib inline
mpl.rcParams["axes.unicode_minus"] = False
mpl.rcParams["font.family"] = "D2Coding"
warnings.filterwarnings(action="ignore")

In [2]:
# List the entries of the current working directory.
os.listdir()

['covid_mobility.ipynb',
 'bigquery.ipynb',
 '.ipynb_checkpoints',
 'bigquery2.ipynb',
 'bigquery1.ipynb',
 'bigquery.py']

In [3]:
# Create the BigQuery client; no project or credentials are passed explicitly here.
client = bigquery.Client()

In [4]:
# Collect the ID of every dataset returned by the client's dataset listing.
[entry.dataset_id for entry in client.list_datasets()]

['austin_311',
 'austin_bikeshare',
 'austin_crime',
 'austin_incidents',
 'austin_waste',
 'baseball',
 'bitcoin_blockchain',
 'bls',
 'bls_qcew',
 'catalonian_mobile_coverage',
 'catalonian_mobile_coverage_eu',
 'census_bureau_acs',
 'census_bureau_construction',
 'census_bureau_international',
 'census_bureau_usa',
 'census_utility',
 'cfpb_complaints',
 'chicago_crime',
 'chicago_taxi_trips',
 'cloud_storage_geo_index',
 'cms_codes',
 'cms_medicare',
 'cms_synthetic_patient_data_omop',
 'covid19_ecdc',
 'covid19_google_mobility',
 'covid19_google_mobility_eu',
 'covid19_govt_response',
 'covid19_italy',
 'covid19_italy_eu',
 'covid19_jhu_csse',
 'covid19_jhu_csse_eu',
 'covid19_nyt',
 'covid19_usafacts',
 'covid19_weathersource_com',
 'crypto_bitcoin',
 'crypto_bitcoin_cash',
 'crypto_dash',
 'crypto_dogecoin',
 'crypto_ethereum',
 'crypto_ethereum_classic',
 'crypto_litecoin',
 'crypto_zcash',
 'eclipse_megamovie',
 'epa_historical_air_quality',
 'ethereum_blockchain',
 'faa',
 'f

In [5]:
# Fetch the dataset's metadata by ID. A bare dataset ID string is resolved
# against the client's default project, which is what the deprecated
# `client.dataset(...)` helper did; passing the string avoids that helper.
dataset = client.get_dataset("covid19_usafacts")
dataset

Dataset(DatasetReference('bigquery-public-data', 'covid19_usafacts'))

In [6]:
# Collect the ID of every table listed for the dataset.
[tbl.table_id for tbl in client.list_tables(dataset)]

['confirmed_cases', 'deaths', 'summary']

In [7]:
# Fetch the `confirmed_cases` table object from the dataset and display it.
table = client.get_table(dataset.table("confirmed_cases"))
table

Table(TableReference(DatasetReference('bigquery-public-data', 'covid19_usafacts'), 'confirmed_cases'))

In [8]:
# Row count reported by the row iterator for `table`.
client.list_rows(table).total_rows

3195

In [9]:
# Load the rows of `table` (the `confirmed_cases` table) into a pandas DataFrame.
cases = client.list_rows(table).to_dataframe()

In [10]:
# Load the rows of the dataset's `summary` table into a pandas DataFrame.
summary = client.list_rows(
    client.get_table(dataset.table("summary"))
).to_dataframe()

In [11]:
# Load the rows of the dataset's `deaths` table into a pandas DataFrame.
deaths = client.list_rows(
    client.get_table(dataset.table("deaths"))
).to_dataframe()

In [12]:
# Preview the first rows of the confirmed-cases frame (one column per date).
cases.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,_1_22_20,_1_23_20,_1_24_20,_1_25_20,_1_26_20,_1_27_20,...,_5_28_20,_5_29_20,_5_30_20,_5_31_20,_6_1_20,_6_2_20,_6_3_20,_6_4_20,_6_5_20,_6_6_20
0,1073,Jefferson County,AL,1,0,0,0,0,0,0,...,1715,1744,1780,1826,1859,1876,1884,1901,1927,1980
1,1081,Lee County,AL,1,0,0,0,0,0,0,...,533,536,540,547,550,552,558,562,563,577
2,1089,Madison County,AL,1,0,0,0,0,0,0,...,318,324,327,337,343,343,346,353,359,371
3,1097,Mobile County,AL,1,0,0,0,0,0,0,...,2133,2162,2191,2238,2271,2279,2284,2321,2341,2377
4,1101,Montgomery County,AL,1,0,0,0,0,0,0,...,1541,1590,1632,1693,1758,1790,1823,1858,1905,1993


In [13]:
# Preview the first rows of the summary frame (one row per county and date).
summary.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,date,confirmed_cases,deaths
0,0,Statewide Unallocated,AL,1,2020-01-22,0.0,0.0
1,0,Statewide Unallocated,AL,1,2020-01-23,0.0,0.0
2,0,Statewide Unallocated,AL,1,2020-01-24,0.0,0.0
3,0,Statewide Unallocated,AL,1,2020-01-25,0.0,0.0
4,0,Statewide Unallocated,AL,1,2020-01-26,0.0,0.0


In [14]:
# Preview the first rows of the deaths frame (one column per date).
deaths.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,_1_22_20,_1_23_20,_1_24_20,_1_25_20,_1_26_20,_1_27_20,...,_5_28_20,_5_29_20,_5_30_20,_5_31_20,_6_1_20,_6_2_20,_6_3_20,_6_4_20,_6_5_20,_6_6_20
0,39123,Ottawa County,OH,39,0,0,0,0,0,0,...,12,13,16,16,16,17,17,17,17,19
1,19127,Marshall County,IA,19,0,0,0,0,0,0,...,15,16,16,16,16,16,17,18,18,18
2,1125,Tuscaloosa County,AL,1,0,0,0,0,0,0,...,13,14,14,15,16,16,16,16,16,17
3,1013,Butler County,AL,1,0,0,0,0,0,0,...,15,16,17,18,18,18,18,18,21,22
4,19049,Dallas County,IA,19,0,0,0,0,0,0,...,17,20,20,20,21,24,24,25,26,26


In [15]:
# Inspect the column labels: four identifier columns followed by `_M_D_YY`-style date columns.
cases.columns

Index(['county_fips_code', 'county_name', 'state', 'state_fips_code',
       '_1_22_20', '_1_23_20', '_1_24_20', '_1_25_20', '_1_26_20', '_1_27_20',
       ...
       '_5_28_20', '_5_29_20', '_5_30_20', '_5_31_20', '_6_1_20', '_6_2_20',
       '_6_3_20', '_6_4_20', '_6_5_20', '_6_6_20'],
      dtype='object', length=141)

In [16]:
# Reshape the wide frame to long format: the four identifier columns are kept,
# and every remaining (date) column becomes a row with its label in
# `yyyymmddhh` and its count in `value`.
cases_ = pd.melt(
    cases,
    id_vars=["county_fips_code", "county_name", "state", "state_fips_code"],
    var_name="yyyymmddhh",
    value_name="value",
)

In [17]:
# Preview the long-format frame.
cases_.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,yyyymmddhh,value
0,1073,Jefferson County,AL,1,_1_22_20,0
1,1081,Lee County,AL,1,_1_22_20,0
2,1089,Madison County,AL,1,_1_22_20,0
3,1097,Mobile County,AL,1,_1_22_20,0
4,1101,Montgomery County,AL,1,_1_22_20,0


In [18]:
# Summarize column dtypes and non-null counts of the long-format frame.
cases_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437715 entries, 0 to 437714
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   county_fips_code  437715 non-null  object
 1   county_name       437715 non-null  object
 2   state             437715 non-null  object
 3   state_fips_code   437715 non-null  object
 4   yyyymmddhh        437715 non-null  object
 5   value             437715 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 20.0+ MB


In [19]:
def replace(str):
    """Return the given text with every underscore replaced by a single space."""
    # NOTE: the parameter name shadows the built-in `str`; it is kept as-is so
    # existing callers are unaffected.
    # Splitting on "_" and re-joining with " " substitutes all occurrences.
    pieces = str.split("_")
    return " ".join(pieces)

# Add `temp`: the date label with its underscores turned into spaces.
cases_ = cases_.assign(
    temp=lambda frame: frame["yyyymmddhh"].apply(replace)
)

In [20]:
# Preview the frame with the new `temp` column.
cases_.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,yyyymmddhh,value,temp
0,1073,Jefferson County,AL,1,_1_22_20,0,1 22 20
1,1081,Lee County,AL,1,_1_22_20,0,1 22 20
2,1089,Madison County,AL,1,_1_22_20,0,1 22 20
3,1097,Mobile County,AL,1,_1_22_20,0,1 22 20
4,1101,Montgomery County,AL,1,_1_22_20,0,1 22 20


In [21]:
def parse_day(x: str, a: int) -> str:
    """Return the ``a``-th space-separated field of ``x``.

    Parameters
    ----------
    x : str
        Text whose fields are separated by single space characters.
    a : int
        Zero-based index of the field to return.

    Returns
    -------
    str
        The selected field.

    Raises
    ------
    IndexError
        If ``x`` has no field at index ``a``.
    """
    # Split on a literal space (not on arbitrary whitespace), so a leading
    # space produces an empty field at index 0.
    return x.split(" ")[a]
# Split `temp` into its parts: fields 1, 2 and 3 become `month`, `day` and `year`.
cases_ = cases_.assign(
    month=lambda frame: frame["temp"].apply(parse_day, a=1),
    day=lambda frame: frame["temp"].apply(parse_day, a=2),
    year=lambda frame: frame["temp"].apply(parse_day, a=3),
)

In [22]:
# Preview the frame with the new `month`, `day` and `year` columns.
cases_.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,yyyymmddhh,value,temp,month,day,year
0,1073,Jefferson County,AL,1,_1_22_20,0,1 22 20,1,22,20
1,1081,Lee County,AL,1,_1_22_20,0,1 22 20,1,22,20
2,1089,Madison County,AL,1,_1_22_20,0,1 22 20,1,22,20
3,1097,Mobile County,AL,1,_1_22_20,0,1 22 20,1,22,20
4,1101,Montgomery County,AL,1,_1_22_20,0,1 22 20,1,22,20


In [23]:
# Show the summary frame again; its `date` column holds dates such as 2020-01-22.
summary.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,date,confirmed_cases,deaths
0,0,Statewide Unallocated,AL,1,2020-01-22,0.0,0.0
1,0,Statewide Unallocated,AL,1,2020-01-23,0.0,0.0
2,0,Statewide Unallocated,AL,1,2020-01-24,0.0,0.0
3,0,Statewide Unallocated,AL,1,2020-01-25,0.0,0.0
4,0,Statewide Unallocated,AL,1,2020-01-26,0.0,0.0


In [24]:
# Build a proper datetime column from the string parts. The joined text looks
# like "20-1-22" (two-digit year, month, day); an explicit format makes the
# parse deterministic instead of relying on format inference with a
# `yearfirst` hint.
cases_ = cases_.assign(
    yyyymmdd=pd.to_datetime(
        cases_["year"] + "-" + cases_["month"] + "-" + cases_["day"],
        format="%y-%m-%d",
    )
)

In [25]:
# Preview the frame with the parsed `yyyymmdd` datetime column.
cases_.head()

Unnamed: 0,county_fips_code,county_name,state,state_fips_code,yyyymmddhh,value,temp,month,day,year,yyyymmdd
0,1073,Jefferson County,AL,1,_1_22_20,0,1 22 20,1,22,20,2020-01-22
1,1081,Lee County,AL,1,_1_22_20,0,1 22 20,1,22,20,2020-01-22
2,1089,Madison County,AL,1,_1_22_20,0,1 22 20,1,22,20,2020-01-22
3,1097,Mobile County,AL,1,_1_22_20,0,1 22 20,1,22,20,2020-01-22
4,1101,Montgomery County,AL,1,_1_22_20,0,1 22 20,1,22,20,2020-01-22


In [26]:
# Count how many rows fall on each parsed date.
cases_["yyyymmdd"].value_counts()

2020-05-18    3195
2020-02-20    3195
2020-05-24    3195
2020-03-26    3195
2020-03-19    3195
              ... 
2020-05-16    3195
2020-02-02    3195
2020-03-06    3195
2020-05-30    3195
2020-02-04    3195
Name: yyyymmdd, Length: 137, dtype: int64