# Clean Google Mobility Data 



In [1]:
# Dependencies and Setup
import json
import os
import pandas as pd
import urllib.request
import requests
# from config import db_pwd, db_user
from sqlalchemy import create_engine

## Store Google CSV into DataFrame

In [2]:
csv_file = "Resources/google_mob_US.csv"
google_data_df = pd.read_csv(csv_file)
google_data_df.head()

Unnamed: 0.1,Unnamed: 0,State,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


### Rename the dataframe with select columns

In [3]:
google_data_df = google_data_df.rename(columns = {"State":'states', "date":'dates', "retail_and_recreation":'retail_recreation',
                            "grocery_and_pharmacy":'grocery_pharmacy', "parks":'parks', "transit_stations":"transit", "workplaces":"workplaces", "residential":"residential"})
google_data_df.head()

Unnamed: 0.1,Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


In [4]:
google_df = google_data_df[["states", "dates", "retail_recreation",
                            "grocery_pharmacy", "parks", "transit", "workplaces", "residential"]]
google_df.head()

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


In [5]:
# Reseting the index and saving the cleaned file to csv
# clean_google_mob_US_df = clean_google_mob_US_df.reset_index(drop = True)
# clean_google_mob_US_df.to_csv("../Data/clean_google_mob_US.csv")

## Adding the 30 Day Moving Average 

In [6]:
google_df["SMA_retail_recreation"] = google_df.iloc[:,2].rolling(window=30).mean()
google_df["SMA_grocery_pharmacy"] = google_df.iloc[:,3].rolling(window=30).mean()
google_df["SMA_parks"] = google_df.iloc[:,4].rolling(window=30).mean()
google_df["SMA_transit"] = google_df.iloc[:,5].rolling(window=30).mean()
google_df["SMA_workplaces"] = google_df.iloc[:,6].rolling(window=30).mean()
google_df["SMA_residential"] = google_df.iloc[:,7].rolling(window=30).mean()
google_df.head()


Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0,,,,,,
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0,,,,,,
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0,,,,,,
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0,,,,,,
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0,,,,,,


In [7]:
# Reseting the index and saving the cleaned file to csv
google_us_df = google_df.reset_index(drop = True)
google_us_df.to_csv("data/google_us.csv")


## Test Data - Virginia

In [8]:
# Selecting only the data for the US. This dropped the data to 456634 rows × 14 columns
google_mob_VA = google_df.loc[google_df["states"] == "Virginia"]
google_mob_VA = google_mob_VA.reset_index(drop = True)
google_mob_VA.head()

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Virginia,2020-02-15,2.0,2.0,3.0,4.0,0.0,0.0,0.4,,,-10.366667,-25.7,
1,Virginia,2020-02-16,4.0,0.0,9.0,-1.0,0.0,0.0,0.433333,,,-10.266667,-24.666667,
2,Virginia,2020-02-17,8.0,2.0,41.0,-6.0,-27.0,6.0,0.8,,,-10.0,-24.5,
3,Virginia,2020-02-18,1.0,2.0,22.0,4.0,3.0,0.0,0.766667,,,-9.433333,-23.366667,
4,Virginia,2020-02-19,2.0,2.0,6.0,3.0,2.0,0.0,1.166667,,,-8.533333,-22.166667,


In [9]:
# Grouping by date, so we can get all the data for all states into one date
# skipnabool, default is True, and all NA/null values are excluded, when computing the result.
data_by_date_VA_df = pd.DataFrame(google_mob_VA.groupby("dates").mean())
data_by_date_VA_df.reset_index(inplace = True)
data_by_date_VA_df.head()

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,7.033898,1.70339,1.652174,5.875,0.604396,-0.482759,-8.546237,1.238667,,-20.316667,-27.33451,10.044444
1,2020-02-16,5.916667,0.087719,3.52,3.189189,0.431818,-0.448276,-8.282424,1.136054,,-22.645455,-26.292593,9.622222
2,2020-02-17,6.327434,0.435897,33.157895,1.833333,-19.23622,5.015873,-7.633951,1.117687,,-21.636364,-26.077395,9.383333
3,2020-02-18,2.787611,1.947826,19.894737,4.545455,2.492063,-0.267606,-7.303704,1.110204,,-20.518182,-24.853488,8.944444
4,2020-02-19,3.333333,2.104348,0.05,3.191489,1.769841,-0.152778,-6.801235,1.252381,,-19.306061,-23.726357,8.527778


In [10]:
# Reseting the index and saving the cleaned file to csv
data_by_date_VA_df.to_csv("data/google_mob_VAA.csv")

In [11]:
# Grouping by date, so we can get all the data for all states into one date
# skipnabool, default is True, and all NA/null values are excluded, when computing the result.
data_by_date_US_df = pd.DataFrame(google_df.groupby("dates").mean())
data_by_date_US_df.reset_index(inplace = True)
data_by_date_US_df.head()

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,6.43849,2.379197,16.658416,4.307071,0.440225,-0.550276,-3.564387,3.430499,38.969935,-12.050508,-24.100974,7.729787
1,2020-02-16,7.986974,2.511178,17.796895,3.75211,0.484,-0.6618,-3.274811,3.276794,35.700233,-12.121905,-23.112474,7.381628
2,2020-02-17,3.865043,-0.345417,27.884146,1.819767,-17.579826,4.363513,-3.060386,3.143699,33.313281,-11.794194,-22.794246,7.208911
3,2020-02-18,-0.193205,-1.269649,5.354559,2.244722,-0.005492,0.794469,-2.998998,2.946242,33.360937,-11.221212,-21.849528,6.90198
4,2020-02-19,2.07987,0.579322,7.988439,2.475737,0.884418,0.17746,-2.630322,2.952743,33.446875,-10.572796,-20.857547,6.536964


In [12]:
# Reseting the index and saving the cleaned file to csv
data_by_date_US_df.to_csv("data/google_mob_US.csv")

## US States 30 Day Moving Average DF

In [13]:
# Review previous df for the entire U.S - Noting number of rows
google_df.head()

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0,,,,,,
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0,,,,,,
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0,,,,,,
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0,,,,,,
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0,,,,,,


In [14]:
google_df.columns

Index(['states', 'dates', 'retail_recreation', 'grocery_pharmacy', 'parks',
       'transit', 'workplaces', 'residential', 'SMA_retail_recreation',
       'SMA_grocery_pharmacy', 'SMA_parks', 'SMA_transit', 'SMA_workplaces',
       'SMA_residential'],
      dtype='object')

In [15]:
# Groupby state and date to return the moving average (30 days)
data_us_df = pd.DataFrame(google_df.groupby(['states','dates']).mean())
data_us_df.reset_index(inplace = True)
data_us_df

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.155172,0.230769,49.571429,5.761905,0.784314,-0.863636,-4.353333,3.583871,1.666667,4.946667,-22.844444,7.441667
1,Alabama,2020-02-16,-1.454545,-4.015625,-5.476190,2.136364,-1.920000,0.523810,-4.335632,3.376344,1.800000,5.013333,-21.935461,7.108333
2,Alabama,2020-02-17,-1.169492,-4.476190,12.933333,5.727273,-16.264706,4.333333,-4.324138,3.105556,2.000000,5.140000,-21.568085,6.916667
3,Alabama,2020-02-18,-4.280702,-5.000000,-12.466667,-0.681818,0.338235,1.435897,-4.388506,2.746667,0.966667,4.946667,-20.662411,6.641667
4,Alabama,2020-02-19,1.438596,-0.730159,6.666667,3.714286,1.397059,0.000000,-4.066667,2.704444,1.466667,4.973333,-19.695035,6.291667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,Wyoming,2020-07-31,16.000000,28.333333,132.000000,44.666667,-24.190476,2.500000,19.620000,18.425000,,35.429167,-22.344444,3.244444
8768,Wyoming,2020-08-01,12.363636,17.200000,333.000000,40.888889,-9.933333,-1.000000,18.860000,17.600000,,35.379167,-21.597778,3.122222
8769,Wyoming,2020-08-02,24.300000,18.750000,321.000000,33.750000,-11.461538,-1.666667,19.056667,17.066667,,34.629167,-20.858974,2.788889
8770,Wyoming,2020-08-03,29.900000,20.200000,323.000000,34.555556,-23.142857,2.555556,20.236667,17.200000,,34.991667,-21.266667,2.977778


In [16]:
# Export the US data as a csv file
data_us_df.to_csv("data/data_us.csv")

In [17]:
# start_date = "2020-02-15"
# end_date = "2020-03-01"
# mask = (google_df["dates"] > start_date) & (google_df["dates"] <= end_date)
# cut_date_df = google_df.loc[mask]
# cut_date_df

## Store NYT COVID cases and deaths CSV into DataFrame

In [18]:
csv_file = "Resources/COVID-states.csv"
covid_data_df = pd.read_csv(csv_file)
covid_data_df.head()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


In [19]:
covid_us = covid_data_df.rename(columns = {"state":'states', "fips": 'fips', "date":'dates', "cases":'cases',
                            "deaths":'deaths'})
covid_us.head()

Unnamed: 0,dates,states,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


### Connect to local database

In [21]:
rds_connection_string = f"{db_user}:{{db_pwd}}@localhost:5432/mobility_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [22]:
engine.table_names()

['google_data', 'us_data', 'covid_data']

### Use pandas to load csv converted DataFrame into database

In [None]:
google_us.to_sql(name='google_data', con=engine, if_exists='append', index=False)

In [23]:
covid_us.to_sql(name='covid_data', con=engine, if_exists='append', index=False)

In [25]:
data_us_df.to_sql(name='us_data', con=engine, if_exists='append', index=False)

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "SMA_retail_recreation" of relation "us_data" does not exist
LINE 1: ...harmacy, parks, transit, workplaces, residential, "SMA_retai...
                                                             ^

[SQL: INSERT INTO us_data (states, dates, retail_recreation, grocery_pharmacy, parks, transit, workplaces, residential, "SMA_retail_recreation", "SMA_grocery_pharmacy", "SMA_parks", "SMA_transit", "SMA_workplaces", "SMA_residential") VALUES (%(states)s, %(dates)s, %(retail_recreation)s, %(grocery_pharmacy)s, %(parks)s, %(transit)s, %(workplaces)s, %(residential)s, %(SMA_retail_recreation)s, %(SMA_grocery_pharmacy)s, %(SMA_parks)s, %(SMA_transit)s, %(SMA_workplaces)s, %(SMA_residential)s)]
[parameters: ({'states': 'Alabama', 'dates': '2020-02-15', 'retail_recreation': 5.155172413793103, 'grocery_pharmacy': 0.23076923076923078, 'parks': 49.57142857142857, 'transit': 5.761904761904762, 'workplaces': 0.7843137254901961, 'residential': -0.8636363636363636, 'SMA_retail_recreation': -4.353333333333334, 'SMA_grocery_pharmacy': 3.583870967741936, 'SMA_parks': 1.6666666666666667, 'SMA_transit': 4.946666666666667, 'SMA_workplaces': -22.844444444444438, 'SMA_residential': 7.441666666666666}, {'states': 'Alabama', 'dates': '2020-02-16', 'retail_recreation': -1.4545454545454546, 'grocery_pharmacy': -4.015625, 'parks': -5.476190476190476, 'transit': 2.1363636363636362, 'workplaces': -1.92, 'residential': 0.5238095238095238, 'SMA_retail_recreation': -4.335632183908046, 'SMA_grocery_pharmacy': 3.3763440860215055, 'SMA_parks': 1.8, 'SMA_transit': 5.013333333333334, 'SMA_workplaces': -21.935460992907803, 'SMA_residential': 7.108333333333333}, {'states': 'Alabama', 'dates': '2020-02-17', 'retail_recreation': -1.1694915254237288, 'grocery_pharmacy': -4.476190476190476, 'parks': 12.933333333333334, 'transit': 5.7272727272727275, 'workplaces': -16.264705882352942, 'residential': 4.333333333333333, 'SMA_retail_recreation': -4.324137931034483, 'SMA_grocery_pharmacy': 3.1055555555555556, 'SMA_parks': 2.0, 'SMA_transit': 5.14, 'SMA_workplaces': -21.56808510638298, 'SMA_residential': 6.916666666666666}, {'states': 'Alabama', 'dates': '2020-02-18', 'retail_recreation': -4.280701754385965, 'grocery_pharmacy': -5.0, 'parks': -12.466666666666667, 'transit': -0.6818181818181818, 'workplaces': 0.3382352941176471, 'residential': 1.435897435897436, 'SMA_retail_recreation': -4.388505747126437, 'SMA_grocery_pharmacy': 2.746666666666667, 'SMA_parks': 0.9666666666666667, 'SMA_transit': 4.946666666666666, 'SMA_workplaces': -20.66241134751773, 'SMA_residential': 6.641666666666666}, {'states': 'Alabama', 'dates': '2020-02-19', 'retail_recreation': 1.4385964912280702, 'grocery_pharmacy': -0.7301587301587301, 'parks': 6.666666666666667, 'transit': 3.7142857142857144, 'workplaces': 1.3970588235294117, 'residential': 0.0, 'SMA_retail_recreation': -4.066666666666667, 'SMA_grocery_pharmacy': 2.704444444444444, 'SMA_parks': 1.4666666666666666, 'SMA_transit': 4.973333333333334, 'SMA_workplaces': -19.69503546099291, 'SMA_residential': 6.291666666666666}, {'states': 'Alabama', 'dates': '2020-02-20', 'retail_recreation': -7.568965517241379, 'grocery_pharmacy': -6.6875, 'parks': -27.58823529411765, 'transit': 1.3636363636363635, 'workplaces': -0.6764705882352942, 'residential': 2.923076923076923, 'SMA_retail_recreation': -4.083908045977012, 'SMA_grocery_pharmacy': 2.2599999999999993, 'SMA_parks': 0.36666666666666664, 'SMA_transit': 4.373333333333333, 'SMA_workplaces': -19.30212765957446, 'SMA_residential': 6.3}, {'states': 'Alabama', 'dates': '2020-02-21', 'retail_recreation': 0.5238095238095238, 'grocery_pharmacy': -2.8, 'parks': 9.333333333333334, 'transit': 6.136363636363637, 'workplaces': 1.4411764705882353, 'residential': 0.3157894736842105, 'SMA_retail_recreation': -3.6609195402298846, 'SMA_grocery_pharmacy': 2.2399999999999998, 'SMA_parks': 0.5333333333333333, 'SMA_transit': 4.72, 'SMA_workplaces': -18.565957446808508, 'SMA_residential': 6.116666666666667}, {'states': 'Alabama', 'dates': '2020-02-22', 'retail_recreation': 4.913793103448276, 'grocery_pharmacy': 0.7846153846153846, 'parks': 61.583333333333336, 'transit': 10.318181818181818, 'workplaces': 4.764705882352941, 'residential': -1.4545454545454546, 'SMA_retail_recreation': -3.3701149425287356, 'SMA_grocery_pharmacy': 2.26, 'SMA_parks': 3.2666666666666666, 'SMA_transit': 4.9799999999999995, 'SMA_workplaces': -17.406382978723407, 'SMA_residential': 5.766666666666667}  ... displaying 10 of 8772 total bound parameter sets ...  {'states': 'Wyoming', 'dates': '2020-08-03', 'retail_recreation': 29.9, 'grocery_pharmacy': 20.2, 'parks': 323.0, 'transit': 34.55555555555556, 'workplaces': -23.142857142857142, 'residential': 2.5555555555555554, 'SMA_retail_recreation': 20.23666666666667, 'SMA_grocery_pharmacy': 17.2, 'SMA_parks': None, 'SMA_transit': 34.99166666666667, 'SMA_workplaces': -21.26666666666667, 'SMA_residential': 2.977777777777778}, {'states': 'Wyoming', 'dates': '2020-08-04', 'retail_recreation': 25.0, 'grocery_pharmacy': 34.142857142857146, 'parks': 223.5, 'transit': 29.88888888888889, 'workplaces': -22.857142857142858, 'residential': 3.5555555555555554, 'SMA_retail_recreation': 20.380000000000003, 'SMA_grocery_pharmacy': 17.491666666666667, 'SMA_parks': None, 'SMA_transit': 35.15833333333333, 'SMA_workplaces': -21.47460317460318, 'SMA_residential': 3.233333333333333})]
(Background on this error at: http://sqlalche.me/e/13/f405)

### Confirm data has been added by querying the tables

In [None]:
pd.read_sql_query('select * from google_data', con=engine).head(10)

In [26]:
pd.read_sql_query('select * from covid_data', con=engine).head(10)

Unnamed: 0,dates,states,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0
5,2020-01-25,California,6,1,0
6,2020-01-25,Illinois,17,1,0
7,2020-01-25,Washington,53,1,0
8,2020-01-26,Arizona,4,1,0
9,2020-01-26,California,6,2,0
