# Clean Google Mobility Data 



In [1]:
# Dependencies and Setup
import json
import os
import pandas as pd
import urllib.request
import requests
# from config import db_pwd, db_user
from sqlalchemy import create_engine

## Store Google CSV into DataFrame

In [2]:
# Load the raw Google mobility export into a DataFrame and preview the first rows.
csv_file = "Resources/google_mob_US.csv"
google_data_df = pd.read_csv(filepath_or_buffer=csv_file)
google_data_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,State,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


### Rename the DataFrame columns and select the ones to keep

In [3]:
# Map the source column names to the shorter names used in the rest of the
# notebook. Columns that are not listed in the mapping keep their names.
column_renames = {
    "State": 'states',
    "date": 'dates',
    "retail_and_recreation": 'retail_recreation',
    "grocery_and_pharmacy": 'grocery_pharmacy',
    "parks": 'parks',
    "transit_stations": "transit",
    "workplaces": "workplaces",
    "residential": "residential",
}
google_data_df = google_data_df.rename(columns=column_renames)
google_data_df

Unnamed: 0.1,Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
456457,456457,Wyoming,2020-07-29,,,,,-29.0,
456458,456458,Wyoming,2020-07-30,,,,,-29.0,
456459,456459,Wyoming,2020-07-31,,,,,-26.0,
456460,456460,Wyoming,2020-08-03,,,,,-26.0,


In [4]:
# Keep only the state, date and six mobility columns; the two leftover
# "Unnamed" index columns from the CSV are dropped here.
# .copy() makes google_df an independent frame, so the SMA_* columns assigned
# to it further down do not trigger pandas' SettingWithCopyWarning.
google_df = google_data_df[["states", "dates", "retail_recreation",
                            "grocery_pharmacy", "parks", "transit", "workplaces", "residential"]].copy()
google_df

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...
456457,Wyoming,2020-07-29,,,,,-29.0,
456458,Wyoming,2020-07-30,,,,,-29.0,
456459,Wyoming,2020-07-31,,,,,-26.0,
456460,Wyoming,2020-08-03,,,,,-26.0,


In [5]:
# # Resetting the index and saving the cleaned file to CSV
# clean_google_mob_US_df = clean_google_mob_US_df.reset_index(drop = True)
# clean_google_mob_US_df.to_csv("../Data/clean_google_mob_US.csv")

In [6]:
# Add a 30-row simple moving average (SMA) column for each mobility metric.
# With the default min_periods, a window containing fewer than 30 non-NaN
# values yields NaN.
# NOTE(review): the window rolls over each full column in row order with no
# grouping by state, so windows span state boundaries -- confirm this is intended.
mobility_metrics = [
    "retail_recreation",
    "grocery_pharmacy",
    "parks",
    "transit",
    "workplaces",
    "residential",
]
for metric in mobility_metrics:
    google_df["SMA_" + metric] = google_df[metric].rolling(window=30).mean()
google_df.head(20)


Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0,,,,,,
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0,,,,,,
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0,,,,,,
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0,,,,,,
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0,,,,,,
5,Alabama,2020-02-20,-7.0,-6.0,-25.0,-1.0,0.0,3.0,,,,,,
6,Alabama,2020-02-21,3.0,0.0,12.0,9.0,1.0,0.0,,,,,,
7,Alabama,2020-02-22,5.0,4.0,46.0,13.0,6.0,-1.0,,,,,,
8,Alabama,2020-02-23,3.0,2.0,24.0,16.0,2.0,0.0,,,,,,
9,Alabama,2020-02-24,-2.0,-3.0,-13.0,2.0,-2.0,2.0,,,,,,


In [None]:
# Reset the index and save the cleaned data to CSV.
# index=False keeps the positional index out of the file; writing it is what
# produces stray "Unnamed: 0" columns when such a CSV is read back with pandas.
google_us_df = google_df.reset_index(drop=True)
google_us_df.to_csv("google_us.csv", index=False)


In [24]:
# Keep only the rows whose state is Virginia and renumber them from zero.
is_virginia = google_df["states"] == "Virginia"
google_mob_VA = google_df.loc[is_virginia].reset_index(drop=True)
google_mob_VA

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Virginia,2020-02-15,2.0,2.0,3.0,4.0,0.0,0.0,0.400000,,,-10.366667,-25.700000,
1,Virginia,2020-02-16,4.0,0.0,9.0,-1.0,0.0,0.0,0.433333,,,-10.266667,-24.666667,
2,Virginia,2020-02-17,8.0,2.0,41.0,-6.0,-27.0,6.0,0.800000,,,-10.000000,-24.500000,
3,Virginia,2020-02-18,1.0,2.0,22.0,4.0,3.0,0.0,0.766667,,,-9.433333,-23.366667,
4,Virginia,2020-02-19,2.0,2.0,6.0,3.0,2.0,0.0,1.166667,,,-8.533333,-22.166667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21343,Virginia,2020-07-31,-7.0,2.0,55.0,,-45.0,14.0,-0.666667,8.666667,,,-37.166667,
21344,Virginia,2020-08-01,-2.0,13.0,,,-12.0,,-1.200000,8.400000,,,-36.033333,
21345,Virginia,2020-08-02,-1.0,6.0,,,-13.0,,-1.566667,7.733333,,,-34.233333,
21346,Virginia,2020-08-03,0.0,18.0,18.0,,-44.0,14.0,-1.033333,7.900000,,,-34.900000,


In [25]:
# Average every numeric column over all Virginia rows that share a date,
# giving one row per date. NaN values are excluded from each mean.
# numeric_only=True keeps the text "states" column out of the aggregation:
# older pandas silently dropped it, while pandas 2.x raises a TypeError.
data_by_date_VA_df = google_mob_VA.groupby("dates").mean(numeric_only=True).reset_index()
data_by_date_VA_df

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,7.033898,1.703390,1.652174,5.875000,0.604396,-0.482759,-8.546237,1.238667,,-20.316667,-27.334510,10.044444
1,2020-02-16,5.916667,0.087719,3.520000,3.189189,0.431818,-0.448276,-8.282424,1.136054,,-22.645455,-26.292593,9.622222
2,2020-02-17,6.327434,0.435897,33.157895,1.833333,-19.236220,5.015873,-7.633951,1.117687,,-21.636364,-26.077395,9.383333
3,2020-02-18,2.787611,1.947826,19.894737,4.545455,2.492063,-0.267606,-7.303704,1.110204,,-20.518182,-24.853488,8.944444
4,2020-02-19,3.333333,2.104348,0.050000,3.191489,1.769841,-0.152778,-6.801235,1.252381,,-19.306061,-23.726357,8.527778
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,2020-07-31,-7.768293,1.766234,22.352941,-14.181818,-32.844961,10.920000,-8.374762,2.038506,42.270833,-17.346237,-29.136066,10.301389
168,2020-08-01,-5.814815,6.671875,52.500000,-5.944444,-10.567010,3.724138,-8.690821,1.997661,42.129167,-18.174444,-28.885263,10.061111
169,2020-08-02,-7.527778,2.241379,55.947368,-4.135135,-14.197802,3.440000,-8.911594,1.625731,41.362500,-17.698925,-27.541111,9.659722
170,2020-08-03,-6.410959,3.485714,8.210526,-17.476190,-32.155039,10.581818,-8.233333,1.618129,49.814286,-16.814583,-27.739891,9.944444


In [28]:
# Save the per-date Virginia averages to CSV. The frame's index is just a
# positional range after reset_index, so index=False keeps it from being
# written as an extra unnamed column.
data_by_date_VA_df.to_csv("google_mob_VAA.csv", index=False)

In [29]:
# Average every numeric column over all rows (all states) that share a date,
# giving one row per date. NaN values are excluded from each mean.
# numeric_only=True keeps the text "states" column out of the aggregation:
# older pandas silently dropped it, while pandas 2.x raises a TypeError.
data_by_date_US_df = google_df.groupby("dates").mean(numeric_only=True).reset_index()
data_by_date_US_df

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,6.438490,2.379197,16.658416,4.307071,0.440225,-0.550276,-3.564387,3.430499,38.969935,-12.050508,-24.100974,7.729787
1,2020-02-16,7.986974,2.511178,17.796895,3.752110,0.484000,-0.661800,-3.274811,3.276794,35.700233,-12.121905,-23.112474,7.381628
2,2020-02-17,3.865043,-0.345417,27.884146,1.819767,-17.579826,4.363513,-3.060386,3.143699,33.313281,-11.794194,-22.794246,7.208911
3,2020-02-18,-0.193205,-1.269649,5.354559,2.244722,-0.005492,0.794469,-2.998998,2.946242,33.360937,-11.221212,-21.849528,6.901980
4,2020-02-19,2.079870,0.579322,7.988439,2.475737,0.884418,0.177460,-2.630322,2.952743,33.446875,-10.572796,-20.857547,6.536964
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,2020-07-31,-3.338162,5.251713,68.145488,-3.857418,-28.497639,7.663490,-3.941568,4.445276,49.408766,-8.166667,-26.052553,7.810243
168,2020-08-01,-4.665718,7.484454,81.304189,1.590763,-9.850601,2.382151,-4.490288,4.048256,48.663862,-8.356387,-25.665812,7.614323
169,2020-08-02,-1.311211,4.662641,76.178253,0.365062,-12.422342,2.327806,-4.641958,3.563785,48.253074,-8.162613,-24.449131,7.253646
170,2020-08-03,1.516552,7.114116,56.354717,-6.108588,-27.709433,7.375178,-3.788574,3.574979,47.636452,-7.875407,-24.548064,7.430512


In [30]:
# Save the per-date US averages to CSV. The frame's index is just a positional
# range after reset_index, so index=False keeps it from being written as an
# extra unnamed column.
data_by_date_US_df.to_csv("google_mob_US.csv", index=False)

In [9]:
# Keep the rows dated strictly after start_date and up to and including
# end_date. The "dates" column is compared directly against the string bounds.
start_date = "2020-02-15"
end_date = "2020-03-01"
date_column = google_df["dates"]
mask = date_column.gt(start_date) & date_column.le(end_date)
cut_date_df = google_df[mask]
cut_date_df

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0,,,,,,
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0,,,,,,
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0,,,,,,
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0,,,,,,
5,Alabama,2020-02-20,-7.0,-6.0,-25.0,-1.0,0.0,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456344,Wyoming,2020-02-24,,,,,-4.0,,,,,,,
456345,Wyoming,2020-02-25,,-9.0,,,2.0,,,,,,,
456346,Wyoming,2020-02-26,,-4.0,,,-6.0,,,,,,,
456347,Wyoming,2020-02-27,,,,,0.0,,,,,,,


## Store NYT COVID cases and deaths CSV into DataFrame

In [None]:
# Load the COVID cases/deaths CSV into a DataFrame and preview the first rows.
csv_file = "Resources/COVID-states.csv"
covid_data_df = pd.read_csv(filepath_or_buffer=csv_file)
covid_data_df.head(n=5)

In [None]:
# Rename the COVID columns so the state and date columns use the same names
# ("states", "dates") as the Google mobility frame; the rest keep their names.
covid_column_names = {
    "state": 'states',
    "fips": 'fips',
    "date": 'dates',
    "cases": 'cases',
    "deaths": 'deaths',
}
covid_us = covid_data_df.rename(columns=covid_column_names)
covid_us.head()

### Connect to local database

In [None]:
# Build the PostgreSQL connection URL and create the engine for mobility_db on
# localhost:5432.
# The password placeholder uses single braces so its value is interpolated;
# doubled braces would put the literal text "{db_pwd}" into the URL.
# NOTE(review): db_user and db_pwd must already be defined when this cell runs;
# the `from config import db_pwd, db_user` line in the setup cell is commented out.
rds_connection_string = f"{db_user}:{db_pwd}@localhost:5432/mobility_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [None]:
# List the tables in the connected database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector's get_table_names() is the supported replacement.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

### Use pandas to load the CSV-derived DataFrames into the database

In [None]:
# Append the cleaned Google mobility rows to the google_data table.
# The cleaned frame built earlier in this notebook is named google_us_df;
# no variable called google_us is ever defined.
google_us_df.to_sql(name='google_data', con=engine, if_exists='append', index=False)

In [None]:
# Append the renamed COVID rows to the covid_data table, without writing the
# DataFrame index as a column.
covid_us.to_sql(
    name='covid_data',
    con=engine,
    if_exists='append',
    index=False,
)

### Confirm data has been added by querying the tables

In [None]:
# Read the google_data table back and show its first ten rows.
google_data_rows = pd.read_sql_query('select * from google_data', con=engine)
google_data_rows.head(10)

In [None]:
# Read the covid_data table back and show its first ten rows.
covid_data_rows = pd.read_sql_query('select * from covid_data', con=engine)
covid_data_rows.head(10)