# Clean Google Mobility Data 



In [1]:
# Dependencies and Setup
import json
import os
import pandas as pd
import urllib.request
import requests
# from config import db_pwd, db_user
from sqlalchemy import create_engine

## Store Google CSV into DataFrame

In [2]:
csv_file = "Resources/google_mob_US.csv"
google_data_df = pd.read_csv(csv_file)
google_data_df.head()

Unnamed: 0.1,Unnamed: 0,State,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


### Rename the dataframe with select columns

In [3]:
google_data_df = google_data_df.rename(columns = {"State":'states', "date":'dates', "retail_and_recreation":'retail_recreation',
                            "grocery_and_pharmacy":'grocery_pharmacy', "parks":'parks', "transit_stations":"transit", "workplaces":"workplaces", "residential":"residential"})
google_data_df.head()

Unnamed: 0.1,Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


In [4]:
google_df = google_data_df[["states", "dates", "retail_recreation",
                            "grocery_pharmacy", "parks", "transit", "workplaces", "residential"]]
google_df.head()

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0


In [5]:
# Reseting the index and saving the cleaned file to csv
# clean_google_mob_US_df = clean_google_mob_US_df.reset_index(drop = True)
# clean_google_mob_US_df.to_csv("../Data/clean_google_mob_US.csv")

## Adding the 30 Day Moving Average 

In [6]:
google_df["SMA_retail_recreation"] = google_df.iloc[:,2].rolling(window=30).mean()
google_df["SMA_grocery_pharmacy"] = google_df.iloc[:,3].rolling(window=30).mean()
google_df["SMA_parks"] = google_df.iloc[:,4].rolling(window=30).mean()
google_df["SMA_transit"] = google_df.iloc[:,5].rolling(window=30).mean()
google_df["SMA_workplaces"] = google_df.iloc[:,6].rolling(window=30).mean()
google_df["SMA_residential"] = google_df.iloc[:,7].rolling(window=30).mean()
google_df.head()


Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0,,,,,,
1,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0,,,,,,
2,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0,,,,,,
3,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0,,,,,,
4,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0,,,,,,


In [7]:
# Reseting the index and saving the cleaned file to csv
google_us_df = google_df.reset_index(drop = True)
google_us_df.to_csv("google_us.csv")


## Test Data - Virginia

In [8]:
# Selecting only the data for the US. This dropped the data to 456634 rows × 14 columns
google_mob_VA = google_df.loc[google_df["states"] == "Virginia"]
google_mob_VA = google_mob_VA.reset_index(drop = True)
google_mob_VA.head()

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Virginia,2020-02-15,2.0,2.0,3.0,4.0,0.0,0.0,0.4,,,-10.366667,-25.7,
1,Virginia,2020-02-16,4.0,0.0,9.0,-1.0,0.0,0.0,0.433333,,,-10.266667,-24.666667,
2,Virginia,2020-02-17,8.0,2.0,41.0,-6.0,-27.0,6.0,0.8,,,-10.0,-24.5,
3,Virginia,2020-02-18,1.0,2.0,22.0,4.0,3.0,0.0,0.766667,,,-9.433333,-23.366667,
4,Virginia,2020-02-19,2.0,2.0,6.0,3.0,2.0,0.0,1.166667,,,-8.533333,-22.166667,


In [9]:
# Grouping by date, so we can get all the data for all states into one date
# skipnabool, default is True, and all NA/null values are excluded, when computing the result.
data_by_date_VA_df = pd.DataFrame(google_mob_VA.groupby("dates").mean())
data_by_date_VA_df.reset_index(inplace = True)
data_by_date_VA_df.head()

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,7.033898,1.70339,1.652174,5.875,0.604396,-0.482759,-8.546237,1.238667,,-20.316667,-27.33451,10.044444
1,2020-02-16,5.916667,0.087719,3.52,3.189189,0.431818,-0.448276,-8.282424,1.136054,,-22.645455,-26.292593,9.622222
2,2020-02-17,6.327434,0.435897,33.157895,1.833333,-19.23622,5.015873,-7.633951,1.117687,,-21.636364,-26.077395,9.383333
3,2020-02-18,2.787611,1.947826,19.894737,4.545455,2.492063,-0.267606,-7.303704,1.110204,,-20.518182,-24.853488,8.944444
4,2020-02-19,3.333333,2.104348,0.05,3.191489,1.769841,-0.152778,-6.801235,1.252381,,-19.306061,-23.726357,8.527778


In [10]:
# Reseting the index and saving the cleaned file to csv
data_by_date_VA_df.to_csv("google_mob_VAA.csv")

In [11]:
# Grouping by date, so we can get all the data for all states into one date
# skipnabool, default is True, and all NA/null values are excluded, when computing the result.
data_by_date_US_df = pd.DataFrame(google_df.groupby("dates").mean())
data_by_date_US_df.reset_index(inplace = True)
data_by_date_US_df.head()

Unnamed: 0,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,2020-02-15,6.43849,2.379197,16.658416,4.307071,0.440225,-0.550276,-3.564387,3.430499,38.969935,-12.050508,-24.100974,7.729787
1,2020-02-16,7.986974,2.511178,17.796895,3.75211,0.484,-0.6618,-3.274811,3.276794,35.700233,-12.121905,-23.112474,7.381628
2,2020-02-17,3.865043,-0.345417,27.884146,1.819767,-17.579826,4.363513,-3.060386,3.143699,33.313281,-11.794194,-22.794246,7.208911
3,2020-02-18,-0.193205,-1.269649,5.354559,2.244722,-0.005492,0.794469,-2.998998,2.946242,33.360937,-11.221212,-21.849528,6.90198
4,2020-02-19,2.07987,0.579322,7.988439,2.475737,0.884418,0.17746,-2.630322,2.952743,33.446875,-10.572796,-20.857547,6.536964


In [12]:
# Reseting the index and saving the cleaned file to csv
data_by_date_US_df.to_csv("google_mob_US.csv")

## US States Moving Average DF

In [13]:
# Review previous df for the entire U.S - Noting number of rows
google_df.count

<bound method DataFrame.count of          states       dates  retail_recreation  grocery_pharmacy  parks  \
0       Alabama  2020-02-15                5.0               2.0   39.0   
1       Alabama  2020-02-16                0.0              -2.0   -7.0   
2       Alabama  2020-02-17                3.0               0.0   17.0   
3       Alabama  2020-02-18               -4.0              -3.0  -11.0   
4       Alabama  2020-02-19                4.0               1.0    6.0   
...         ...         ...                ...               ...    ...   
456457  Wyoming  2020-07-29                NaN               NaN    NaN   
456458  Wyoming  2020-07-30                NaN               NaN    NaN   
456459  Wyoming  2020-07-31                NaN               NaN    NaN   
456460  Wyoming  2020-08-03                NaN               NaN    NaN   
456461  Wyoming  2020-08-04                NaN               NaN    NaN   

        transit  workplaces  residential  SMA_retail_recreation  \

In [14]:
# Groupby state and date to return the moving average (30 days)
data_us_df = pd.DataFrame(google_df.groupby(['states','dates']).mean())
data_us_df.reset_index(inplace = True)
data_us_df

Unnamed: 0,states,dates,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,SMA_retail_recreation,SMA_grocery_pharmacy,SMA_parks,SMA_transit,SMA_workplaces,SMA_residential
0,Alabama,2020-02-15,5.155172,0.230769,49.571429,5.761905,0.784314,-0.863636,-4.353333,3.583871,1.666667,4.946667,-22.844444,7.441667
1,Alabama,2020-02-16,-1.454545,-4.015625,-5.476190,2.136364,-1.920000,0.523810,-4.335632,3.376344,1.800000,5.013333,-21.935461,7.108333
2,Alabama,2020-02-17,-1.169492,-4.476190,12.933333,5.727273,-16.264706,4.333333,-4.324138,3.105556,2.000000,5.140000,-21.568085,6.916667
3,Alabama,2020-02-18,-4.280702,-5.000000,-12.466667,-0.681818,0.338235,1.435897,-4.388506,2.746667,0.966667,4.946667,-20.662411,6.641667
4,Alabama,2020-02-19,1.438596,-0.730159,6.666667,3.714286,1.397059,0.000000,-4.066667,2.704444,1.466667,4.973333,-19.695035,6.291667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,Wyoming,2020-07-31,16.000000,28.333333,132.000000,44.666667,-24.190476,2.500000,19.620000,18.425000,,35.429167,-22.344444,3.244444
8768,Wyoming,2020-08-01,12.363636,17.200000,333.000000,40.888889,-9.933333,-1.000000,18.860000,17.600000,,35.379167,-21.597778,3.122222
8769,Wyoming,2020-08-02,24.300000,18.750000,321.000000,33.750000,-11.461538,-1.666667,19.056667,17.066667,,34.629167,-20.858974,2.788889
8770,Wyoming,2020-08-03,29.900000,20.200000,323.000000,34.555556,-23.142857,2.555556,20.236667,17.200000,,34.991667,-21.266667,2.977778


In [15]:
# Export the US data csv
data_us_df.to_csv("data_us.csv")

In [None]:
# start_date = "2020-02-15"
# end_date = "2020-03-01"
# mask = (google_df["dates"] > start_date) & (google_df["dates"] <= end_date)
# cut_date_df = google_df.loc[mask]
# cut_date_df

## Store NYT COVID cases and deaths CSV into DataFrame

In [None]:
csv_file = "Resources/COVID-states.csv"
covid_data_df = pd.read_csv(csv_file)
covid_data_df.head()

In [None]:
covid_us = covid_data_df.rename(columns = {"state":'states', "fips": 'fips', "date":'dates', "cases":'cases',
                            "deaths":'deaths'})
covid_us.head()

### Connect to local database

In [None]:
rds_connection_string = f"{db_user}:{{db_pwd}}@localhost:5432/mobility_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [None]:
engine.table_names()

### Use pandas to load csv converted DataFrame into database

In [None]:
google_us.to_sql(name='google_data', con=engine, if_exists='append', index=False)

In [None]:
covid_us.to_sql(name='covid_data', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the tables

In [None]:
pd.read_sql_query('select * from google_data', con=engine).head(10)

In [None]:
pd.read_sql_query('select * from covid_data', con=engine).head(10)