# DBAP CA2
## Jia Lin 22117644 teamC
## TABLE 2: US Covid-19 Data Over Time
### Import Necessary Libraries

In [1]:
import csv                  # library csv is used to store csv to PostgreSQL
import datetime             # library datetime is used to convert between datetime and string
import json                 # library json is used to load json objects
import os                   # library os is used to read the API key from an environment variable
import time                 # library time is used to measure the execution time of program
from datetime import date

import pandas               # library pandas is used to create an ETL pipeline
import psycopg2             # library psycopg2 is used to make connection with PostgreSQL
import pymongo              # library pymongo is used to build connection with MongoDB
import requests             # library requests is used to fetch data from API

### Function Pool

In [2]:
def getResponse(url, query, timeout=30):
    """Send a GET request to ``url`` with ``query`` as its query-string parameters.

    The request headers are taken from the module-level ``headers`` dict, which
    must already be defined when this function is called.

    Parameters
    ----------
    url : str
        Endpoint to request.
    query : dict
        Query-string parameters passed to ``requests`` as ``params``.
    timeout : float or tuple, optional
        Seconds to wait for the server before ``requests`` gives up and raises
        a timeout exception. Defaults to 30 so that a single stalled
        connection cannot hang a long multi-request collection run forever.

    Returns
    -------
    requests.Response
        The response object as returned by ``requests.request``.
    """
    return requests.request("GET", url, headers=headers, params=query, timeout=timeout)

def getJSON_obj(response):
    """Parse the body of ``response`` as JSON.

    The raw bytes in ``response.content`` are decoded as UTF-8 and then handed
    to ``json.loads``; the resulting Python object is returned.
    """
    body_text = response.content.decode("utf-8")
    parsed_body = json.loads(body_text)
    return parsed_body

def get_data_value(url, query):
    """Fetch ``url`` with ``query`` and return the value stored under the "data" key.

    A notice is printed when that value is empty, but the (empty) value is
    still returned to the caller.
    """
    response = getResponse(url, query)
    data_value = getJSON_obj(response)["data"]
    if not len(data_value):
        print("Notice: there is no data on this date!")
    return data_value

def get_data_values(url, query_list):
    """Return the "data" value of ``url`` for every query in ``query_list``.

    The queries are issued one after another through ``get_data_value`` and
    the results are returned as a list in the same order as ``query_list``.
    """
    data_values = []
    for single_query in query_list:
        data_values.append(get_data_value(url, single_query))
    return data_values



def a_day_before(day_date):
    """Return the date exactly one day earlier than ``day_date``."""
    one_day = datetime.timedelta(days=1)
    return day_date - one_day

def str_to_date(s):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime.date``.

    The time-of-day part must be present in ``s`` but is dropped from the result.
    """
    parsed_moment = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return parsed_moment.date()

def date_to_str(dt):
    """Format ``dt`` as a 'YYYY-MM-DD' string."""
    day_format = '%Y-%m-%d'
    return dt.strftime(day_format)

def get_date_list(start_date, number_of_days):
    """Return ``number_of_days`` consecutive dates as 'YYYY-MM-DD' strings.

    The list starts at ``start_date`` and walks backwards one day at a time,
    so the first element is ``start_date`` itself and later elements are
    progressively older.
    """
    date_strings = []
    for days_back in range(number_of_days):
        day = start_date - datetime.timedelta(days=days_back)
        date_strings.append(date_to_str(day))
    return date_strings

def get_csv(collection_name, csv_file_name):
    """Export every document of a MongoDB collection to a CSV file.

    Parameters
    ----------
    collection_name : collection object
        Object whose ``find()`` returns the documents to export (despite the
        parameter name this is the collection itself, not a string).
    csv_file_name : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The result is the file written to ``csv_file_name``.
    """
    # Convert the mongo documents to a DataFrame (one row per document)
    documents = pandas.DataFrame(collection_name.find())
    # Discard the Mongo ID for the documents. errors="ignore" keeps an empty
    # collection (which yields a frame without an "_id" column) from raising.
    documents = documents.drop(columns=["_id"], errors="ignore")
    # Export to CSV, leaving out the row "labels" (row numbers). ``sep`` is
    # passed by keyword because newer pandas versions no longer accept
    # to_csv arguments after the path positionally.
    documents.to_csv(csv_file_name, sep=",", index=False)

## ETL pipeline: Extract
### Fetch Data from API (Extract)

In [5]:
# Source code: https://rapidapi.com/axisbits-axisbits-default/api/covid-19-statistics/
# Based on public data by Johns Hopkins CSSE
# Jia Lin modified the source code to fit the requirements of this project.
#
# The RapidAPI key is a credential, so it is read from the RAPIDAPI_KEY
# environment variable instead of being stored in the notebook.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable to your RapidAPI key before running this cell")
headers = {"X-RapidAPI-Key": rapidapi_key,
           "X-RapidAPI-Host": "covid-19-statistics.p.rapidapi.com"}

# Reports by date and country/province.
# Cities data is available for the USA only.
url_US_reports = "https://covid-19-statistics.p.rapidapi.com/reports"
# The date is a vital parameter: if the API has not been updated for the most
# recent day yet, a specific date has to be selected, so a fixed date is used.
# (A dynamic alternative would be date_to_str(a_day_before(date.today())).)
query_US_reports = {"iso":"USA","date":"2022-12-06"}
data_value_US_reports = get_data_value(url_US_reports, query_US_reports)
print("The latest Covid-19 data for US: \n{}\n".format(data_value_US_reports))

The latest Covid-19 data for US: 
[{'date': '2022-12-06', 'confirmed': 1859858, 'deaths': 14705, 'recovered': 0, 'confirmed_diff': 0, 'deaths_diff': 0, 'recovered_diff': 0, 'last_update': '2022-12-07 04:20:58', 'active': 1845153, 'active_diff': 0, 'fatality_rate': 0.0079, 'region': {'iso': 'USA', 'name': 'US', 'province': 'Washington', 'lat': '47.4009', 'long': '-121.4905', 'cities': [{'name': 'Adams', 'date': '2022-12-06', 'fips': 53001, 'lat': '46.98299757', 'long': '-118.5601734', 'confirmed': 5551, 'deaths': 44, 'confirmed_diff': 0, 'deaths_diff': 0, 'last_update': '2022-12-07 04:20:58'}, {'name': 'Asotin', 'date': '2022-12-06', 'fips': 53003, 'lat': '46.18894415', 'long': '-117.2022851', 'confirmed': 5320, 'deaths': 78, 'confirmed_diff': 0, 'deaths_diff': 0, 'last_update': '2022-12-07 04:20:58'}, {'name': 'Benton', 'date': '2022-12-06', 'fips': 53005, 'lat': '46.23946995', 'long': '-119.5120834', 'confirmed': 60524, 'deaths': 516, 'confirmed_diff': 0, 'deaths_diff': 0, 'last_updat

### Collect US Data for 365 Days

In [10]:
# number_of_days_for_US is the number of instances (rows) of US data to collect.
# It must be greater than or equal to 1.
# The first Covid-19 case in the USA was reported on 20 Jan. 2020, so data is
# collected from Feb. 2020 onwards; counting back from the anchor date below,
# the author puts the maximum number of days at 1030 (1039 until 05/12/22).
number_of_days_for_US = 365
max_number_of_days_for_US = 1030
# Most recent day to query; the collection walks backwards from this date.
latest_date_for_US = "2022-12-06 00:00:00"

# Stop before the long download starts when the requested window is invalid.
if number_of_days_for_US < 1:
    raise ValueError("The query days must be at least 1, number_of_days = {}".format(number_of_days_for_US))
if number_of_days_for_US > max_number_of_days_for_US:
    raise ValueError("The query days must be less than or equal to {}, number_of_days = {}".format(
        max_number_of_days_for_US, number_of_days_for_US))
print("The query days for US = {}".format(number_of_days_for_US))

start_time = time.time()

# A list of date queries, newest day first
date_list_for_US = get_date_list(str_to_date(latest_date_for_US), number_of_days_for_US)
query_date_list_for_US = [{"iso":"USA","date":x} for x in date_list_for_US]

# One API call per day. This is an intermediate result list, not the final one:
# every element is the raw "data" list returned for that day.
intermediate_result = get_data_values(url_US_reports, query_date_list_for_US)

# Source code: https://stackoverflow.com/questions/28218173/extract-part-of-data-from-json-file-with-python
# In order to compare with the global data, the columns are re-organised into
# this fixed order.
report_fields = ("date", "last_update",
                 "confirmed", "confirmed_diff",
                 "deaths", "deaths_diff",
                 "recovered", "recovered_diff",
                 "active", "active_diff",
                 "fatality_rate")

final_result = []
for day_query, oneDayReport in zip(query_date_list_for_US, intermediate_result):
    # A day without data comes back empty; skip it instead of failing on [0].
    if not oneDayReport:
        print("Skipping {}: the API returned no report".format(day_query["date"]))
        continue
    # NOTE(review): only the first element of the day's list is kept. In the
    # sample response shown earlier in this notebook that element carries
    # region.province == 'Washington', so these figures look like a single
    # province rather than a national total - confirm before relying on them.
    json_obj = oneDayReport[0]
    my_dict = {field: json_obj.get(field) for field in report_fields}
    # "iso" is nested under "region"; tolerate a report without a region entry.
    my_dict["iso"] = (json_obj.get("region") or {}).get("iso")
    final_result.append(my_dict)

# This list can be inserted to MongoDB as a collection
reports_US_data_over_time_list = final_result
print(type(final_result))
print(len(final_result))

end_time = time.time()

elapsed_time = end_time - start_time

print("Execution time: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
#######################################################################
# Output recorded from the author's run:
# The query days for US = 365
# <class 'list'>
# 365
# Execution time:  00:19:29
#######################################################################

The query days for US = 365
<class 'list'>
365
Execution time:  00:19:29


## ETL pipeline: Transform
### Making a Connection with MongoDB
#### One Collection is Created: 
#### reports_US_data_over_time_collection

In [11]:
# Source code: DBAP_Lab_Week6 (MongoDB)
client = pymongo.MongoClient(host='localhost', port=27017)
# Use the database called jialin_Mongo_database (may change to covid19JHCSSE_database)
db = client["jialin_Mongo_database"]
# Get a handle on the collection and empty it, so the insert below always
# starts from a clean collection
reports_US_data_over_time_collection = db["reports_US_data_over_time_collection"]
reports_US_data_over_time_collection.drop()

# Insert all daily reports at once using the insert_many method
# collection 2
reports_US_data_over_time_collection.insert_many(reports_US_data_over_time_list)

# Show which databases and collections now exist on the server
database_list = client.list_database_names()
print("db names:", database_list)
db.list_collection_names()

db names: ['admin', 'config', 'jialin_Mongo_database', 'local', 'test_database']


['global_date_over_time_collection',
 'reports_US_data_over_time_collection',
 'provinces_of_US_collection',
 'reports_US_provinces_cities_collection']

## ETL pipeline: Transform
### JSON to CSV
### Export CSV File from MongoDB

In [12]:
# Dump the US-over-time collection from MongoDB into a CSV file
get_csv(
    collection_name=reports_US_data_over_time_collection,
    csv_file_name="DBAP_CA2_reports_US_data_over_time.csv",
)