# DBAP CA2
## Jia Lin 22117644 teamC
## TABLE 1: Global Covid-19 Data Over Time
### Import Necessary Libraries

In [1]:
import csv                  # library csv is used to store csv to PostgreSQL
import datetime             # library datetime is used to convert between datetime and string
import json                 # library json is used load json objects
import os                   # library os is used to read the API key from the environment
from datetime import date

import pandas               # library pandas is used to create a ETL pipeline
import psycopg2             # library psycopg2 is used to make connection with PostgreSQL
import pymongo              # library pymongo is used to build connection with MongoDB
import requests             # library request is used to fetch data from API 

### Function Pool

In [2]:
# getResponse function is defined to get response from web "url" based on "query"
def getResponse(url, query, timeout=30):
    """Send a GET request to ``url`` with ``query`` as the query-string parameters.

    The request headers are taken from the module-level ``headers`` dict, which
    must be defined before this function is called.

    Args:
        url: endpoint to request.
        query: dict passed to requests as ``params``.
        timeout: seconds to wait for the server before giving up. requests has
            no default timeout, so without one a single stalled connection
            would block the whole collection loop indefinitely.

    Returns:
        The ``requests.Response`` returned by the server.
    """
    return requests.request("GET", url, headers=headers, params=query, timeout=timeout)

# getJSON_obj function is defined to get a json object based on "response"
def getJSON_obj(response):
    """Decode the raw body of ``response`` as UTF-8 and parse it as JSON.

    Args:
        response: object exposing the body bytes as ``response.content``.

    Returns:
        The Python object produced by ``json.loads`` for that body.
    """
    body_text = response.content.decode("utf-8")
    parsed_body = json.loads(body_text)
    return parsed_body

# get_data_value function is defined to return the value of key data
def get_data_value(url, query):
    """Fetch ``url`` with ``query`` and return the value stored under "data".

    A notice is printed when that value is empty; the (empty) value is still
    returned to the caller.
    """
    response = getResponse(url, query)
    data_value = getJSON_obj(response)["data"]
    if not len(data_value):
        print("Notice: there is no data on this date!")
    return data_value

# get_data_values function is defined to collect the "data" value of several queries
# Every query in query_list is sent to the same url
# and the "data" value of each response is kept, in query order
def get_data_values(url, query_list):
    """Return a list with one ``get_data_value(url, q)`` result per query."""
    data_values = []
    for single_query in query_list:
        data_values.append(get_data_value(url, single_query))
    return data_values


# a_day_before function is defined to get the date of one day before the day_date
def a_day_before(day_date):
    """Return ``day_date`` moved back by exactly one day."""
    one_day = datetime.timedelta(days=1)
    return day_date - one_day

# str_to_date function is defined to convert a string to a date
def str_to_date(s):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and return only its date part."""
    parsed_moment = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return parsed_moment.date()

# date_to_str function is defined to convert a date to a string
def date_to_str(dt):
    """Format ``dt`` as a 'YYYY-MM-DD' string."""
    day_format = '%Y-%m-%d'
    return dt.strftime(day_format)

# get_date_list function is defined to return a list of dates in string format
# The list starts at start_date and steps back one day at a time
# until number_of_days entries have been produced
def get_date_list(start_date, number_of_days):
    """Return ``number_of_days`` date strings counting backwards from ``start_date``."""
    date_strings = []
    for days_back in range(number_of_days):
        earlier_day = start_date - datetime.timedelta(days=days_back)
        date_strings.append(date_to_str(earlier_day))
    return date_strings

def get_csv(collection_name, csv_file_name):
    """Export every document of a MongoDB collection to a CSV file.

    Args:
        collection_name: collection object whose ``find()`` yields the documents
            (despite the parameter name this is the collection itself, not a string).
        csv_file_name: path of the CSV file to write.

    The Mongo ``_id`` field is left out of the export, and the DataFrame row
    labels (row numbers) are not written.
    """
    cursors = collection_name.find()
    # Convert the mongo documents to a DataFrame
    documents = pandas.DataFrame(cursors)
    # Discard the Mongo ID for the documents; errors="ignore" keeps an empty
    # collection (which produces a frame without an "_id" column) from raising KeyError
    documents = documents.drop(columns="_id", errors="ignore")
    # sep is passed by keyword: positional arguments after the path are deprecated
    # in pandas 2.x and rejected by pandas 3.0
    documents.to_csv(csv_file_name, sep=",", index=False)

## ETL pipeline: Extract
### Fetch Data from API (Extract)


In [3]:
# Source code: https://rapidapi.com/axisbits-axisbits-default/api/covid-19-statistics/
# Based on public data by Johns Hopkins CSSE
# In terms of the requirements of this project, Jia Lin made some modifications on the source code
# The RapidAPI key is a credential, so it is read from the RAPIDAPI_KEY environment
# variable instead of being hard-coded in the notebook.
# NOTE: a key that was previously committed in this cell should be rotated.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable to your RapidAPI key before running this cell.")
headers = {"X-RapidAPI-Key": rapidapi_key,
           "X-RapidAPI-Host": "covid-19-statistics.p.rapidapi.com"}

# Total data for the entire world for a particular date
# In terms of data in the API, the latest result is for the day before the current date
url_total = "https://covid-19-statistics.p.rapidapi.com/reports/total"
query_date = {"date":"2022-12-06"}
# The date is a vital parameter:
# if the API has not been updated for that date, there is no value to present.
data_value_total = get_data_value(url_total, query_date)
print("The Covid-19 data for global: \n{}\n".format(data_value_total))

The Covid-19 data for global: 
{'date': '2022-12-06', 'last_update': '2022-12-07 04:20:58', 'confirmed': 646353483, 'confirmed_diff': 591721, 'deaths': 6644784, 'deaths_diff': 2621, 'recovered': 0, 'recovered_diff': 0, 'active': 639708699, 'active_diff': 589100, 'fatality_rate': 0.0103}



### Collect Data for 1000 Days

In [4]:
# number_of_days_for_global is the number of instances (rows) of global info to collect
# It must be at least 1
# China reported Covid-19 on 31 Dec. 2019,
# hence the data is collected from Jan. 2020 onwards
# and the number of days is capped at 1030 in this notebook
number_of_days_for_global = 1000
# Stop here on an invalid value: printing a message and carrying on would still
# send every request below with the out-of-range day count
if number_of_days_for_global < 1:
    raise ValueError("The query day must be at least 1, number_of_days = {}".format(number_of_days_for_global))
if number_of_days_for_global > 1030:
    raise ValueError("The query day must be less than or equal to 1030, number_of_days = {}".format(number_of_days_for_global))
print("The query days for global = {}".format(number_of_days_for_global))

# a list of date queries, one per day, counting back from 2022-12-06
date_list_for_global = get_date_list(str_to_date("2022-12-06 00:00:00"), number_of_days_for_global)
query_date_list_for_global = [{"date":x} for x in date_list_for_global]

# list of global data info over time (one API request per date, so this step is slow)
# This list can be inserted to MongoDB as a collection
global_data_over_time_list = get_data_values(url_total, query_date_list_for_global)

The query days for global = 1000


## ETL pipeline: Transform
### Making a Connection with MongoDB
#### One Collection is Created: 
#### global_date_over_time_collection

In [9]:
# Source code: DBAP_Lab_Week6 (MongoDB)
client = pymongo.MongoClient('localhost', 27017)
# create a database called jialin_Mongo_database (may change to covid19JHCSSE_database)
db = client.jialin_Mongo_database
# create a collection
# NOTE: the collection is named "global_date_over_time_collection" ("date", not "data");
# the name is kept as-is so anything already reading that collection keeps working
global_data_over_time_collection = db.global_date_over_time_collection
# start from an empty collection so re-running this cell does not duplicate documents
global_data_over_time_collection.drop()
# Dates without figures come back from get_data_value as an empty "data" value.
# insert_many needs a non-empty list of documents, so those entries are dropped first.
global_documents = [document for document in global_data_over_time_list if document]
# Insert more than 1 document using insert_many method
# collection 1
if global_documents:
    global_data_over_time_collection.insert_many(global_documents)
else:
    print("Notice: there is no data to insert into MongoDB!")

database_list = client.list_database_names()
print("database names:")
for database in database_list:
    print(database)
print("\n")    
print("collection names:")
db.list_collection_names()

database names:
admin
config
jialin_Mongo_database
local
test_database


collection names:


['global_date_over_time_collection',
 'reports_US_data_over_time_collection',
 'provinces_of_US_collection',
 'reports_US_provinces_cities_collection']

## ETL pipeline: Transform
### JSON to CSV
### Export CSV File from MongoDB

In [None]:
# Write the documents of the global collection to a CSV file (path relative to the working directory)
get_csv(collection_name=global_data_over_time_collection,
        csv_file_name="DBAP_CA2_global_data_over_time.csv")