In [1]:
import pandas as pd
import csv
import datetime as dt
import pymongo
from sqlalchemy import create_engine
from config import username, password

# Extraction

In [2]:
# Extracted two data sources:
world = pd.read_csv('2019_nC0v_20200121_20200126_cleaned.csv')
us = pd.read_csv('us-counties.csv')

# Transform

### First Dataset

In [3]:

world.head()

Unnamed: 0.1,Unnamed: 0,Province/State,Country,Date last updated,Confirmed,Suspected,Recovered,Deaths
0,0,Shanghai,Mainland China,1/21/2020,9.0,10.0,0.0,0.0
1,1,Yunnan,Mainland China,1/21/2020,1.0,0.0,0.0,0.0
2,2,Beijing,Mainland China,1/21/2020,10.0,0.0,0.0,0.0
3,3,Taiwan,Taiwan,1/21/2020,1.0,0.0,0.0,0.0
4,4,Jilin,Mainland China,1/21/2020,0.0,1.0,0.0,0.0


In [4]:
world2 = world.drop(columns='Unnamed: 0').dropna()

In [5]:
world3 = world2.drop(columns=['Suspected','Recovered'])

In [6]:
world3.head()

Unnamed: 0,Province/State,Country,Date last updated,Confirmed,Deaths
0,Shanghai,Mainland China,1/21/2020,9.0,0.0
1,Yunnan,Mainland China,1/21/2020,1.0,0.0
2,Beijing,Mainland China,1/21/2020,10.0,0.0
3,Taiwan,Taiwan,1/21/2020,1.0,0.0
4,Jilin,Mainland China,1/21/2020,0.0,0.0


In [7]:
world4 = world3.loc[world3['Date last updated'] != '1/23/20 12:00 PM']

In [8]:
world3['Date last updated'].head()

0    1/21/2020
1    1/21/2020
2    1/21/2020
3    1/21/2020
4    1/21/2020
Name: Date last updated, dtype: object

In [9]:
def dateFun2(str):
    return dt.datetime.strptime(str,'%m/%d/%Y')

### Second Dataset

In [11]:
us.state.count()

56541

In [12]:
us.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


### Object to Date Function

In [13]:
def dateFun(str):
    return dt.datetime.strptime(str, '%Y-%m-%d')

In [14]:
us['date'] = us['date'].map(dateFun)

In [15]:
us.set_index('date').sort_index()

Unnamed: 0_level_0,county,state,fips,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-21,Snohomish,Washington,53061.0,1,0
2020-01-22,Snohomish,Washington,53061.0,1,0
2020-01-23,Snohomish,Washington,53061.0,1,0
2020-01-24,Cook,Illinois,17031.0,1,0
2020-01-24,Snohomish,Washington,53061.0,1,0
...,...,...,...,...,...
2020-04-13,Sublette,Wyoming,56035.0,1,0
2020-04-13,Sweetwater,Wyoming,56037.0,9,0
2020-04-13,Teton,Wyoming,56039.0,56,0
2020-04-13,Uinta,Wyoming,56041.0,4,0


In [16]:
us['date'].max()

Timestamp('2020-04-13 00:00:00')

# LOADING

### Mongo Database

In [17]:
conn = 'mongodb://localhost:27017'

In [18]:
client = pymongo.MongoClient(conn)

In [19]:
db = client.etl_db
table = db.etl

In [20]:
world3.head()

Unnamed: 0,Province/State,Country,Date last updated,Confirmed,Deaths
0,Shanghai,Mainland China,1/21/2020,9.0,0.0
1,Yunnan,Mainland China,1/21/2020,1.0,0.0
2,Beijing,Mainland China,1/21/2020,10.0,0.0
3,Taiwan,Taiwan,1/21/2020,1.0,0.0
4,Jilin,Mainland China,1/21/2020,0.0,0.0


In [21]:
world3.set_index('Date last updated', inplace = True)

In [22]:
table.insert_one(world3.to_dict())

<pymongo.results.InsertOneResult at 0x2b41439f8c8>

### PostGresSQL Database

In [23]:
db = create_engine(f'postgresql://{username}:{password}@localhost:5432/etl_db')

In [24]:
db.table_names()

[]

In [27]:
world3.to_sql('etl', con=db)