## Basic Imports ##

In [3]:
import pandas as pd
import os
import numpy as np
from pprint import pprint
import tqdm
from datetime import date, timedelta
from copy import deepcopy

## Global Country Level Data ## 
Here we will use the Hopkins API for Quering its data - it is available [here](https://coviddata.github.io/covid-api/). We will pull the country level data.
- ` country_confirmed_df` - Country level time series data on the number of confirmed cases 
- ` country_deaths_df` - Country level time series data on the number of deaths. 

In [4]:
country_confirmed_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
country_deaths_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
country_confirmed_df['Country/Region'].replace(['US'], 'United States',inplace=True)
country_deaths_df['Country/Region'].replace(['US'], 'United States',inplace=True)
country_deaths_df = pd.DataFrame(country_deaths_df.groupby('Country/Region').sum())
country_confirmed_df = pd.DataFrame(country_confirmed_df.groupby('Country/Region').sum())

## State Level Data ## 
- `us_state_confirmed_df ` - is the time series data for confirmed cases on a state level.
- `us_state_deaths_df ` - is the time series data for deaths on a state level. 

In [5]:
us_confirmed_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
us_deaths_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")
us_states_confirmed_df = pd.DataFrame(us_confirmed_df.groupby('Province_State').sum())
us_states_deaths_df = pd.DataFrame(us_deaths_df.groupby('Province_State').sum())

## Import Long/Lat Data for Countries and each States##
We are going to be using Paul Mooneys Latitude and Longitude for Every Country and State data set from Kaggle - it is available [here](https://www.kaggle.com/paultimothymooney/latitude-and-longitude-for-every-country-and-stateworld_country_and_usa_states_latitude_and_longitude_values.csv)
* ``` country_long_lat_df ``` stores all of the country level data
* ``` usa_long_lat_df ``` stores the state level data for the United States 

And add this to our original data frames. 

In [7]:

country_long_lat_df = pd.read_csv('data/world_country_and_usa_states_latitude_and_longitude_values.csv')[['country_code', 'latitude', 'longitude', 'country']]
usa_long_lat_df = pd.read_csv('data/world_country_and_usa_states_latitude_and_longitude_values.csv')[['usa_state_code',
       'usa_state_latitude', 'usa_state_longitude', 'usa_state']]
country_confirmed_df = country_confirmed_df.merge(country_long_lat_df, right_on = 'country', left_on = 'Country/Region')
country_deaths_df = country_deaths_df.merge(country_long_lat_df, right_on = 'country', left_on = 'Country/Region')

us_states_confirmed_df = us_states_confirmed_df.merge(usa_long_lat_df, right_on = 'usa_state', left_on = 'Province_State')
us_states_deaths_df = us_states_deaths_df.merge(usa_long_lat_df, right_on = 'usa_state', left_on = 'Province_State')


## Dates/ Getting Data for Time Series Analysis ## 
We need to know what dates we are dealing with - so lets generate a list of the available dates in our dataset.  

In [30]:

rel_dates = list(country_confirmed_df.columns[country_confirmed_df.columns.str.contains(r"\d{1,2}\/\d{1,2}\/\d{1,2}")])

def stripAndFlip(df, index = 'country', date_range = rel_dates):
    temp_df = df.copy()
    temp_df.set_index(index)
    temp_df = temp_df[date_range]
    return temp_df

ts_global_confirmed = stripAndFlip(country_confirmed_df)
ts_global_deaths = stripAndFlip(country_deaths_df)
ts_us_confirmed = stripAndFlip(us_states_confirmed_df, index = 'usa_state')
ts_us_deaths = stripAndFlip(us_states_deaths_df, index = 'usa_state')



Index(['UID', 'code3', 'FIPS', 'Lat', 'Long_', 'Population', '1/22/20',
       '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20',
       '1/29/20', '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20',
       '2/5/20', '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20',
       '2/12/20', '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20',
       '2/18/20', '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20',
       '2/24/20', '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20',
       '3/1/20', '3/2/20', '3/3/20', '3/4/20', '3/5/20', '3/6/20', '3/7/20',
       '3/8/20', '3/9/20', '3/10/20', '3/11/20', '3/12/20', '3/13/20',
       '3/14/20', '3/15/20', '3/16/20', '3/17/20', '3/18/20', '3/19/20',
       '3/20/20', '3/21/20', '3/22/20', '3/23/20', '3/24/20', '3/25/20',
       '3/26/20', '3/27/20', '3/28/20', '3/29/20', '3/30/20', '3/31/20',
       '4/1/20', 'usa_state_code', 'usa_state_latitude', 'usa_state_longitude',
       'usa_state'],
      dty

## Pipelines ##
Not doing anythign with it yet but lets set up some pipelines to make our life down the road


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
cat_trans = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant')), #Subject to change please
    ('ordinal', OrdinalEncoder())])
num_trans = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant'))
])
dataclean = ColumnTransformer(transformers=[
        ('num', num_trans, numerical_cols),
        ('cat', cat_trans, categorical_cols)
    ])
model = LinearRegression()
my_pipeline = Pipeline(steps=[('preprocessor', dataclean),
                              ('model', model)
                             ])

NameError: name 'numerical_transformer' is not defined