In [None]:
'''
Unit 7.6 Data Wrangling for Capstone 2
Nantawat Samermit 
'''

# <center>Data Scraping<center>
I'll be scraping two datasets. 

The first is from the Cambridge MA wastewater data archive. It is listed as JSON, which I will convert to a pandas DataFrame. 

The URL is here: 'https://data.cambridgema.gov/resource/ayt4-g2ye.json'


This is the dataset by which I will train and test my model.
The second will be the statewide Massachusetts data. This dataset is where I will test my final model.

The URL is here: 'https://www.mass.gov/doc/covid-19-raw-data-june-15-2021/download'

## Scrapping the First Dataset (Cambridge Wastewater)

In [1]:
#import relevant modules for data wrangling - keep on adding as needed
import pandas as pd
import json, requests
import fuzzywuzzy 
import matplotlib

In [4]:
#ask Luka: Should I just use requests or should I use the API given by the data set manager?
#here is a code snippet from the API doc
#the source is: https://dev.socrata.com/foundry/data.cambridgema.gov/ayt4-g2ye

#from sodapy import Socrates
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
#client = Socrata("data.cambridgema.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cambridgema.gov,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
#results = client.get("ayt4-g2ye", limit=2000)

# Convert to pandas DataFrame
#results_df = pd.DataFrame.from_records(results)

In [2]:
#Cambridge MA wastewater dataset as JSON
url = 'https://data.cambridgema.gov/resource/ayt4-g2ye.json'

## Scrapping the second Dataset (Massachusetts Covid-19)
After some attempts, it seems the Massachusetts dataset is a multi-page csv, and it isn't easily accessible.

I will have to use BeautifulSoup to isolate the link to the excel file.

In [7]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

In [9]:
# Use BS to identify/isolate the html "class" pointing to the href of the .xls 
#class and "ma__download-link__file-link" was identified by inspecting the page's elements in chrome
source = requests.get("https://www.mass.gov/doc/covid-19-raw-data-june-15-2021")
BeautifulSoup(source.content).find("a", {"class":"ma__download-link__file-link"}).get("href")

'/doc/covid-19-raw-data-june-15-2021/download'

BeautifulSoup has isloated the href for the xls. 

It is here: https://www.mass.gov/doc/covid-19-raw-data-june-15-2021/download

## Scrape, Download, and Save datasets
I will write a function to scrape, convert, and save both datasets to my directory as pickle files. 

In [10]:
#Luka states it is easier (and acceptable) to just download the files to local path. 

#For the mass.gov dataset, it has mulitiple sheets, so it isolates the intended sheet. (not dynamic, ikr)
sources = ['https://data.cambridgema.gov/resource/ayt4-g2ye.json', 'https://www.mass.gov/doc/covid-19-raw-data-june-15-2021/download']

def get_sources(source_list):
    '''
    this function takes a list of urls, ID's their format, requests dataset from source,
    converts data into a dataframe, then saves it as a pickled file.
    '''
    for source in source_list:
        if source.endswith('.json'):
            response = requests.get(source)
            data = response.json()
            json_df = pd.DataFrame(data)
            json_df.to_pickle('Cambridge.pkl')
            print('Cambridge.pkl created using this source: ', source)
            
        else:
            response = requests.get(source, allow_redirects=True)
            with open('excel_data.xlsx', 'wb') as file:
                file.write(response.content)
            
            #using pd.read_excel on a multi-spreadsheet xls yields a dictionary
            excel_dict = pd.read_excel('excel_data.xlsx', sheet_name=None)
            
            #can ID sheetnames by evaluating keys
            #print(excel_dict.keys())
            
            #Will be using the "Weekly_City_Town" sheet
            Mass_weekly_city = excel_dict['Weekly_City_Town']
            
            #Use df.to_pickle to save as .pkl file
            Mass_weekly_city.to_pickle('Mass_Weekly_City_df.pkl')
            
            print('Mass_Weekly_City.pkl created using this source', source)

In [11]:
get_sources(sources)

Cambrdige.pkl created using this source:  https://data.cambridgema.gov/resource/ayt4-g2ye.json
Mass_Weekly_City.pkl created using this source https://www.mass.gov/doc/covid-19-raw-data-june-15-2021/download


## Use pickle to confirm the saved files can be loaded in Jupyter

In [12]:
import pickle

In [16]:
#should use context manager to load pickles 
# Don't forget to state 'rb' param or will get an Unicode Error.
with open('Cambridge.pkl', 'rb') as f:
    Cambridge_df = pickle.load(f)
    
with open('Mass_Weekly_City_df.pkl', 'rb') as f:
    Mass_df = pickle.load(f)

In [22]:
#testing both dataframes
Cambridge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   date                          289 non-null    object
 1   mwra_concentration            289 non-null    object
 2   mwra_7dayaverage              289 non-null    object
 3   concentration_mid_cambridge   289 non-null    object
 4   concentration_east_cambridge  289 non-null    object
 5   concentration_cambridgeport   289 non-null    object
 6   concentration_north_west      289 non-null    object
 7   highci_mid_cambridge          289 non-null    object
 8   highci_east_cambridge         289 non-null    object
 9   highci_cambridgeport          289 non-null    object
 10  highci_north_west_cambridge   289 non-null    object
 11  lowci_mid_cambridge           289 non-null    object
 12  lowci_east_cambridge          289 non-null    object
 13  lowci_cambridgeport 

In [23]:
Mass_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8800 entries, 0 to 8799
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   City/Town                   8800 non-null   object        
 1   County                      8800 non-null   object        
 2   Population                  8800 non-null   object        
 3   Total Case Counts           8800 non-null   object        
 4   Two Week Case Counts        8800 non-null   object        
 5   Average Daily Rate          8800 non-null   object        
 6   Color                       8791 non-null   object        
 7   Change in Last Week         8800 non-null   object        
 8   Total Tests                 8800 non-null   int64         
 9   Total Tests Last Two Weeks  8799 non-null   float64       
 10  Total Positive Tests        8799 non-null   float64       
 11  Percent Positivity          8799 non-null   object      

# <center>Data Wrangling<center>
Explore the dataset. Understand the datatypes for the various columns, and confirm that there are no missing values.

In [9]:
Cambridge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   date                          253 non-null    object
 1   mwra_concentration            253 non-null    object
 2   mwra_7dayaverage              253 non-null    object
 3   concentration_mid_cambridge   253 non-null    object
 4   concentration_east_cambridge  253 non-null    object
 5   concentration_cambridgeport   253 non-null    object
 6   concentration_north_west      253 non-null    object
 7   highci_mid_cambridge          253 non-null    object
 8   highci_east_cambridge         253 non-null    object
 9   highci_cambridgeport          253 non-null    object
 10  highci_north_west_cambridge   253 non-null    object
 11  lowci_mid_cambridge           253 non-null    object
 12  lowci_east_cambridge          253 non-null    object
 13  lowci_cambridgeport 

In [10]:
Cambridge_df.describe()

Unnamed: 0,date,mwra_concentration,mwra_7dayaverage,concentration_mid_cambridge,concentration_east_cambridge,concentration_cambridgeport,concentration_north_west,highci_mid_cambridge,highci_east_cambridge,highci_cambridgeport,highci_north_west_cambridge,lowci_mid_cambridge,lowci_east_cambridge,lowci_cambridgeport,lowci_north_west_cambridge
count,253,253.0,253,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0
unique,253,216.0,197,28.0,30.0,24.0,30.0,28.0,30.0,24.0,30.0,28.0,30.0,24.0,30.0
top,2021-01-05T00:00:00.000,,257,,,,,,,,,,,,
freq,1,10.0,5,226.0,224.0,228.0,224.0,226.0,224.0,228.0,224.0,226.0,224.0,228.0,224.0


# Note to myself:
For my own review!

Column Definitions:
Date - sample date for 24 hour composite sample


MWRA_Concentration - Normalized SARS-CoV2 RNA Copies per ML collected during sampling period.


MWRA_7DayAverage - 7 day rolling average for Normalized SARS-CoV2 RNA Copies per ML collected during sampling period.


Concentration_* - Normalized SARS-CoV2 RNA Copies per ML collected during sampling period.


highci_* - Upper Limit on Confidence Interval for Normalized SARS-CoV2 RNA Copies per ML collected during sampling period.


lowci_* - Lower Limit on Confidence Interval for Normalized SARS-CoV2 RNA Copies per ML collected during sampling period.
* means there are multiple columns with this prefix which share the same definition - Nantawat

A refresher on confidence interval limits - https://bit.ly/3vtuI7h


In [12]:
Cambridge_df.tail()

Unnamed: 0,date,mwra_concentration,mwra_7dayaverage,concentration_mid_cambridge,concentration_east_cambridge,concentration_cambridgeport,concentration_north_west,highci_mid_cambridge,highci_east_cambridge,highci_cambridgeport,highci_north_west_cambridge,lowci_mid_cambridge,lowci_east_cambridge,lowci_cambridgeport,lowci_north_west_cambridge
248,2021-06-06T00:00:00.000,24,25,,,,,,,,,,,,
249,2021-06-07T00:00:00.000,29,27,,,,,,,,,,,,
250,2021-06-08T00:00:00.000,15,24,,,,,,,,,,,,
251,2021-06-09T00:00:00.000,14,22,,,,,,,,,,,,
252,2021-06-10T00:00:00.000,33,23,,,,,,,,,,,,


In [13]:
#Note: There seems to be plenty of empty values, must make sure to sanitize them to NaN 
# The last date entry is for 6/10/21

In [17]:
Cambridge_wastewater.columns

Index(['date', 'mwra_concentration', 'mwra_7dayaverage',
       'concentration_mid_cambridge', 'concentration_east_cambridge',
       'concentration_cambridgeport', 'concentration_north_west',
       'highci_mid_cambridge', 'highci_east_cambridge', 'highci_cambridgeport',
       'highci_north_west_cambridge', 'lowci_mid_cambridge',
       'lowci_east_cambridge', 'lowci_cambridgeport',
       'lowci_north_west_cambridge'],
      dtype='object')

In [18]:
#load Mass_DoH pickle
with open('Mass_Weekly_City_df.pkl', 'rb') as f:
    Mass_Weekly_City_df = pickle.load(f)

In [19]:
Mass_Weekly_City_df.shape

(8800, 17)

In [20]:
Mass_Weekly_City_df.columns

Index(['City/Town', 'County', 'Population', 'Total Case Counts',
       'Two Week Case Counts', 'Average Daily Rate', 'Color',
       'Change in Last Week', 'Total Tests', 'Total Tests Last Two Weeks',
       'Total Positive Tests', 'Percent Positivity', 'Change Since Last Week',
       'Testing Rate', 'Report Date', 'Start_Date', 'End_Date'],
      dtype='object')

In [21]:
Cambridge_weekly_trend = Mass_Weekly_City_df[Mass_Weekly_City_df['City/Town'] == 'Cambridge']

In [22]:
Cambridge_weekly_trend.shape

(25, 17)

In [23]:
Cambridge_weekly_trend.index

Int64Index([  49,  401,  753, 1105, 1457, 1809, 2161, 2513, 2865, 3217, 3569,
            3921, 4273, 4625, 4977, 5329, 5681, 6033, 6385, 6737, 7089, 7441,
            7793, 8145, 8497],
           dtype='int64')

In [24]:
#dropping index of Cambridge_weekly_trend
#inplace=False will allow a copy to be assigned to Cambridge_weekly - Nantawat
Cambridge_weekly = Cambridge_weekly_trend.reset_index(drop=True, inplace=False)

In [25]:
Cambridge_weekly.index

RangeIndex(start=0, stop=25, step=1)

In [26]:
#Time to clean up Cambridge_weekly_trend
#Because I isolated Cambridge when defining Cambridge_weekly_trend, I can drop the 'City/Town' column
#But before I do, I should check if there are any null City/Town entries
Cambridge_weekly['City/Town'].isnull().values.any()

False

In [27]:
#check to see if any entries in 'City/Town' does not match with 'Cambridge'
Cambridge_weekly['City/Town'].unique()

array(['Cambridge'], dtype=object)

In [28]:
#I feel safe in dropping city/town. Check for County
Cambridge_weekly['County'].isnull().values.any()

False

In [29]:
Cambridge_weekly['County'].unique()

array(['Middlesex'], dtype=object)

In [30]:
#I feel safe in dropping County
#pre-drop checking of columns
Cambridge_weekly.columns

Index(['City/Town', 'County', 'Population', 'Total Case Counts',
       'Two Week Case Counts', 'Average Daily Rate', 'Color',
       'Change in Last Week', 'Total Tests', 'Total Tests Last Two Weeks',
       'Total Positive Tests', 'Percent Positivity', 'Change Since Last Week',
       'Testing Rate', 'Report Date', 'Start_Date', 'End_Date'],
      dtype='object')

In [31]:
labels = ['City/Town', 'County']
weekly_trends = Cambridge_weekly.drop(columns = labels)

In [32]:
#confirmed the two columns were dropped
weekly_trends.columns

Index(['Population', 'Total Case Counts', 'Two Week Case Counts',
       'Average Daily Rate', 'Color', 'Change in Last Week', 'Total Tests',
       'Total Tests Last Two Weeks', 'Total Positive Tests',
       'Percent Positivity', 'Change Since Last Week', 'Testing Rate',
       'Report Date', 'Start_Date', 'End_Date'],
      dtype='object')

In [33]:
#Exploring data type of 'Report Date' column, reviewing for null values, may need to cast as pd.datetime
weekly_trends['Report Date'].shape
weekly_trends['Report Date'].describe
weekly_trends['Report Date'].head

<bound method NDFrame.head of 0    2020-12-24
1    2020-12-31
2    2021-01-07
3    2021-01-14
4    2021-01-21
5    2021-01-28
6    2021-02-04
7    2021-02-11
8    2021-02-18
9    2021-02-25
10   2021-03-04
11   2021-03-11
12   2021-03-18
13   2021-03-25
14   2021-04-01
15   2021-04-08
16   2021-04-15
17   2021-04-22
18   2021-04-29
19   2021-05-06
20   2021-05-13
21   2021-05-20
22   2021-05-27
23   2021-06-03
24   2021-06-10
Name: Report Date, dtype: datetime64[ns]>

In [34]:
#Cambridge wastewater surveillance data starts at 10/01/2020, will select this as the minimum date range
#something funky, there is an index for this column? - fixed this with reset_index for Cambridge_weekly - Nantawat
weekly_trends['Report Date'].index

RangeIndex(start=0, stop=25, step=1)

In [35]:
#explicitly convert to datetime object by the 'infer_datetime_format = 'coerce' flag
weekly_trends['dt'] = pd.to_datetime(weekly_trends['Report Date'], 
                                     infer_datetime_format='coerce')
weekly_trends['dt'].describe

<bound method NDFrame.describe of 0    2020-12-24
1    2020-12-31
2    2021-01-07
3    2021-01-14
4    2021-01-21
5    2021-01-28
6    2021-02-04
7    2021-02-11
8    2021-02-18
9    2021-02-25
10   2021-03-04
11   2021-03-11
12   2021-03-18
13   2021-03-25
14   2021-04-01
15   2021-04-08
16   2021-04-15
17   2021-04-22
18   2021-04-29
19   2021-05-06
20   2021-05-13
21   2021-05-20
22   2021-05-27
23   2021-06-03
24   2021-06-10
Name: dt, dtype: datetime64[ns]>

In [36]:
#should convert 'Report Date' to match with Cambridge_wastewater
Cambridge_wastewater.columns

Index(['date', 'mwra_concentration', 'mwra_7dayaverage',
       'concentration_mid_cambridge', 'concentration_east_cambridge',
       'concentration_cambridgeport', 'concentration_north_west',
       'highci_mid_cambridge', 'highci_east_cambridge', 'highci_cambridgeport',
       'highci_north_west_cambridge', 'lowci_mid_cambridge',
       'lowci_east_cambridge', 'lowci_cambridgeport',
       'lowci_north_west_cambridge'],
      dtype='object')

In [37]:
#shows wastewater['date']'s formatting is inconsistent with weekly_trends
Cambridge_wastewater['date'].describe

<bound method NDFrame.describe of 0      2020-10-01T00:00:00.000
1      2020-10-02T00:00:00.000
2      2020-10-03T00:00:00.000
3      2020-10-04T00:00:00.000
4      2020-10-05T00:00:00.000
                ...           
248    2021-06-06T00:00:00.000
249    2021-06-07T00:00:00.000
250    2021-06-08T00:00:00.000
251    2021-06-09T00:00:00.000
252    2021-06-10T00:00:00.000
Name: date, Length: 253, dtype: object>

In [38]:
#confirming no null values
Cambridge_wastewater['date'].isnull().values.any()

False

In [39]:
#casting the entire 'date' column in wastewater to datetime type, will match weekly_trends
Cambridge_wastewater['dt'] = pd.to_datetime(Cambridge_wastewater['date'])
Cambridge_wastewater['dt'].describe

<bound method NDFrame.describe of 0     2020-10-01
1     2020-10-02
2     2020-10-03
3     2020-10-04
4     2020-10-05
         ...    
248   2021-06-06
249   2021-06-07
250   2021-06-08
251   2021-06-09
252   2021-06-10
Name: dt, Length: 253, dtype: datetime64[ns]>

In [40]:
#now both wastewater and weekly_trends datetime types matches
type(Cambridge_wastewater['dt']) == type(weekly_trends['dt'])

True

In [44]:
#check for any duplicate values
wastewater_duplicated_list = list(Cambridge_wastewater.duplicated(subset='dt'))

#tests if there are any True values from the list populated by .duplicated()
#confirms no True - no duplicated values in Cambridge_wastewater. 
#casted .duplicated into list because there were too many rows to visually inspect
True in wastewater_duplicated_list

False

In [42]:
#small enough output to visually confirm no duplicated values in weekly_trends
weekly_trends.duplicated(subset='dt')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
dtype: bool

In [None]:
# From this point, can unify the two df's with a join based on their respective 'dt' columns as index
# I don't think it is ready to join yet, will have to check other columns of interest for missing values
# ID data types (categorical vs. numerical). Pause here - NS


In [None]:
#resume work 6.17.21
#let's work on Data Definition
#ask: 
#Do column names correspond to what those columns store?
#Are data types within columns sensible?
#Calculate summary statistics - mean, median, mode, std. dev., range, and number of unique values

In [69]:
weekly_columns = weekly_trends.columns
wastewater_columns = Cambridge_wastewater.columns
print(weekly_columns, wastewater_columns)

Index(['Population', 'Total Case Counts', 'Two Week Case Counts',
       'Average Daily Rate', 'Color', 'Change in Last Week', 'Total Tests',
       'Total Tests Last Two Weeks', 'Total Positive Tests',
       'Percent Positivity', 'Change Since Last Week', 'Testing Rate',
       'Report Date', 'Start_Date', 'End_Date', 'dt'],
      dtype='object') Index(['date', 'mwra_concentration', 'mwra_7dayaverage',
       'concentration_mid_cambridge', 'concentration_east_cambridge',
       'concentration_cambridgeport', 'concentration_north_west',
       'highci_mid_cambridge', 'highci_east_cambridge', 'highci_cambridgeport',
       'highci_north_west_cambridge', 'lowci_mid_cambridge',
       'lowci_east_cambridge', 'lowci_cambridgeport',
       'lowci_north_west_cambridge', 'dt'],
      dtype='object')


In [71]:
#categorical datatype in Change in Last week
change_in_last_week = pd.DataFrame(weekly_trends['Change in Last Week']).set_index(keys=weekly_trends.dt, 
                                                                     inplace=False)
print(change_in_last_week)

           Change in Last Week
dt                            
2020-12-24               Lower
2020-12-31               Lower
2021-01-07              Higher
2021-01-14              Higher
2021-01-21              Higher
2021-01-28               Lower
2021-02-04               Lower
2021-02-11               Lower
2021-02-18               Lower
2021-02-25               Lower
2021-03-04              Higher
2021-03-11              Higher
2021-03-18               Lower
2021-03-25               Lower
2021-04-01              Higher
2021-04-08              Higher
2021-04-15               Lower
2021-04-22               Lower
2021-04-29               Lower
2021-05-06               Lower
2021-05-13               Lower
2021-05-20               Lower
2021-05-27               Lower
2021-06-03               Lower
2021-06-10               Lower


In [73]:
#categorical datatype in Change Since Last week
#this might be a better metric to use when looking and surge prediction - Nantawat
change_since_last_week = pd.DataFrame(weekly_trends['Change Since Last Week']).set_index(keys=weekly_trends.dt, 
                                                                     inplace=False)
print(change_since_last_week)

           Change Since Last Week
dt                               
2020-12-24              No Change
2020-12-31                 Higher
2021-01-07                 Higher
2021-01-14              No Change
2021-01-21                  Lower
2021-01-28                  Lower
2021-02-04                  Lower
2021-02-11                  Lower
2021-02-18                  Lower
2021-02-25                  Lower
2021-03-04              No Change
2021-03-11              No Change
2021-03-18              No Change
2021-03-25              No Change
2021-04-01                 Higher
2021-04-08              No Change
2021-04-15              No Change
2021-04-22              No Change
2021-04-29                  Lower
2021-05-06                  Lower
2021-05-13                  Lower
2021-05-20              No Change
2021-05-27              No Change
2021-06-03              No Change
2021-06-10              No Change


In [46]:
weekly_trends['Average Daily Rate'].describe

<bound method NDFrame.describe of 0     26.852121
1     26.022958
2      26.78834
3     32.847607
4     35.335095
5     32.337353
6     27.553721
7     21.494453
8     14.542241
9     12.756352
10    13.840642
11    16.710821
12    16.328131
13    15.180059
14    20.473945
15    22.514962
16    21.494453
17    20.027473
18    12.820134
19     9.120792
20     6.378176
21     2.997743
22     2.423707
23     1.913453
24     1.722108
Name: Average Daily Rate, dtype: object>

In [84]:
#i'd like to see this paired with the 'dt' as index
Average_Daily = pd.DataFrame(weekly_trends['Average Daily Rate'].astype(float)).set_index(
    keys = weekly_trends.dt, inplace = False)

In [85]:
Average_Daily.describe

<bound method NDFrame.describe of             Average Daily Rate
dt                            
2020-12-24           26.852121
2020-12-31           26.022958
2021-01-07           26.788340
2021-01-14           32.847607
2021-01-21           35.335095
2021-01-28           32.337353
2021-02-04           27.553721
2021-02-11           21.494453
2021-02-18           14.542241
2021-02-25           12.756352
2021-03-04           13.840642
2021-03-11           16.710821
2021-03-18           16.328131
2021-03-25           15.180059
2021-04-01           20.473945
2021-04-08           22.514962
2021-04-15           21.494453
2021-04-22           20.027473
2021-04-29           12.820134
2021-05-06            9.120792
2021-05-13            6.378176
2021-05-20            2.997743
2021-05-27            2.423707
2021-06-03            1.913453
2021-06-10            1.722108>

In [87]:
Average_daily_mean = Average_Daily.mean()
Average_daily_median = Average_Daily.median()
Average_daily_mode = Average_Daily.mode()
print(Average_daily_mode)

    Average Daily Rate
0             1.722108
1             1.913453
2             2.423707
3             2.997743
4             6.378176
5             9.120792
6            12.756352
7            12.820134
8            13.840642
9            14.542241
10           15.180059
11           16.328131
12           16.710821
13           20.027473
14           20.473945
15           21.494453
16           21.494453
17           22.514962
18           26.022958
19           26.788340
20           26.852121
21           27.553721
22           32.337353
23           32.847607
24           35.335095


In [56]:
weekly_trends.set_index(weekly_trends.dt, inplace = True)
print(weekly_trends.head)

<bound method NDFrame.head of                Population Total Case Counts Two Week Case Counts  \
dt                                                                 
2020-12-24  111989.024088              2645                  421   
2020-12-31  111989.024088              2835                  408   
2021-01-07  111989.024088              3083                  420   
2021-01-14  111989.024088              3346                  515   
2021-01-21  111989.024088              3618                  554   
2021-01-28  111989.024088              3862                  507   
2021-02-04  111989.024088              4058                  432   
2021-02-11  111989.024088              4177                  337   
2021-02-18  111989.024088              4271                  228   
2021-02-25  111989.024088              4377                  200   
2021-03-04  111989.024088              4486                  217   
2021-03-11  111989.024088              4623                  262   
2021-03-18  111989

In [None]:
#ask Luka to review 6.17.21 NS