In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# Let pandas render up to 1000 rows so the scraped table is shown untruncated.
pd.options.display.max_rows = 1000

In [None]:
# Path to the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different location; the original hard-coded path is
# kept as the fallback so existing setups keep working.
driver_path = os.environ.get('CHROMEDRIVER_PATH', '/Users/Luis/Downloads/chromedriver.exe')
driver = webdriver.Chrome(driver_path)

try:
    driver.get('https://www.nytimes.com/interactive/2020/nyregion/new-york-city-coronavirus-cases.html')
except Exception:
    # Do not leave an orphaned browser process behind if navigation fails.
    driver.quit()
    raise

#sleep timer necessary to load full page
time.sleep(7)

In [None]:
# Expands the table in the already-opened page and stores the resulting HTML
# in `page`. The browser is closed in `finally`, so it is shut down even when
# scrolling, locating the button, or clicking raises.
try:
    #scroll some distance to remove advertisement
    driver.execute_script('window.scrollTo(0,2400)')
    time.sleep(1)

    #Must expand table for webscrape to retrieve full table.
    # find_element(by, value) is used instead of find_element_by_class_name:
    # the find_element_by_* helpers were removed in Selenium 4.3, while this
    # form exists in both Selenium 3 and 4. 'class name' is the locator
    # string behind By.CLASS_NAME.
    expand_table_button = driver.find_element('class name', 'expand')
    expand_table_button.click()

    #saves page with expanded table for webscraping
    page = driver.page_source
finally:
    driver.quit()

In [None]:
# Parse the captured page HTML with Python's built-in HTML parser.
soup = bs(markup=page, features='html.parser')

In [None]:
# Locate the data table by its CSS class.
# NOTE(review): "svelte-19yxb3p" looks like a build-generated class name and
# may change when the page is redeployed; confirm it still matches.
table = soup.find('table', {'class':"svelte-19yxb3p"})
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear
    # message instead of an AttributeError in the parsing cell further down.
    raise ValueError('No <table class="svelte-19yxb3p"> found in the scraped page')

# Column names for the DataFrame built from the table rows.
table_headers = ['Zip_code','Borough','Neighborhood','Cases','Cases_per_10000','Deaths','Deaths_per_10000']

In [None]:
#Creates DataFrame from web scraped table

# Not used by this cell; kept so any later reference to the name still resolves.
ny_cases_df = pd.DataFrame()


def _cell_text(tag):
    """Return the text inside a BeautifulSoup tag, without surrounding whitespace.

    This replaces the previous str(tag).lstrip('<span ...>').rstrip('</span>')
    approach. str.lstrip/str.rstrip remove any of the listed *characters*, not
    a literal prefix/suffix, so a value that starts or ends with one of those
    characters would be truncated. Going through str(tag) also leaves HTML
    entities (e.g. &amp;) escaped.
    """
    return tag.get_text(strip=True)


def _count_text(tag):
    """Return the tag's text with thousands separators (commas) removed."""
    return _cell_text(tag).replace(',', "")


record_list = []
# The first <tr> is skipped (header row).
for table_row in table.find_all('tr')[1:]:
    row = table_row.find_all('td')

    # The first cell wraps three nested spans: zip code, borough, neighborhood.
    location_spans = row[0].find_all('span')[0].find_all('span')
    zipcode = _cell_text(location_spans[0])
    borough = _cell_text(location_spans[1])
    neighborhood = _cell_text(location_spans[2])

    # Cells 1, 2, 5 and 6 hold cases, case rate, deaths and death rate.
    total_cases = _count_text(row[1].find_all('span')[0])
    case_rate = _count_text(row[2].find_all('span')[0])
    total_deaths = _count_text(row[5].find_all('span')[0])
    death_rate = _count_text(row[6].find_all('span')[0])

    # Field order must match table_headers.
    record_list.append([zipcode, borough, neighborhood,
                        total_cases, case_rate, total_deaths, death_rate])

#creates dataframe
ny_df = pd.DataFrame(record_list, columns = table_headers)
ny_df

In [None]:
#cleaning
# Rows that carry the '—' placeholder in any count/rate column cannot be cast
# to integers, so they are dropped before the conversion. (Previously only the
# 'Deaths' column was checked, so a placeholder in another column would make
# astype raise.)
numeric_columns = ['Cases', 'Cases_per_10000', 'Deaths', 'Deaths_per_10000']
has_placeholder = ny_df[numeric_columns].eq('—').any(axis=1)

# Build the cleaned frame in one step instead of mutating in place; the row
# index of the surviving rows is left unchanged.
ny_df = ny_df.loc[~has_placeholder].astype({column: 'int32' for column in numeric_columns})

In [None]:
# Histogram of the 'Cases' column, split into 30 bins.
plt.hist(x=ny_df['Cases'], bins=30)
plt.gca().set(xlabel='Cases', ylabel='Amount of Neighborhoods')
plt.title('Distribution of Cases in all 175 New York Neighborhoods')

In [None]:
# Histogram of the 'Deaths' column, split into 30 bins.
plt.hist(x=ny_df['Deaths'], bins=30)
plt.gca().set(xlabel='Deaths', ylabel='Amount of Neighborhoods')
plt.title('Distribution of Deaths in all 175 NY neighborhoods')

In [None]:
# One horizontal box per borough, showing the spread of the 'Cases' column.
sns.boxplot(data=ny_df, x='Cases', y='Borough')
plt.title('Distribution of Cases by Borough using boxplot')

In [None]:
# One horizontal box per borough, showing the spread of the 'Deaths' column.
sns.boxplot(data=ny_df, x='Deaths', y='Borough')
plt.title('Distributions of Deaths by Borough using boxplot')

In [None]:
# Per-borough totals of the numeric columns.
# The columns are selected explicitly so the text columns (Zip_code,
# Neighborhood) are not passed to sum(); depending on the pandas version that
# either concatenates their strings or drops them.
# NOTE(review): the summed *_per_10000 columns are sums of per-row rates, not
# borough-level rates; confirm that is what the plots below intend.
numeric_columns = ['Cases', 'Cases_per_10000', 'Deaths', 'Deaths_per_10000']
agg_df = ny_df.groupby('Borough')[numeric_columns].sum().reset_index()
agg_df

In [None]:
# Each borough's fraction of the summed Cases_per_10000 column.
# NOTE(review): this is a share of summed per-row rates, not a population-weighted
# rate; confirm that is the intended quantity.
rate_share = agg_df['Cases_per_10000']/agg_df['Cases_per_10000'].sum()
plt.bar(agg_df['Borough'], rate_share)
# Labels and title added so the figure can be read on its own.
plt.xlabel('Borough')
plt.ylabel('Share of summed Cases_per_10000')
plt.title('Share of Summed Cases_per_10000 by Borough')

In [None]:
# Each borough's fraction of the total in the Cases column.
case_share = agg_df['Cases']/agg_df['Cases'].sum()
plt.bar(agg_df['Borough'], case_share)
# Labels and title added so the figure can be read on its own.
plt.xlabel('Borough')
plt.ylabel('Share of total Cases')
plt.title('Share of Total Cases by Borough')