In [1]:
import math
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
# Show every file found under ./data (walking sub-directories too) so the
# available inputs are visible before anything is loaded.
data_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('./data')
    for name in names
)
for file_path in data_files:
    print(file_path)

./data/test.csv
./data/graph.csv
./data/submission.csv
./data/train.csv


**Set `data_dir` to the directory that contains the dataset CSV files**

In [4]:
# Parent directory of the dataset CSVs (train.csv, test.csv, graph.csv).
# This is a relative path, so it is resolved against the notebook's
# current working directory.
data_dir = "data"

In [5]:
# Resolve the location of each CSV inside `data_dir`.
train_data_path, test_data_path, graph_data_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("train.csv", "test.csv", "graph.csv")
)

**Load the data sheets into `pandas.DataFrame` objects**

In [8]:
# Read the training CSV, then preview its first five rows as the cell output.
train_data = pd.read_csv(train_data_path)
train_data.head(n=5)

Unnamed: 0,ID,Province_State,Date,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
0,0,Alabama,04-12-2020,3563,93,,3470.0,75.98802,21583.0,437.0,2.61016,460.300152,12.264945
1,1,Alaska,04-12-2020,272,8,66.0,264.0,45.504049,8038.0,31.0,2.941176,1344.711576,11.397059
2,2,Arizona,04-12-2020,3542,115,,3427.0,48.662422,42109.0,,3.246753,578.522286,
3,3,Arkansas,04-12-2020,1280,27,367.0,1253.0,49.439423,19722.0,130.0,2.109375,761.753354,10.15625
4,4,California,04-12-2020,22795,640,,22155.0,58.137726,190328.0,5234.0,2.81202,485.423869,22.961176


## Now let's visualize some <br>S.H.I.T. (Scatterplots and Histograms that Interpret the Trend)

In [10]:
# Split the training frame into one sub-frame per state so that a state's
# rows can be looked up by name. No totals are computed here: each value in
# `states_data` is simply the slice of `train_data` for that state, with the
# original row index preserved.
collected = ['Confirmed', 'Deaths', 'People_Tested']  # not used in this cell; presumably for later cells -- TODO confirm

# One groupby pass replaces a full-frame boolean scan per state. Groups are
# yielded sorted by state name, the same order np.unique() produced.
# NOTE: groupby skips rows whose Province_State is missing (dropna=True).
states_data = {
    state: state_df
    for state, state_df in train_data.groupby('Province_State')
}

# Sorted list of state names, taken from the dict so both always agree.
states = list(states_data)

In [12]:
# Spot check: display the sub-frame stored for California.
states_data['California']

Unnamed: 0,ID,Province_State,Date,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
4,4,California,04-12-2020,22795,640,,22155.0,58.137726,190328.0,5234.0,2.812020,485.423869,22.961176
54,54,California,04-13-2020,23931,714,,23217.0,61.035048,190882.0,3015.0,2.987756,486.836823,12.598721
104,104,California,04-14-2020,25356,767,,24589.0,64.669453,202208.0,5163.0,3.028869,515.723328,20.362044
154,154,California,04-15-2020,26686,860,,25826.0,68.061564,216486.0,5163.0,3.226411,552.138790,19.347223
204,204,California,04-16-2020,27677,956,,26721.0,70.589069,246400.0,5031.0,3.457745,628.433237,18.177548
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6854,6854,California,08-27-2020,691821,12677,,679144.0,1750.903765,10918415.0,,1.832410,27633.006121,
6904,6904,California,08-28-2020,697385,12805,,684580.0,1764.985483,11010637.0,,1.836145,27866.407314,
6954,6954,California,08-29-2020,702038,12894,,689144.0,1776.761586,11109630.0,,1.836653,28116.944977,
7004,7004,California,08-30-2020,705951,12937,,693014.0,1786.664851,11231829.0,,1.832563,28426.213833,
