# Project Stage - III (Basic Machine Learning)

### Import essential libraries

In [1]:
import os
# Science libraries
import pandas as pd
import numpy as np
# Ploting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
# Machine learning
from sklearn.linear_model import LinearRegression

In [2]:
# Create folder to hold the graphs
if not os.path.exists("images"):
    os.mkdir("images")

### Read and Filtter Dataset

#### Read dataset

In [3]:
# Read super Covid-19 dataframe with state as index. 
covid = pd.read_csv("../../data/output/covid.csv", index_col=0)

#### Clean the dataset.
remove unnessasery columns or rows.

In [4]:
# Drop unneccesary variables in Super covid-19 dataframe. 
covid.drop(["countyFIPS","stateFIPS"], axis=1, inplace=True)

# Drop unneccesary rows in Super COVID-19 dataframe.
covid.drop(
    covid.loc[covid['County Name'] == ("Statewide Unallocated" or 'New York City Unallocated')].index,
    inplace=True)

# Reset indexes after removing rows
covid.reset_index(drop=True, inplace=True)

#### Splite super covid dataset
Create dataframe that hold covid death and dataframe that hold covid cases.

In [5]:
# Regalare expression that will help filltering the data.
regex_cases = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[x])|^County Name$|^State$|^population$'
regex_deaths = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[y])|^County Name$|^State$|^population$'

cases = covid.filter(regex=regex_cases)
deaths = covid.filter(regex=regex_deaths)

Save the first three columns temprary so that we fix:
1. the date syntax.
2. Find the new cases/deaths

In [6]:
# Split the data: first three columns
covid_columns = covid[covid.columns[:3]]

# Split the data: fourth to the end columns
cases_data = cases[cases.columns[3:]]
deaths_data = deaths[deaths.columns[3:]]

#### Convert date type from String to Timestamp

In [7]:
# Fixing date string syntax
daily_cases = cases_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')
daily_deaths = deaths_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')

In [8]:
daily_cases

Unnamed: 0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-07,2020-10-08,2020-10-09,2020-10-10,2020-10-11,2020-10-12,2020-10-13,2020-10-14,2020-10-15,2020-10-16
0,0,0,0,0,0,0,0,0,0,0,...,1852,1863,1882,1898,1905,1911,1924,1928,1949,1966
1,0,0,0,0,0,0,0,0,0,0,...,6134,6141,6172,6190,6203,6220,6248,6270,6285,6333
2,0,0,0,0,0,0,0,0,0,0,...,927,927,939,942,942,944,950,950,965,968
3,0,0,0,0,0,0,0,0,0,0,...,703,708,719,726,736,738,744,744,761,771
4,0,0,0,0,0,0,0,0,0,0,...,1673,1681,1689,1704,1713,1722,1742,1750,1768,1783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,0,0,0,0,0,0,0,0,0,0,...,378,382,393,394,394,394,401,402,406,410
3141,0,0,0,0,0,0,0,0,0,0,...,633,645,656,658,662,675,679,686,687,692
3142,0,0,0,0,0,0,0,0,0,0,...,368,373,378,379,380,381,384,385,388,392
3143,0,0,0,0,0,0,0,0,0,0,...,119,123,123,124,125,126,127,132,132,133


#### Find the daily new cases/death.
Calculate the diffrence between the current day and the day before it. The results is the new cases.

In [9]:
daily_cases = daily_cases.diff(axis=1).fillna(0)
daily_deaths = daily_deaths.diff(axis=1).fillna(0)

#### Merge the popolation, states and county names to the daily cases/deaths

In [10]:
new_cases = covid_columns.merge(daily_cases, left_index=True, right_index=True)
new_deaths = covid_columns.merge(daily_deaths, left_index=True, right_index=True)

In [11]:
new_cases.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-10-07 00:00:00,2020-10-08 00:00:00,2020-10-09 00:00:00,2020-10-10 00:00:00,2020-10-11 00:00:00,2020-10-12 00:00:00,2020-10-13 00:00:00,2020-10-14 00:00:00,2020-10-15 00:00:00,2020-10-16 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.0,11.0,19.0,16.0,7.0,6.0,13.0,4.0,21.0,17.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,7.0,31.0,18.0,13.0,17.0,28.0,22.0,15.0,48.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,12.0,3.0,0.0,2.0,6.0,0.0,15.0,3.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,5.0,11.0,7.0,10.0,2.0,6.0,0.0,17.0,10.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,8.0,8.0,15.0,9.0,9.0,20.0,8.0,18.0,15.0


In [12]:
new_deaths.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-10-07 00:00:00,2020-10-08 00:00:00,2020-10-09 00:00:00,2020-10-10 00:00:00,2020-10-11 00:00:00,2020-10-12 00:00:00,2020-10-13 00:00:00,2020-10-14 00:00:00,2020-10-15 00:00:00,2020-10-16 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0


## Linear regression models for predicting cases and deaths in US.

#### United States Cases

Sum up all states daily cases

In [13]:
united_states_cases = pd.DataFrame(new_cases[new_cases.columns[3:]].sum(axis=0))

In [14]:
united_states_cases

Unnamed: 0,0
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,1.0
2020-01-25,0.0
2020-01-26,3.0
...,...
2020-10-12,42554.0
2020-10-13,50399.0
2020-10-14,59692.0
2020-10-15,66735.0


Fit a linewar regression model using sklearn's LinearRegression package

In [15]:
X, Y = united_states_cases.T, united_states_cases.columns

mdl = LinearRegression().fit(X,Y)
result = mdl.predict(X)
m = mdl.coef_[0]
b = mdl.intercept_
print(f'formula: y= {m}x + {b}')

formula: y= 0.0x + 0.0


In [16]:
[i for i in range(X.index.size)]

[0]

In [17]:
index = [i for i in range(united_states_cases.index.size)]
united_states_cases

Unnamed: 0,0
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,1.0
2020-01-25,0.0
2020-01-26,3.0
...,...
2020-10-12,42554.0
2020-10-13,50399.0
2020-10-14,59692.0
2020-10-15,66735.0


In [18]:
united_states_cases

Unnamed: 0,0
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,1.0
2020-01-25,0.0
2020-01-26,3.0
...,...
2020-10-12,42554.0
2020-10-13,50399.0
2020-10-14,59692.0
2020-10-15,66735.0


In [21]:
X = united_states_cases.values.reshape(-1, 1)
Y = united_states_cases.index.values

fig = px.scatter(united_states_cases,
                 x=index,
                 y=0,
                 opacity=0.65,
                 trendline="ols")

# Set custom x-axis labels
fig.update_xaxes(
    ticktext=united_states_cases.index[0::40],
    tickvals=index[0::40],
)
fig

## Non-Linear regression models for predicting cases and deaths in US.