# Project Stage - III (Basic Machine Learning)

### Import essential libraries

In [71]:
import os
# Science libraries
import pandas as pd
import numpy as np
# Ploting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly
import plotly.graph_objs as go
# Machine learning
from sklearn.linear_model import LinearRegression

In [51]:
# Create folder to hold the graphs
if not os.path.exists("images"):
    os.mkdir("images")

### Read and Filtter Dataset

#### Read dataset

In [52]:
# Read super Covid-19 dataframe with state as index. 
covid = pd.read_csv("../../data/output/covid.csv", index_col=0)

#### Clean the dataset.
remove unnessasery columns or rows.

In [53]:
# Drop unneccesary variables in Super covid-19 dataframe. 
covid.drop(["countyFIPS","stateFIPS"], axis=1, inplace=True)

# Drop unneccesary rows in Super COVID-19 dataframe.
covid.drop(
    covid.loc[covid['County Name'] == ("Statewide Unallocated" or 'New York City Unallocated')].index,
    inplace=True)

# Reset indexes after removing rows
covid.reset_index(drop=True, inplace=True)

#### Splite super covid dataset
Create dataframe that hold covid death and dataframe that hold covid cases.

In [54]:
# Regalare expression that will help filltering the data.
regex_cases = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[x])|^County Name$|^State$|^population$'
regex_deaths = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[y])|^County Name$|^State$|^population$'

cases = covid.filter(regex=regex_cases)
deaths = covid.filter(regex=regex_deaths)

Save the first three columns temprary so that we fix:
1. the date syntax.
2. Find the new cases/deaths

In [55]:
# Split the data: first three columns
covid_columns = covid[covid.columns[:3]]

# Split the data: fourth to the end columns
cases_data = cases[cases.columns[3:]]
deaths_data = deaths[deaths.columns[3:]]

#### Convert date type from String to Timestamp

In [56]:
# Fixing date string syntax
daily_cases = cases_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')
daily_deaths = deaths_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')

In [57]:
daily_cases

Unnamed: 0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-31,2020-11-01,2020-11-02,2020-11-03,2020-11-04,2020-11-05,2020-11-06,2020-11-07,2020-11-08,2020-11-09
0,0,0,0,0,0,0,0,0,0,0,...,2159,2173,2186,2197,2212,2230,2242,2267,2283,2304
1,0,0,0,0,0,0,0,0,0,0,...,6940,6966,6985,6995,7061,7097,7134,7188,7226,7263
2,0,0,0,0,0,0,0,0,0,0,...,1060,1061,1065,1074,1079,1080,1090,1092,1095,1098
3,0,0,0,0,0,0,0,0,0,0,...,873,878,883,890,897,907,917,924,926,932
4,0,0,0,0,0,0,0,0,0,0,...,2074,2095,2108,2162,2188,2222,2253,2286,2297,2335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,0,0,0,0,0,0,0,0,0,0,...,553,585,601,621,638,658,681,692,726,776
3141,0,0,0,0,0,0,0,0,0,0,...,796,814,847,862,873,897,922,932,977,1007
3142,0,0,0,0,0,0,0,0,0,0,...,479,488,492,510,518,529,542,544,551,568
3143,0,0,0,0,0,0,0,0,0,0,...,158,160,160,167,175,179,185,185,187,190


#### Find the daily new cases/death.
Calculate the diffrence between the current day and the day before it. The results is the new cases.

In [58]:
daily_cases = daily_cases.diff(axis=1).fillna(0)
daily_deaths = daily_deaths.diff(axis=1).fillna(0)

#### Merge the popolation, states and county names to the daily cases/deaths

In [59]:
new_cases = covid_columns.merge(daily_cases, left_index=True, right_index=True)
new_deaths = covid_columns.merge(daily_deaths, left_index=True, right_index=True)

In [60]:
new_cases.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-10-31 00:00:00,2020-11-01 00:00:00,2020-11-02 00:00:00,2020-11-03 00:00:00,2020-11-04 00:00:00,2020-11-05 00:00:00,2020-11-06 00:00:00,2020-11-07 00:00:00,2020-11-08 00:00:00,2020-11-09 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,14.0,13.0,11.0,15.0,18.0,12.0,25.0,16.0,21.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.0,26.0,19.0,10.0,66.0,36.0,37.0,54.0,38.0,37.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,4.0,9.0,5.0,1.0,10.0,2.0,3.0,3.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,5.0,5.0,7.0,7.0,10.0,10.0,7.0,2.0,6.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35.0,21.0,13.0,54.0,26.0,34.0,31.0,33.0,11.0,38.0


In [61]:
new_deaths.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-10-31 00:00:00,2020-11-01 00:00:00,2020-11-02 00:00:00,2020-11-03 00:00:00,2020-11-04 00:00:00,2020-11-05 00:00:00,2020-11-06 00:00:00,2020-11-07 00:00:00,2020-11-08 00:00:00,2020-11-09 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,3.0,0.0,6.0,0.0,0.0,0.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Linear regression models for predicting cases and deaths in US.

#### United States Cases

Sum up all states daily cases

In [62]:
united_states_cases = pd.DataFrame(new_cases[new_cases.columns[3:]].sum(axis=0))

In [63]:
united_states_cases

Unnamed: 0,0
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,1.0
2020-01-25,0.0
2020-01-26,3.0
...,...
2020-11-05,117655.0
2020-11-06,132276.0
2020-11-07,112610.0
2020-11-08,103701.0


Fit a linewar regression model using sklearn's LinearRegression package

In [64]:
X, Y = united_states_cases.T, united_states_cases.columns

mdl = LinearRegression().fit(X,Y)
result = mdl.predict(X)
m = mdl.coef_[0]
b = mdl.intercept_
print(f'formula: y= {m}x + {b}')

formula: y= 0.0x + 0.0


In [65]:
[i for i in range(X.index.size)]

[0]

In [66]:
index = [i for i in range(united_states_cases.index.size)]
united_states_cases

Unnamed: 0,0
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,1.0
2020-01-25,0.0
2020-01-26,3.0
...,...
2020-11-05,117655.0
2020-11-06,132276.0
2020-11-07,112610.0
2020-11-08,103701.0


In [67]:
united_states_cases=united_states_cases.reset_index().rename(columns={'index':'Date', 0:'Cases'})
united_states_cases

Unnamed: 0,Date,Cases
0,2020-01-22,0.0
1,2020-01-23,0.0
2,2020-01-24,1.0
3,2020-01-25,0.0
4,2020-01-26,3.0
...,...,...
288,2020-11-05,117655.0
289,2020-11-06,132276.0
290,2020-11-07,112610.0
291,2020-11-08,103701.0


In [73]:
df = united_states_cases
X = united_states_cases.index.values.reshape(-1, 1)

model = LinearRegression()
model.fit(X, df.Cases)

x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig = go.Figure()
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Cases'],
        opacity=0.65,
        name="Cases",
        mode="markers",
        line=dict(width=2, color=cols[0]),
    ))
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=y_range,
        opacity=0.65,
        name='Regression Fit',
        line=dict(width=2, color=cols[0]),
    ))
# Set custom x-axis labels
fig.update_xaxes(
    tickvals=df.index[0::40],
    ticktext=df['Date'][0::40],
)
fig.show()

In [75]:
from sklearn.metrics import mean_squared_error 
import math
mse = mean_squared_error(x_range, y_range)

rmse = math.sqrt(mse)
print(rmse)

40089.40391163356


In [None]:
X = united_states_cases.values.reshape(-1, 1)
Y = united_states_cases.index.values

fig = px.scatter(united_states_cases,
                 x=index,
                 y=0,
                 opacity=0.65,
                 trendline="ols")

# Set custom x-axis labels
fig.update_xaxes(
    ticktext=united_states_cases.index[0::40],
    tickvals=index[0::40],
)
fig

## Non-Linear regression models for predicting cases and deaths in US.

In [74]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def format_coefs(coefs):
    equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)]
    equation = "$" +  " + ".join(equation_list) + "$"

    replace_map = {"x^0": "", "x^1": "x", '+ -': '- '}
    for old, new in replace_map.items():
        equation = equation.replace(old, new)

    return equation


x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)

fig = px.scatter(df, x=df.index, y='Cases', opacity=0.65)
for degree in [1, 2, 3, 4]:
    poly = PolynomialFeatures(degree)
    poly.fit(X)
    X_poly = poly.transform(X)
    x_range_poly = poly.transform(x_range)

    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, df.Cases)
    y_poly = model.predict(x_range_poly)

    equation = format_coefs(model.coef_.round(2))
    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation))

fig.show()