# Project Stage - III (Basic Machine Learning)

### Import essential libraries

In [100]:
import os
import math
# Science libraries
import pandas as pd
import numpy as np
# Ploting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly
import plotly.graph_objs as go
# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error 

In [101]:
# Create folder to hold the graphs
if not os.path.exists("images"):
    os.mkdir("images")

### Read and Filtter Dataset

#### Read dataset

In [102]:
# Read super Covid-19 dataframe with state as index. 
covid = pd.read_csv("../../data/output/covid.csv", index_col=0)
CountryNewCases = pd.read_csv("../../data/output/Country_NewCases.csv")
CountryNewDeaths = pd.read_csv("../../data/output/Country_NewDeaths.csv")


#### Clean the dataset.
remove unnessasery columns or rows.

In [104]:
# Drop unneccesary variables in Super covid-19 dataframe. 
covid.drop(["countyFIPS","stateFIPS"], axis=1, inplace=True)

# Drop unneccesary rows in Super COVID-19 dataframe.
covid.drop(
    covid.loc[covid['County Name'] == ("Statewide Unallocated" or 'New York City Unallocated')].index,
    inplace=True)

# Reset indexes after removing rows
covid.reset_index(drop=True, inplace=True)

#### Splite super covid dataset
Create dataframe that hold covid death and dataframe that hold covid cases.

In [105]:
# Regalare expression that will help filltering the data.
regex_cases = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[x])|^County Name$|^State$|^population$'
regex_deaths = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[y])|^County Name$|^State$|^population$'

cases = covid.filter(regex=regex_cases)
deaths = covid.filter(regex=regex_deaths)

Save the first three columns temprary so that we fix:
1. the date syntax.
2. Find the new cases/deaths

In [106]:
# Split the data: first three columns
covid_columns = covid[covid.columns[:3]]

# Split the data: fourth to the end columns
cases_data = cases[cases.columns[3:]]
deaths_data = deaths[deaths.columns[3:]]

#### Convert date type from String to Timestamp

In [107]:
# Fixing date string syntax
daily_cases = cases_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')
daily_deaths = deaths_data.rename(
    columns=lambda x: (pd.to_datetime((x.split('_')[0]), errors='ignore')),
    errors='raise')

#### Find the daily new cases/death.
Calculate the diffrence between the current day and the day before it. The results is the new cases.

In [108]:
daily_cases = daily_cases.diff(axis=1).fillna(0)
daily_deaths = daily_deaths.diff(axis=1).fillna(0)

#### Merge the popolation, states and county names to the daily cases/deaths

In [109]:
new_cases = covid_columns.merge(daily_cases, left_index=True, right_index=True)
new_deaths = covid_columns.merge(daily_deaths, left_index=True, right_index=True)

In [110]:
new_cases.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-11-05 00:00:00,2020-11-06 00:00:00,2020-11-07 00:00:00,2020-11-08 00:00:00,2020-11-09 00:00:00,2020-11-10 00:00:00,2020-11-11 00:00:00,2020-11-12 00:00:00,2020-11-13 00:00:00,2020-11-14 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,12.0,25.0,16.0,21.0,24.0,23.0,34.0,32.0,18.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,37.0,54.0,38.0,37.0,85.0,61.0,45.0,69.0,73.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,10.0,2.0,3.0,3.0,9.0,5.0,1.0,4.0,6.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,10.0,7.0,2.0,6.0,16.0,13.0,5.0,7.0,5.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.0,31.0,33.0,11.0,38.0,43.0,22.0,29.0,59.0,30.0


In [111]:
new_deaths.head()

Unnamed: 0,County Name,State,population,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00,2020-01-27 00:00:00,2020-01-28 00:00:00,...,2020-11-05 00:00:00,2020-11-06 00:00:00,2020-11-07 00:00:00,2020-11-08 00:00:00,2020-11-09 00:00:00,2020-11-10 00:00:00,2020-11-11 00:00:00,2020-11-12 00:00:00,2020-11-13 00:00:00,2020-11-14 00:00:00
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0


## Linear regression models for predicting cases and deaths in US.

#### United States Cases

Sum up all states daily cases

In [112]:
united_states_cases = pd.DataFrame(new_cases[new_cases.columns[3:]].sum(axis=0))
united_states_deaths = pd.DataFrame(new_deaths[new_deaths.columns[3:]].sum(axis=0))

Fit a linewar regression model using sklearn's LinearRegression package

In [113]:
united_states_cases_ = united_states_cases.reset_index().rename(columns ={'index': 'Date', 0: 'Cases'})
united_states_deaths_ = united_states_deaths.reset_index().rename(columns ={'index': 'Date', 0: 'Deaths'})

In [114]:
df = united_states_cases_
X = united_states_cases_.index.values.reshape(-1, 1)

# fitting the data
model = LinearRegression()
model.fit(X, df['Cases'])

x_range = np.linspace(X.min(), X.max(), 298)
y_range = model.predict(np.array(united_states_cases_.index).reshape(-1,1))

# Plot using plotly
fig = go.Figure()
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Cases'],
        opacity=0.65,
        name="Cases",
        mode="markers",
        line=dict(width=2, color=cols[0]),
    ))
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=y_range,
        opacity=0.65,
        name='Regression Fit',
        line=dict(width=2, color=cols[0]),
    ))
# Set custom x-axis labels
fig.update_xaxes(
    tickvals=df.index[0::40],
    ticktext=df['Date'][0::40],
)
fig.show()

# plot using mathlap
# plt.plot(X.squeeze(), df['Cases'], 'o')
# plt.plot(x_range.squeeze(), y_range);

In [115]:
mse = mean_squared_error(x_range, y_range)

rmse = np.sqrt(mse)
print(rmse)

43191.514603438


In [116]:
df = united_states_cases_
X = united_states_cases_.index.values.reshape(-1, 1)

# fitting the data
model = LinearRegression()
model.fit(X, df['Cases'])
test = np.array([299,300,301,302,303,304,305]).reshape(-1,1)

x_range = np.linspace(X.min(), X.max(), 293)
y_range_test = model.predict(test)

# Plot using plotly
fig = go.Figure()
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

# fig.add_trace(
#     go.Scatter(
#         x=df.index,
#         y=df['Cases'],
#         opacity=0.65,
#         name="Cases",
#         mode="markers",
#         line=dict(width=2, color=cols[0]),
#     ))
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=y_range_test,
        opacity=0.65,
        name='Regression Fit',
        line=dict(width=2, color=cols[0]),
    ))

fig.show()

# plt.plot(test.squeeze(), df[0], 'o')
# plt.plot(test.squeeze(), y_range_test);

## Non-Linear regression models for predicting cases and deaths in US.

In [117]:
def format_coefs(coefs):
    equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)]
    equation = "$" +  " + ".join(equation_list) + "$"

    replace_map = {"x^0": "", "x^1": "x", '+ -': '- '}
    for old, new in replace_map.items():
        equation = equation.replace(old, new)

    return equation

In [118]:
x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)

fig = px.scatter(df, x=df.index, y='Cases', opacity=0.65)
degrees = []
for degree in [ 2, 3, 4, 5, 6, 7]:
    poly = PolynomialFeatures(degree)
    poly.fit(X)
    X_poly = poly.transform(X)
    x_range_poly = poly.transform(x_range)

    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, df.Cases)
    y_poly = model.predict(x_range_poly)

    equation = format_coefs(model.coef_.round(2))
    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation))
    degrees.append(y_poly)
# Set custom x-axis labels
fig.update_xaxes(
    tickvals=df.index[0::40],
    ticktext=df['Date'][0::40],
)

fig.show()

In [119]:
mse = mean_squared_error(x_range, degrees[5])

rmse = math.sqrt(mse)


print(rmse)

46467.62841730884


In [120]:
x_range = np.array(test).reshape(-1, 1)

# fig = px.scatter(df, x=df.index, y='Cases', opacity=0.65)
degrees = []
for degree in [ 2, 3, 4, 5, 6, 7]:
    poly = PolynomialFeatures(degree)
    poly.fit(X)
    X_poly = poly.transform(X)
    x_range_poly = poly.transform(x_range)

    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, df.Cases)
    y_poly = model.predict(x_range_poly)

    equation = format_coefs(model.coef_.round(2))
    fig.add_traces(go.Scatter(x=test.squeeze(), y=y_poly, name=equation))
    degrees.append(y_poly)

fig.show()

In [121]:
united_states_cases_=united_states_cases_.iloc[:268]
united_states_deaths_=united_states_deaths_.iloc[:268]

In [122]:
CountryNewCases = CountryNewCases.merge(united_states_cases_['Cases'], left_index=True, right_index=True).rename(columns={'Cases': 'US'})
CountryNewDeaths = CountryNewDeaths.merge(united_states_deaths_['Deaths'], left_index=True, right_index=True).rename(columns={'Deaths': 'US'})

In [123]:
fig = go.Figure()
days = CountryNewCases.index.values.reshape(-1, 1)
def draw(df, name):
    X = days
    model = LinearRegression()
    model.fit(X, df.values.reshape(-1, 1))

    x_range = np.linspace(X.min(), X.max(), df.size)
    y_range = model.predict(x_range.reshape(-1, 1))
    color1 = list(np.random.choice(range(256), size=3))
    Y_Range = pd.Series(y_range[:,0], index=df.index)
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df.values,
            opacity=0.65,
            name=name,
            mode="markers",
            line=dict(width=2, color=f'rgb({color1[0]},{color1[1]},{color1[2]} )'),
        ))
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=Y_Range,
            opacity=1,
            name='Regression ' + name,
            line=dict(width=2, color=f'rgb({color1[0]},{color1[1]},{color1[2]} )'),
        ))
# # Set custom x-axis labels
# fig.update_xaxes(
#     tickvals=df.index[0::40],
#     ticktext=df['Date'][0::40],
# )


draw(CountryNewCases['US'], 'US')
draw(CountryNewCases['Bangladesh'], 'Bangladesh')
draw(CountryNewCases['Indonesia'], 'Indonesia')
draw(CountryNewCases['Pakistan'], 'Pakistan')
draw(CountryNewCases['Brazil'], 'Brazil')
draw(CountryNewCases['Nigeria'], 'Nigeria')

fig.show()

In [124]:
fig = go.Figure()
days = CountryNewDeaths.index.values.reshape(-1, 1)
def draw(df, name):
    X = days
    model = LinearRegression()
    model.fit(X, df.values.reshape(-1, 1))

    x_range = np.linspace(X.min(), X.max(), df.size)
    y_range = model.predict(x_range.reshape(-1, 1))
    color1 = list(np.random.choice(range(256), size=3))
    Y_Range = pd.Series(y_range[:,0], index=df.index)
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df.values,
            opacity=0.65,
            name=name,
            mode="markers",
            line=dict(width=2, color=f'rgb({color1[0]},{color1[1]},{color1[2]} )'),
        ))
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=Y_Range,
            opacity=1,
            name='Regression ' + name,
            line=dict(width=2, color=f'rgb({color1[0]},{color1[1]},{color1[2]} )'),
        ))
# # Set custom x-axis labels
# fig.update_xaxes(
#     tickvals=df.index[0::40],
#     ticktext=df['Date'][0::40],
# )


draw(CountryNewDeaths['US'], 'US')
draw(CountryNewDeaths['Bangladesh'], 'Bangladesh')
draw(CountryNewDeaths['Indonesia'], 'Indonesia')
draw(CountryNewDeaths['Pakistan'], 'Pakistan')
draw(CountryNewDeaths['Brazil'], 'Brazil')
draw(CountryNewDeaths['Nigeria'], 'Nigeria')

fig.show()

### This graph show the cases trends for the countires.