# Project Stage - III (Basic Machine Learning)

### Import essential libraries

In [87]:
import os
# Science libraries
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import plotly
# Machine learning
from sklearn.linear_model import LinearRegression

In [2]:
# Create the folder that holds the exported graphs.
# `exist_ok=True` keeps the cell safe to re-run and avoids the gap between
# checking for the folder and creating it.
os.makedirs("images", exist_ok=True)

### Read and Filter Dataset

#### Read dataset

In [3]:
# Load the combined ("super") COVID-19 table; the first CSV column becomes
# the row index.
covid = pd.read_csv(
    "../../../data/output/covid.csv",
    index_col=0,
)

#### Clean the dataset
Remove unnecessary columns and rows.

In [4]:
# Drop the FIPS identifier columns, which are not used in this notebook.
covid = covid.drop(columns=["countyFIPS", "stateFIPS"])

# Drop the placeholder rows that do not correspond to a county.
# `isin` is required here: in Python `("A" or "B")` evaluates to "A", so
# comparing against such an expression would only ever match the first name
# and leave the "New York City Unallocated" rows in the table.
unallocated_names = ["Statewide Unallocated", "New York City Unallocated"]
covid = covid[~covid["County Name"].isin(unallocated_names)]

# Renumber the rows 0..n-1 after removing rows.
covid = covid.reset_index(drop=True)

#### Split the super COVID dataset
Create one dataframe that holds the COVID deaths and one that holds the COVID cases.

In [5]:
# Regular expressions used to select columns by name: date-like labels
# (digits/digits/digits) followed by "_x" for cases or "_y" for deaths, plus
# the three descriptive columns shared by both tables.
regex_cases = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[x])|^County Name$|^State$|^population$'
regex_deaths = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[y])|^County Name$|^State$|^population$'

# Keep only the columns whose names match each pattern.
cases, deaths = (
    covid.filter(regex=pattern) for pattern in (regex_cases, regex_deaths)
)

Save the first three columns temporarily so that we can:
1. Fix the date syntax.
2. Find the new cases/deaths.

In [6]:
# Descriptive part: the first three columns of the full table.
covid_columns = covid.iloc[:, :3]

# Numeric part: everything from the fourth column onward in each table.
cases_data = cases.iloc[:, 3:]
deaths_data = deaths.iloc[:, 3:]

#### Convert date type from String to Timestamp

In [7]:
def _date_label(column_name):
    """Return the ``YYYY-MM-DD`` text for a raw ``<date>_<suffix>`` column label."""
    raw_date = column_name.split('_')[0]
    # str(Timestamp) looks like "2020-01-22 00:00:00"; keep only the date part.
    timestamp_text = str(pd.to_datetime(raw_date))
    return timestamp_text.split(" ")[0]


# Relabel every date column of both tables with its normalised date string.
daily_cases = cases_data.rename(columns=_date_label, errors='raise')
daily_deaths = deaths_data.rename(columns=_date_label, errors='raise')

In [8]:
# Preview the case table after its date columns were relabelled.
daily_cases

Unnamed: 0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-07,2020-10-08,2020-10-09,2020-10-10,2020-10-11,2020-10-12,2020-10-13,2020-10-14,2020-10-15,2020-10-16
0,0,0,0,0,0,0,0,0,0,0,...,1852,1863,1882,1898,1905,1911,1924,1928,1949,1966
1,0,0,0,0,0,0,0,0,0,0,...,6134,6141,6172,6190,6203,6220,6248,6270,6285,6333
2,0,0,0,0,0,0,0,0,0,0,...,927,927,939,942,942,944,950,950,965,968
3,0,0,0,0,0,0,0,0,0,0,...,703,708,719,726,736,738,744,744,761,771
4,0,0,0,0,0,0,0,0,0,0,...,1673,1681,1689,1704,1713,1722,1742,1750,1768,1783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,0,0,0,0,0,0,0,0,0,0,...,378,382,393,394,394,394,401,402,406,410
3141,0,0,0,0,0,0,0,0,0,0,...,633,645,656,658,662,675,679,686,687,692
3142,0,0,0,0,0,0,0,0,0,0,...,368,373,378,379,380,381,384,385,388,392
3143,0,0,0,0,0,0,0,0,0,0,...,119,123,123,124,125,126,127,132,132,133


#### Find the daily new cases/deaths
Calculate the difference between the current day and the day before it. The result is the number of new cases (or deaths) for that day.

In [9]:
# Day-over-day change along each row. The first date column has no
# predecessor, so its NaN is replaced with 0.
# NOTE: both names are overwritten with their own difference, so running this
# cell a second time differences the already-differenced values.
daily_cases, daily_deaths = (
    frame.diff(axis="columns").fillna(0)
    for frame in (daily_cases, daily_deaths)
)

#### Merge the population, state and county names with the daily cases/deaths

In [10]:
# Re-attach the descriptive columns to the daily values, matching rows on
# their index labels.
new_cases = pd.merge(
    covid_columns, daily_cases, left_index=True, right_index=True
)
new_deaths = pd.merge(
    covid_columns, daily_deaths, left_index=True, right_index=True
)

In [11]:
# First rows of the merged daily new-cases table.
new_cases.head()

Unnamed: 0,County Name,State,population,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,...,2020-10-07,2020-10-08,2020-10-09,2020-10-10,2020-10-11,2020-10-12,2020-10-13,2020-10-14,2020-10-15,2020-10-16
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.0,11.0,19.0,16.0,7.0,6.0,13.0,4.0,21.0,17.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,7.0,31.0,18.0,13.0,17.0,28.0,22.0,15.0,48.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,12.0,3.0,0.0,2.0,6.0,0.0,15.0,3.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,5.0,11.0,7.0,10.0,2.0,6.0,0.0,17.0,10.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,8.0,8.0,15.0,9.0,9.0,20.0,8.0,18.0,15.0


In [121]:
# First rows of the merged daily new-deaths table.
new_deaths.head()

Unnamed: 0,County Name,State,population,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,...,2020-10-07,2020-10-08,2020-10-09,2020-10-10,2020-10-11,2020-10-12,2020-10-13,2020-10-14,2020-10-15,2020-10-16
0,Autauga County,AL,55869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Baldwin County,AL,223234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,Barbour County,AL,24686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bibb County,AL,22394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Blount County,AL,57826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0


## Linear regression models for predicting cases and deaths in US.

#### Florida cases

Filter the dataset to get the state of Florida and its counties.

In [134]:
# Keep only the rows whose State is "FL".
is_florida = new_cases["State"].eq("FL")
florida_cases_with_counties = new_cases[is_florida]

In [135]:
# Add up every column after the first three across the Florida rows, then lay
# the totals out as a single row (one column per date).
florida_daily_totals = florida_cases_with_counties.iloc[:, 3:].sum(axis=0)
florida_cases = pd.DataFrame(florida_daily_totals).T

In [136]:
# Swap rows and columns so that each date becomes a row.
# NOTE: this reassigns the same name, so re-running the cell flips it back.
florida_cases = florida_cases.transpose()

Fit a linear regression model using scikit-learn's `LinearRegression` class.

In [137]:
# Move the date labels out of the index into a 'Date' column and give the
# totals column (labelled 0) the name 'Cases'.
florida_cases_ = (
    florida_cases
    .reset_index()
    .rename(columns={'index': 'Date', 0: 'Cases'})
)

In [138]:
# Fit a least-squares line to Florida's statewide daily new cases, using the
# row position (0, 1, 2, ... in date order) as the single predictor.
df = florida_cases_
X = df.index.values.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix

model = LinearRegression()
model.fit(X, df.Cases)

# Evaluate the fitted line on an evenly spaced grid so it can be drawn.
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig = go.Figure()
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

# Observed values: markers are styled through `marker`, not `line`.
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Cases'],
        opacity=0.65,
        name="Cases",
        mode="markers",
        marker=dict(color=cols[0]),
    ))
# Fitted line: drawn in a second colour so it stands out from the markers.
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=y_range,
        opacity=0.65,
        name='Regression Fit',
        mode="lines",
        line=dict(width=2, color=cols[1]),
    ))
# The x values are row positions; label every 40th tick with its date.
fig.update_xaxes(
    tickvals=df.index[0::40],
    ticktext=df['Date'][0::40],
)
fig.update_layout(
    title="Florida Daily New Covid-19 Cases With Linear Regression Fit",
    xaxis_title="Date",
    yaxis_title="Daily New Cases",
    legend_title="Series",
)
fig.show()

Florida's counties

In [139]:
# Swap rows and columns so that each county becomes a column.
# NOTE: this reassigns the same name, so re-running the cell flips it back.
florida_cases_with_counties = florida_cases_with_counties.transpose()

In [140]:
# First entry (the county name) of column 322. `.iloc[0]` is used because this
# column's index holds string labels, and plain `[0]` on such a Series relies
# on a deprecated positional fallback.
florida_cases_with_counties[322].iloc[0]

'Alachua County'

In [141]:
# Show the transposed Florida table (one column per county).
florida_cases_with_counties

Unnamed: 0,322,323,324,325,326,327,328,329,330,331,...,379,380,381,382,383,384,385,386,387,388
County Name,Alachua County,Baker County,Bay County,Bradford County,Brevard County,Broward County,Calhoun County,Charlotte County,Citrus County,Clay County,...,Sarasota County,Seminole County,Sumter County,Suwannee County,Taylor County,Union County,Volusia County,Wakulla County,Walton County,Washington County
State,FL,FL,FL,FL,FL,FL,FL,FL,FL,FL,...,FL,FL,FL,FL,FL,FL,FL,FL,FL,FL
population,269043,29210,174705,28201,601942,1952778,14105,188910,149657,219252,...,433742,471826,132420,44417,21569,15237,553284,33739,74071,25473
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-12,54,2,2,2,23,102,3,6,3,19,...,45,19,4,9,1,0,39,5,3,3
2020-10-13,70,3,44,4,89,177,1,12,10,32,...,52,40,26,9,4,8,50,1,10,2
2020-10-14,78,17,19,6,85,265,1,22,23,21,...,50,25,26,12,4,4,67,8,28,3
2020-10-15,57,13,28,7,73,236,5,29,5,40,...,55,37,16,10,2,17,90,10,23,10


In [142]:
# Skip the first three rows, then move the remaining row labels out of the
# index into a 'Date' column.
florida_cases_with_counties_ = (
    florida_cases_with_counties
    .iloc[3:]
    .reset_index()
    .rename(columns={'index': 'Date'})
)

In [158]:
# Fit and plot one least-squares line per Florida county.

# County names, looked up by column label.
county_names = florida_cases_with_counties.loc["County Name"]
# Every column except 'Date' is a county. Iterating over the actual labels
# avoids a hard-coded start and includes the last county, which
# `range(start, last_label)` would skip.
county_ids = florida_cases_with_counties_.columns.drop("Date")
# Fixed seed so the random trace colours are the same on every run.
rng = np.random.default_rng(0)

fig = go.Figure()
for i in county_ids:
    df = florida_cases_with_counties_[['Date', i]]
    X = df.index.values.reshape(-1, 1)  # row position as the single predictor
    # The transposed table mixes text and numbers per column, so cast the
    # county's values to float before fitting and plotting.
    y = df[i].astype(float)

    model = LinearRegression()
    model.fit(X, y)

    x_range = np.linspace(X.min(), X.max(), 100)
    y_range = model.predict(x_range.reshape(-1, 1))

    # One random colour per county, shared by its markers and its fitted line.
    red, green, blue = rng.integers(0, 256, size=3)
    color = f'rgb({red},{green},{blue})'
    county = county_names.loc[i]

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=y,
            opacity=0.65,
            name=county,
            mode="markers",
            marker=dict(color=color),
        ))
    fig.add_trace(
        go.Scatter(
            x=x_range,
            y=y_range,
            opacity=0.65,
            name='Regression ' + county,
            mode="lines",
            line=dict(width=2, color=color),
        ))

# The x values are row positions; label every 40th tick with its date.
fig.update_xaxes(
    tickvals=florida_cases_with_counties_.index[0::40],
    ticktext=florida_cases_with_counties_['Date'][0::40],
)
fig.update_layout(
    title="Daily New Covid-19 Cases per Florida County With Linear Regression Fits",
    xaxis_title="Date",
    yaxis_title="Daily New Cases",
    legend_title="Counties",
)
fig.show()

In [None]:
# First entry of column 322, selected by position.
florida_cases_with_counties[322].iloc[0]

In [None]:
# Overlay the daily values of two Florida counties (columns 322 and 323) as
# marker traces on one figure.
dated_rows = florida_cases_with_counties.iloc[3:]  # skip the three descriptive rows
compared_county_ids = (322, 323)

fig = go.Figure()
for county_id in compared_county_ids:
    fig.add_trace(
        go.Scatter(
            x=dated_rows.index,
            y=dated_rows[county_id],
            # The first entry of each column is the county name.
            name=florida_cases_with_counties[county_id].iloc[0],
            mode='markers',
        ))

# Labels describe what is plotted: per-date case counts for counties.
fig.update_layout(
    title="Compare Daily Covid-19 Cases Between Florida Counties",
    xaxis_title="Date",
    yaxis_title="Number of Cases",
    legend_title="Counties",
)
fig.show()


In [None]:
# Load Plotly Express's bundled "tips" sample table and display it.
# NOTE(review): `df` is reassigned in the next code cell, so this looks like
# leftover experimentation rather than part of the analysis. Confirm before
# removing.
df = px.data.tips()
df

In [None]:
# Regress the first county column's daily values on the day number.
df = florida_cases_with_counties.iloc[3:]  # skip the three descriptive rows

# Predictor: row position 0, 1, 2, ... as a 2-D column, the shape
# scikit-learn expects. The row labels themselves are date strings and
# cannot be fitted.
X = np.arange(len(df)).reshape(-1, 1)
# Target: the county's values cast to float. The transposed table mixes text
# and numbers per column, so the stored values are not guaranteed numeric.
Y = df.iloc[:, 0].astype(float).values.reshape(-1, 1)

model = LinearRegression()
model.fit(X, Y)

# Evaluate the fitted line on an evenly spaced grid so it can be drawn.
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

# The plotting calls take 1-D sequences, so flatten the column vectors.
fig = px.scatter(
    x=X.ravel(),
    y=Y.ravel(),
    opacity=0.65,
    labels={"x": "Days Since First Date", "y": "Daily New Cases"},
)
fig.add_traces(go.Scatter(x=x_range, y=y_range.ravel(), name='Regression Fit'))
fig.show()

In [None]:
# Display the table with rows and columns swapped (not stored anywhere).
florida_cases_with_counties.T

In [None]:
# Relabel each column with the value found in the table's first row, then
# skip the first three rows.
df = florida_cases_with_counties
new_columns = df.iloc[0]
Y = df.rename(columns=new_columns).iloc[3:]
Y

In [None]:
# Select every column of Y by label and display the result.
Y[Y.columns]

In [None]:
# Compare two counties, each drawn as a scatter plus its OLS trendline.
day_numbers = list(range(Y.index.size))  # 0, 1, 2, ... one per row of Y
palette = plotly.colors.DEFAULT_PLOTLY_COLORS

# x and y are passed as plain sequences, so no data frame argument is needed.
# Values are cast to float because Y comes from a table that mixes text and
# numbers per column.
fig = px.scatter(
    x=day_numbers,
    y=Y['Alachua County'].astype(float),
    trendline="ols",
    color_discrete_sequence=[palette[0]],
)
fig2 = px.scatter(
    x=day_numbers,
    y=Y['Brevard County'].astype(float),
    trendline="ols",
    color_discrete_sequence=[palette[1]],
)

# Name the traces so the legend tells the two counties apart.
fig.update_traces(name='Alachua County', showlegend=True)
fig2.update_traces(name='Brevard County', showlegend=True)

# Copy every trace of the second figure (markers and trendline), not only
# the first one.
fig.add_traces(fig2.data)
fig.update_layout(
    title="Daily New Covid-19 Cases: Alachua County vs. Brevard County",
    xaxis_title="Days Since First Date",
    yaxis_title="Daily New Cases",
    legend_title="Counties",
)
fig.show()

In [None]:
# Inspect the first trace of the current figure.
fig.data[0]

In [None]:
import pandas as pd
import plotly.express as px

# Combine traces from two Plotly Express figures: a scatter of the bundled
# iris sample and a bar chart of a small hand-made table.
iris = px.data.iris()
fig = px.scatter(iris, x="sepal_width", y="sepal_length", color="species")

df = pd.DataFrame({'x': list(range(1, 5)), 'y': list(range(5, 9))})
fig2 = px.bar(df, x="x", y="y")

# Copy the bar trace onto the scatter figure and render it.
fig.add_trace(fig2.data[0])
fig.show()

In [None]:
# Inspect the first trace of the second figure.
fig2.data[0]

## Non-Linear regression models for predicting cases and deaths in US.