In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
from datetime import timedelta
from sklearn import datasets, linear_model
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that can point
# at real problems (deprecations, implicit data conversions) — consider
# narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the state-level COVID CSV, relative to the working directory
covid_path = "data/state covid_data.csv"

# Read the whole file into a DataFrame; low_memory=False is passed through to
# pandas' CSV reader unchanged.
covid_data = pd.read_csv(
    covid_path,
    low_memory=False,
)

# Preview the first five rows
covid_data.head(5)

In [None]:
# Preview the last rows of the dataset; the data stops at 08/31/2020
covid_data.tail()

In [None]:
# Plot all states: one panel per state on a fixed 18 x 3 grid, filled column by
# column. Blue = new_case_count, red = case_count * 10, both against date.
N_ROWS, N_COLS = 18, 3

# Get list of states
states = covid_data['state_name'].unique()

fig, axs = plt.subplots(N_ROWS, N_COLS, figsize=(20, 60))
fig.tight_layout()
plt.subplots_adjust(hspace=0.5)

fig.autofmt_xdate()

# Only the first N_ROWS * N_COLS states fit on the grid (same cap as the three
# hand-written slices 0:18, 18:36 and 36:54 this loop replaces).
for idx, state in enumerate(states[:N_ROWS * N_COLS]):
    # idx 0-17 -> column 0, 18-35 -> column 1, 36-53 -> column 2
    col, row = divmod(idx, N_ROWS)

    # Filter once per state instead of once per plotted series
    state_rows = covid_data[covid_data.state_name == state]

    ax = axs[row, col]
    ax.plot(state_rows['date'], state_rows['new_case_count'], 'b')
    ax.plot(state_rows['date'], state_rows['case_count'] * 10, 'r')
    ax.set_title(state)

# Render exactly once, after every panel has been drawn. Previously this call
# sat inside the last loop, so the figure was shown after only the first state
# of the third column had been plotted.
plt.show()

In [None]:
# Bar chart of new_case_count per date over the whole dataset; seaborn
# aggregates the rows that share a date into a single bar.
plt.figure(figsize=(15, 5))
sns.barplot(
    data=covid_data,
    x="date",
    y="new_case_count",
)
plt.title("Distribution plot for daily cases")
# Rotate the date tick labels so they do not overlap
plt.xticks(rotation=90)

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
covid_data.shape

In [None]:
# Total number of cells in the dataset (rows x columns)
covid_data.size

In [None]:
# Number of missing (N/A) values in each column
covid_data.isna().sum()

In [None]:
# The dataset is long, so look only at the five most recent days.
# `covid_data` mixes rows from many states (see `state_name`), so
# `covid_data.tail()` would return five raw state-level rows rather than five
# days of US data. Sum over all states for each date first.
us_daily = covid_data.groupby("date", as_index=False, sort=False)[
    ["new_case_count", "case_count"]
].sum()
# Order chronologically using the parsed dates, while leaving the `date`
# column itself untouched so axis labels keep their original form.
# NOTE(review): assumes `date` is parseable by pd.to_datetime — confirm.
us_daily = us_daily.iloc[
    np.argsort(pd.to_datetime(us_daily["date"]).values, kind="stable")
]
us_last5days = us_daily.tail(5)

us_last5days.plot(x="date", y="new_case_count", figsize=(8, 8), color="red")
plt.xlabel("Date")
plt.ylabel("Recent 5 days cases")
# The series plotted is new_case_count, so the title says "New", not "Total"
plt.title("New cases for latest 5 days")
plt.show()

In [None]:
# Same five most recent days, this time plotting case_count.
# `covid_data` mixes rows from many states (see `state_name`), so
# `covid_data.tail()` would return five raw state-level rows rather than five
# days of US data. Sum over all states for each date first.
us_daily = covid_data.groupby("date", as_index=False, sort=False)[
    ["new_case_count", "case_count"]
].sum()
# Order chronologically using the parsed dates, while leaving the `date`
# column itself untouched so axis labels keep their original form.
# NOTE(review): assumes `date` is parseable by pd.to_datetime — confirm.
us_daily = us_daily.iloc[
    np.argsort(pd.to_datetime(us_daily["date"]).values, kind="stable")
]
us_last5days = us_daily.tail(5)

us_last5days.plot(x="date", y="case_count", figsize=(8, 8), color="red")
plt.xlabel("Date")
plt.ylabel("Recent 5 days cases")
plt.title("Total cases for latest 5 days")
plt.show()

In [None]:
# Categorical plot of new_case_count by date for the rows in `us_last5days`;
# the trailing semicolon suppresses the returned object's repr.
sns.catplot(
    data=us_last5days,
    x="date",
    y="new_case_count",
);

In [None]:
# First 5 days.
# `covid_data` mixes rows from many states (see `state_name`), so
# `covid_data.head(5)` would return five raw state-level rows rather than five
# days of US data. Sum over all states for each date first.
us_daily = covid_data.groupby("date", as_index=False, sort=False)[
    ["new_case_count", "case_count"]
].sum()
# Order chronologically using the parsed dates, while leaving the `date`
# column itself untouched so axis labels keep their original form.
# NOTE(review): assumes `date` is parseable by pd.to_datetime — confirm.
us_daily = us_daily.iloc[
    np.argsort(pd.to_datetime(us_daily["date"]).values, kind="stable")
]
us_first5days = us_daily.head(5)

# Plot first 5 days (daily new cases)
us_first5days.plot(x="date", y="new_case_count", figsize=(6, 5), color="orange")
plt.xlabel("Date")
# Labels corrected: this is the first five days, and the series is new cases
plt.ylabel("First 5 days cases")
plt.title("New cases for first 5 days")
plt.show()

In [None]:
# Plot case_count for the rows in `us_first5days` (built in the previous cell)
us_first5days.plot(x="date", y="case_count", figsize=(6, 5), color="orange")
plt.xlabel("Date")
# The y-label used to read "Recent 5 days cases", copied from the last-5-days
# plots; this figure shows the first five days.
plt.ylabel("First 5 days cases")
plt.title("Total cases for first 5 days")
plt.show()

# Conclusion: there were no cases at the beginning of February

In [None]:
# ML models can only work with numbers, so verify each column's data type
covid_data.dtypes

In [None]:
# Dimensions of the dataset again, as (rows, columns)
covid_data.shape

# CREATE MODEL

In [None]:
# Goal: if the model reaches a score of about 90% we will treat it as good
# enough for predicting new cases.
np.random.seed(45)

# Feature matrix: the single column `case_count`, as an (n_samples, 1) array
X = covid_data['case_count'].values[:, None]

# Target: `new_case_count`, likewise stored as an (n_samples, 1) column vector
y = covid_data['new_case_count'].values[:, None]


In [None]:
# Inspect the feature array (plain-text dump of X)
print(X)

In [None]:
# Inspect the target array (plain-text dump of y)
print(y)

In [None]:
#Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the samples for testing; both the global NumPy seed and the
# splitter's own random_state are fixed at 42.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Train, test, split

In [None]:
# Random forest with 40 trees. random_state is pinned so that re-running this
# cell on its own (not only a full Restart & Run All) rebuilds the same forest
# instead of drawing from whatever the global NumPy RNG state happens to be.
rg = RandomForestRegressor(n_estimators=40, random_state=42)

In [None]:
# Shapes of the training features and training target
X_train.shape, y_train.shape

In [None]:
# y was built as an (n_samples, 1) column vector, but the forest expects a 1-D
# target. Flatten it explicitly rather than relying on an implicit conversion
# whose warning is hidden by the notebook-wide warnings filter. np.ravel is a
# no-op for a target that is already 1-D.
rg.fit(X_train, np.ravel(y_train))

In [None]:
# Score of the random forest on the held-out test split
# (for scikit-learn regressors `score` is R^2, not a classification accuracy)
rg.score(X_test,y_test)

In [None]:
# Score of the random forest on the data it was trained on, for comparison
# with the test-split score
rg.score(X_train,y_train)

# We can see that our model reaches a score of about 91% (for a regressor this is the R² score rather than a classification accuracy). I am not yet sure what the next steps should be.

In [None]:
# Linear-regression baseline on the same split (approach taken from a
# Towards Data Science article)
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() hands back the estimator itself, so `model` and `lm` are one object
model = lm
predictions = model.predict(X_test)


In [None]:
# Inspect the linear model's predictions for X_test
predictions

In [None]:
# Scatter of the linear model's predictions against the true test targets
plt.scatter(x=y_test, y=predictions)
plt.xlabel(xlabel='True Values')
plt.ylabel(ylabel='predictions')

In [None]:
# Report the linear model's score on the held-out test split
test_score = model.score(X_test, y_test)
print("score: ", test_score)