In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
covid_df = pd.read_csv("Resources/nc_covid_data_updated.csv")
covid_df.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,current_status,symptom_status,hosp_yn,death_yn
0,2021-09-01,NC,37,CUMBERLAND,37051.0,0 - 17 years,Female,Black,Non-Hispanic/Latino,0.0,Laboratory-confirmed case,Asymptomatic,No,No
1,2021-08-01,NC,37,MECKLENBURG,37119.0,18 to 49 years,Male,White,Hispanic/Latino,0.0,Laboratory-confirmed case,Symptomatic,No,No
2,2021-09-01,NC,37,CUMBERLAND,37051.0,0 - 17 years,Female,Black,Non-Hispanic/Latino,1.0,Laboratory-confirmed case,Symptomatic,No,No
3,2021-09-01,NC,37,CUMBERLAND,37051.0,0 - 17 years,Female,Black,Non-Hispanic/Latino,0.0,Laboratory-confirmed case,Symptomatic,No,No
4,2021-08-01,NC,37,MECKLENBURG,37119.0,18 to 49 years,Male,White,Hispanic/Latino,0.0,Laboratory-confirmed case,Asymptomatic,No,No


In [7]:
import datetime as dt
covid_df['case_month'] = pd.to_datetime(covid_df.case_month)

In [8]:
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7378 entries, 0 to 7377
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   case_month                       7378 non-null   datetime64[ns]
 1   res_state                        7378 non-null   object        
 2   state_fips_code                  7378 non-null   int64         
 3   res_county                       7378 non-null   object        
 4   county_fips_code                 7378 non-null   float64       
 5   age_group                        7378 non-null   object        
 6   sex                              7378 non-null   object        
 7   race                             7378 non-null   object        
 8   ethnicity                        7378 non-null   object        
 9   case_positive_specimen_interval  7378 non-null   float64       
 10  current_status                   7378 non-null   object     

In [9]:
# Assign the data to X and y
X = covid_df[['case_month','res_county','age_group','sex','race',
              'ethnicity','case_positive_specimen_interval','current_status',
             'symptom_status']]
y = covid_df['hosp_yn']

print("Shape: ", X.shape, y.shape)

Shape:  (7378, 9) (7378,)


In [10]:
# Split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

Logistic Regression Model

In [11]:
# Classify the machine learning model
classifier = LogisticRegression()
classifier

LogisticRegression()

In [12]:
# Fit the model
classifier.fit(X_train, y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
# Create confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

In [None]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
print(classification_report(y_true, y_pred))

DecisionTree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit the model
clf = DecisionTreeClassifier().fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
print(classification_report(y_test, y_pred))

RandomForest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
features = clf.feature_importances_
print(features)
plt.bar(x = range(len(features)), height=features)
plt.show()

In [None]:
feature_importances = clf.feature_importances_

In [None]:
features = sorted(zip(X.columns, clf.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf)
sel.fit(X_train_scaled, y_train)
sel.get_support()

In [None]:
X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [None]:
clf = LogisticRegression()
clf.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

ExtraTrees Model

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')