In [42]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned vaccination records; the file is resolved relative to the
# notebook's working directory.
csv_path = Path('cleaned_vax_data.csv')
vax_df = pd.read_csv(csv_path)
# Preview the first rows to sanity-check the columns that were read.
vax_df.head()

Unnamed: 0,location,date,record,date.1,location.1,2019_population,total_vaccinations,total_distributed,unused_doses,people_vaccinated,...,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
0,Alabama,2021-01-12,1,2021-01-12,Alabama,4903185,78134.0,377025.0,298891,70861.0,...,1.59,7270.0,1.45,7.69,,,,0.207,,
1,Alabama,2021-01-19,8,2021-01-19,Alabama,4903185,130795.0,444650.0,344083,114319.0,...,2.67,16346.0,2.33,9.07,8267.0,7523.0,1534.0,0.294,,
2,Alabama,2021-01-26,15,2021-01-26,Alabama,4903185,254959.0,569000.0,344075,223854.0,...,5.2,30933.0,4.57,11.6,33395.0,18294.0,3731.0,0.481,,
3,Alabama,2021-02-02,22,2021-02-02,Alabama,4903185,371882.0,680550.0,376282,312711.0,...,7.58,58825.0,6.38,13.88,41521.0,18178.0,3707.0,0.546,,
4,Alabama,2021-02-09,29,2021-02-09,Alabama,4903185,490969.0,823600.0,391247,395196.0,...,10.01,95033.0,8.06,16.8,24576.0,18526.0,3778.0,0.596,,


In [13]:
# Manipulate data to create a yes/no question regarding wasted dosages:
# share_doses_used becomes 1 when it is above 0.6 and 0 otherwise.
#
# This is a single vectorized comparison instead of a Python loop over every
# index label. A missing share compares False against the threshold and so
# becomes 0, exactly as the row-by-row version did.
# NOTE(review): that labels rows with an unknown share as class 0 — confirm
# this is intended rather than dropping those rows.
vax_df['share_doses_used'] = (vax_df['share_doses_used'] > 0.6).astype(float)

# Any remaining missing values in the other columns are replaced with 0.
vax_df = vax_df.fillna(0)
vax_df

Unnamed: 0,location,date,record,date.1,location.1,2019_population,total_vaccinations,total_distributed,unused_doses,people_vaccinated,...,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
0,Alabama,2021-01-12,1,2021-01-12,Alabama,4903185,78134.0,377025.0,298891,70861.0,...,1.59,7270.0,1.45,7.69,0.0,0.0,0.0,0.0,0.0,0.00
1,Alabama,2021-01-19,8,2021-01-19,Alabama,4903185,130795.0,444650.0,344083,114319.0,...,2.67,16346.0,2.33,9.07,8267.0,7523.0,1534.0,0.0,0.0,0.00
2,Alabama,2021-01-26,15,2021-01-26,Alabama,4903185,254959.0,569000.0,344075,223854.0,...,5.20,30933.0,4.57,11.60,33395.0,18294.0,3731.0,0.0,0.0,0.00
3,Alabama,2021-02-02,22,2021-02-02,Alabama,4903185,371882.0,680550.0,376282,312711.0,...,7.58,58825.0,6.38,13.88,41521.0,18178.0,3707.0,0.0,0.0,0.00
4,Alabama,2021-02-09,29,2021-02-09,Alabama,4903185,490969.0,823600.0,391247,395196.0,...,10.01,95033.0,8.06,16.80,24576.0,18526.0,3778.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6886,Wyoming,2022-12-27,47438,2022-12-27,Wyoming,578759,843727.0,1240065.0,396338,351292.0,...,145.78,306174.0,60.70,214.26,0.0,1064.0,1838.0,1.0,213248.0,36.85
6887,Wyoming,2023-01-03,47445,2023-01-03,Wyoming,578759,844861.0,1241825.0,396964,351498.0,...,145.98,306301.0,60.73,214.57,0.0,181.0,313.0,1.0,214049.0,36.98
6888,Wyoming,2023-01-10,47452,2023-01-10,Wyoming,578759,846149.0,1244315.0,398166,351713.0,...,146.20,306435.0,60.77,215.00,0.0,184.0,318.0,1.0,214988.0,37.15
6889,Wyoming,2023-01-17,47459,2023-01-17,Wyoming,578759,847156.0,1246855.0,399699,351894.0,...,146.37,306545.0,60.80,215.44,0.0,144.0,249.0,1.0,215704.0,37.27


In [36]:
# One-hot encode a set of count columns into a separate frame.
# NOTE(review): vax_encoded is not referenced by any cell visible here, and
# one-hot encoding columns with many distinct numeric values yields a very wide
# frame — confirm it is still needed.
dummy_columns = [
    "2019_population",
    "total_vaccinations",
    "total_distributed",
    "people_vaccinated",
    "people_fully_vaccinated",
    "daily_vaccinations",
    "total_boosters",
]
vax_encoded = pd.get_dummies(vax_df, columns=dummy_columns)

# Create our features: everything in vax_df except identifiers, the raw dose
# totals and the target itself.
# NOTE(review): total_vaccinations_per_hundred and distributed_per_hundred stay
# in X; their ratio appears to reproduce the un-thresholded share_doses_used
# (e.g. 1.59 / 7.69 vs 0.207 in the first row), which would leak the target —
# confirm whether they should be dropped as well.
non_feature_columns = [
    "date.1",
    "location.1",
    "date",
    "location",
    "record",
    "unused_doses",
    "total_vaccinations",
    "total_distributed",
    "share_doses_used",
]
X = vax_df.drop(columns=non_feature_columns)

# Create our target
Y = vax_df["share_doses_used"]

In [37]:
# Summary statistics for every feature column in X.
X.describe()

Unnamed: 0,2019_population,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,total_boosters,total_boosters_per_hundred
count,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0
mean,10843170.0,6193611.0,46.806456,118.372981,5169392.0,55.639092,152.640937,38011.97,31093.65,2909.700624,2109559.0,19.310943
std,41030630.0,26765570.0,25.882171,69.813221,22679920.0,28.968167,91.257788,195102.3,153753.3,2915.252262,11946220.0,21.300379
min,17916.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1344212.0,442204.0,31.015,69.535,297353.5,39.755,88.85,0.0,1734.5,908.0,0.0,0.0
50%,3509529.0,1732353.0,54.35,129.4,1348285.0,63.06,159.34,4213.0,6183.0,1971.0,164824.0,12.22
75%,7278717.0,4195770.0,65.43,169.385,3613639.0,75.91,223.96,23392.5,18154.5,3979.5,1221933.0,35.79
max,328239500.0,268765900.0,90.12,294.93,229508400.0,117.11,396.59,4629928.0,3384387.0,27652.0,186640900.0,100.39


In [38]:
# Check the balance of our target values: number of rows in each class of Y
# (share_doses_used after it was reduced to 0/1).
Y.value_counts()

1.0    6224
0.0     667
Name: share_doses_used, dtype: int64

In [39]:
# Split features and target into train and test sets.
# - stratify=Y keeps the class proportions of Y the same in both splits.
# - random_state pins the shuffle so the split is reproducible.
# - test_size is left at the library default.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=1,
    stratify=Y,
)
# Rows x feature columns of the training split.
X_train.shape

(5168, 12)

In [40]:
# Fit a plain RandomForestClassifier on the training split.
# No resampling or class weighting is applied: the model sees the classes in
# the proportions present in y_train.
# NOTE(review): if imbalance handling was intended, that needs e.g. imblearn's
# BalancedRandomForestClassifier or class_weight="balanced" — confirm.
# RandomForestClassifier (and StandardScaler, currently unused) are imported in
# the notebook's top import cell.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
# fit() returns the fitted estimator itself, so rebinding keeps the same object.
rf_model = rf_model.fit(X_train, y_train)

In [44]:
# Predict the class of every held-out row.
predictions = rf_model.predict(X_test)

# Fraction of test rows whose predicted class equals the true class.
# NOTE(review): with heavily imbalanced classes plain accuracy can look high
# even when the minority class is poorly predicted.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.987811955890888

In [47]:
# Confusion matrix of true vs. predicted classes on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)

# Wrap the raw counts in a labelled frame for display.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,153,14
Actual 1,7,1549


In [48]:
# Show all evaluation results together: the labelled confusion matrix, the
# overall accuracy, and per-class precision / recall / f1 / support.
# classification_report is imported in the notebook's top import cell.
print("Confusion Matrix")
display(cm_df)  # rich table rendering instead of a plain-text dump
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,153,14
Actual 1,7,1549


Accuracy Score : 0.987811955890888
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.92      0.94       167
         1.0       0.99      1.00      0.99      1556

    accuracy                           0.99      1723
   macro avg       0.97      0.96      0.96      1723
weighted avg       0.99      0.99      0.99      1723



In [49]:
# Importance score of each feature as estimated by the fitted forest: one value
# per feature column, in the column order the model was fitted on.
importances = rf_model.feature_importances_
importances

array([0.02594747, 0.1914605 , 0.11802067, 0.10896054, 0.24105748,
       0.1001026 , 0.0813134 , 0.02076838, 0.02671635, 0.03501987,
       0.03939346, 0.01123929])

In [54]:
# Names of the feature columns the model saw during fit; use these to label the
# entries of the feature importance array.
rf_model.feature_names_in_

array(['2019_population', 'people_vaccinated',
       'people_fully_vaccinated_per_hundred',
       'total_vaccinations_per_hundred', 'people_fully_vaccinated',
       'people_vaccinated_per_hundred', 'distributed_per_hundred',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'daily_vaccinations_per_million', 'total_boosters',
       'total_boosters_per_hundred'], dtype=object)