In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
vax_df = pd.read_csv('cleaned_vax_data.csv')
vax_df.head()

Unnamed: 0,location,date,record,date.1,location.1,2019_population,total_vaccinations,total_distributed,unused_doses,people_vaccinated,...,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
0,Alabama,2021-01-12,1,2021-01-12,Alabama,4903185,78134.0,377025.0,298891,70861.0,...,1.59,7270.0,1.45,7.69,,,,0.207,,
1,Alabama,2021-01-19,8,2021-01-19,Alabama,4903185,130795.0,444650.0,344083,114319.0,...,2.67,16346.0,2.33,9.07,8267.0,7523.0,1534.0,0.294,,
2,Alabama,2021-01-26,15,2021-01-26,Alabama,4903185,254959.0,569000.0,344075,223854.0,...,5.2,30933.0,4.57,11.6,33395.0,18294.0,3731.0,0.481,,
3,Alabama,2021-02-02,22,2021-02-02,Alabama,4903185,371882.0,680550.0,376282,312711.0,...,7.58,58825.0,6.38,13.88,41521.0,18178.0,3707.0,0.546,,
4,Alabama,2021-02-09,29,2021-02-09,Alabama,4903185,490969.0,823600.0,391247,395196.0,...,10.01,95033.0,8.06,16.8,24576.0,18526.0,3778.0,0.596,,


In [6]:
# Drop the null columns where all values are null
vax_df = vax_df.dropna(axis='columns', how='all')

# Drop the null rows
vax_df = vax_df.dropna()

In [7]:
vax_df

Unnamed: 0,location,date,record,date.1,location.1,2019_population,total_vaccinations,total_distributed,unused_doses,people_vaccinated,...,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
38,Alabama,2021-10-05,267,2021-10-05,Alabama,4903185,4584378.0,6742810.0,2193925,2597448.0,...,93.50,2111925.0,52.97,137.52,20875.0,12950.0,2641.0,0.680,15821.0,0.32
39,Alabama,2021-10-12,274,2021-10-12,Alabama,4903185,4657171.0,6864150.0,2228121,2616672.0,...,94.98,2139547.0,53.37,139.99,20885.0,12874.0,2626.0,0.680,42621.0,0.87
40,Alabama,2021-10-19,281,2021-10-19,Alabama,4903185,4714057.0,6946650.0,2257180,2632750.0,...,96.14,2159523.0,53.69,141.68,15276.0,9374.0,1912.0,0.679,64151.0,1.31
41,Alabama,2021-10-26,288,2021-10-26,Alabama,4903185,4779470.0,7069650.0,2302826,2653566.0,...,97.48,2179242.0,54.12,144.18,17503.0,10252.0,2091.0,0.676,90226.0,1.84
42,Alabama,2021-11-02,295,2021-11-02,Alabama,4903185,4889817.0,7302580.0,2412763,2677664.0,...,99.73,2198533.0,54.61,148.94,25743.0,15857.0,3234.0,0.675,158669.0,3.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6855,Wyoming,2022-05-24,47221,2022-05-24,Wyoming,578759,751513.0,996185.0,245308,337036.0,...,129.85,295587.0,58.23,172.12,901.0,387.0,669.0,0.756,145579.0,25.15
6856,Wyoming,2022-05-31,47228,2022-05-31,Wyoming,578759,752932.0,1003685.0,250753,337275.0,...,130.09,295802.0,58.28,173.42,520.0,406.0,702.0,0.755,146563.0,25.32
6857,Wyoming,2022-06-07,47235,2022-06-07,Wyoming,578759,756636.0,1009085.0,252449,337871.0,...,130.73,296239.0,58.38,174.35,1427.0,529.0,914.0,0.752,149260.0,25.79
6858,Wyoming,2022-06-14,47242,2022-06-14,Wyoming,578759,758784.0,1015885.0,257101,338314.0,...,131.11,296557.0,58.46,175.53,596.0,362.0,625.0,0.750,150679.0,26.03


In [8]:
# Manipulate data to create a yes/no question regarding wasted dosages

# Share doses used is a column that lists percentage of doses that have been used against the total distributed. 
# We plan to classify any record where share dosages used are under a certain percentage as "high percentage wasted" 
# and under a certain percentage as "low percentage wasted" and then train the model to predict which one for that record
# based off of the data

In [9]:
vax_encoded = pd.get_dummies(vax_df, columns=["2019_population", "people_vaccinated", "people_fully_vaccinated", "daily_vaccinations", "total_boosters"])
# Create our features
X = vax_df.drop(columns = ["total_vaccinations", "total_distributed", "share doses_used"])
# Create our target
Y = vax.df["wasted_doses"]

In [10]:
# Check the balance of our target values
y.value_counts()

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

NameError: name 'X' is not defined

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Resample the training data with the BalancedRandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [13]:
# Calculated the balanced accuracy score
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [15]:
# Print the imbalanced classification report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [16]:
# List the features sorted in descending order by feature importance