In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve
from sklearn.ensemble import RandomForestClassifier

In [86]:
data = pd.read_csv('./vaccination-data.csv')
data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,AFG,Asia,Afghanistan,2/24/2020,5.0,5.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
1,AFG,Asia,Afghanistan,2/25/2020,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
2,AFG,Asia,Afghanistan,2/26/2020,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
3,AFG,Asia,Afghanistan,2/27/2020,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
4,AFG,Asia,Afghanistan,2/28/2020,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,


In [87]:
data.dtypes

iso_code                       object
continent                      object
location                       object
date                           object
total_cases                   float64
                               ...   
handwashing_facilities        float64
hospital_beds_per_thousand    float64
life_expectancy               float64
human_development_index       float64
excess_mortality              float64
Length: 62, dtype: object

In [88]:
data['new_cases_per_million'].mean()

82.16184037352743

In [89]:
data['new_cases_per_million'].median()

10.3585

In [90]:
data['high_cases'] = data['new_cases_per_million'] > 85

In [91]:
data['high_cases'].isna().sum()

0

In [108]:
data.dropna(subset=['new_deaths_per_million', 'iso_code', 'new_cases_per_million', 'new_vaccinations', 'total_vaccinations', 'population_density', 'median_age', 'extreme_poverty'], inplace=True)

In [109]:
data.isnull().sum()['new_deaths_per_million']

0

In [110]:
label_encoder = LabelEncoder()
data['iso_code'] = label_encoder.fit_transform(data['iso_code'])
data['new_deaths_per_million'] = label_encoder.fit_transform(data['new_deaths_per_million'])

In [111]:
X = data[['iso_code', 'new_vaccinations', 'total_vaccinations', 'population_density', 'median_age', 'extreme_poverty']]
y = data['high_cases']

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [113]:
# Random Forest Parameters
# See the excel for previous results
max_depth = 2
random_state = 101  
n_estimators = 10    # number of trees
min_samples_split = 10 # minimum samples to split a node
min_samples_leaf = 3  # minimum samples to be a leaf

In [114]:
random_forest = RandomForestClassifier(max_depth=max_depth, random_state=random_state, n_estimators=n_estimators, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf).fit(X_train, y_train)

In [115]:
y_predicted = random_forest.predict(X_test)

In [116]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

       False       0.79      0.44      0.56      2135
        True       0.63      0.90      0.74      2332

    accuracy                           0.68      4467
   macro avg       0.71      0.67      0.65      4467
weighted avg       0.71      0.68      0.66      4467



In [117]:
confusion_matrix(y_test, y_predicted)

array([[ 930, 1205],
       [ 243, 2089]], dtype=int64)