In [None]:
%matplotlib inline
import time
import re

#data import libraries
import pandas as pd

#path libraries
from pathlib import Path
import os.path as osp

#math libraries
import math
import numpy as np
import scipy as sp
import scipy.stats as stats
from scipy.stats import norm

#plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


#sklearn libraries for data cleaning
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.preprocessing import StandardScaler #normalize your dataset
from sklearn.model_selection import train_test_split #split data to train and test data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler #normalize your dataset
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#sklearn library for linear regression
from sklearn.linear_model import LinearRegression

#sklearn libraries for regularized regression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV

#sklearn library for logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import BayesianRidge

#sklearn library for knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

#sklearn libraries for decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.tree import _tree
from sklearn.tree import plot_tree

#sklearn libraries for data cleaning and cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler #normalize your dataset

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


#sklearn library for naive bayes
from sklearn.naive_bayes import GaussianNB

#sklearn library for support vector machine
from sklearn.svm import SVC

#sklearn library for neural networks
from sklearn.neural_network import MLPClassifier

#Random Forest
from sklearn.ensemble import RandomForestClassifier

#LGBM
from lightgbm import LGBMClassifier

#optuna
import optuna

#sklearn library for scores and errors
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import cohen_kappa_score, log_loss
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score


#feature selection
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFECV, RFE

#imbalanced smotes
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler

#warnings 
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import category_encoders as ce

In [None]:
vaccine_features = pd.read_csv("training_set_features.csv")

In [None]:
vaccine_features.head()

In [None]:
vaccine_features.describe()

In [None]:
vaccine_features.dtypes

In [None]:
vaccine_labels = pd.read_csv("training_set_labels.csv")

In [None]:
vaccine_labels.head()

In [None]:
vaccine_labels.describe()

In [None]:
vaccine_labels.dtypes

In [None]:
vaccine = pd.merge(vaccine_features, vaccine_labels, on="respondent_id")

In [None]:
vaccine.head()

In [None]:
vaccine.describe()

In [None]:
vaccine.dtypes

In [None]:
def confusion_matrix_plotting(cm, title):
    """Draw a colour-coded 2x2 confusion-matrix heatmap with a caption per cell.

    Parameters
    ----------
    cm : 2x2 array-like
        Values printed inside the cells. Captions are attached in row-major
        order as TP, FN, FP, TN, so the matrix is expected to be laid out as
        [[TP, FN], [FP, TN]].
    title : str
        Title placed above the heatmap.

    Side effects: switches the global matplotlib style to 'dark_background'
    and shows the figure.
    """
    plt.style.use('dark_background')
    fig, ax = plt.subplots(figsize=(6, 4))

    # np.eye(2) only drives the colouring (diagonal vs. off-diagonal); the
    # numbers shown in the cells come from `cm` through `annot`.
    class_labels = ['Vaccinated', 'Not Vaccinated']
    two_tone = sns.color_palette(['tomato', 'palegreen'])
    sns.heatmap(
        np.eye(2),
        annot=cm,
        fmt='g',
        annot_kws={'size': 25},
        cmap=two_tone,
        cbar=False,
        yticklabels=class_labels,
        xticklabels=class_labels,
        ax=ax,
    )

    # Put the "Predicted" axis on top of the matrix.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    ax.tick_params(labelsize=14, length=0)

    ax.set_title(title, size=18, pad=10)
    ax.set_xlabel('Predicted', size=14)
    ax.set_ylabel('Actual', size=14)

    # Add a caption under each count, reusing the position and colour of the
    # annotation text that seaborn created for that cell.
    captions = ['(True Positive)', '(False Negative)', '(False Positive)', '(True Negative)']
    cell_annotations = list(ax.texts)  # snapshot before new texts are appended
    for cell_text, caption in zip(cell_annotations, captions):
        ax.text(*cell_text.get_position(), '\n' + caption, color=cell_text.get_color(),
                ha='center', va='top', size=12)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_roc_curve(test, prediction, estimator_name='example estimator'):
    """Plot the ROC curve of one set of scores on a 7x7 inch figure.

    Parameters
    ----------
    test : array-like
        True binary labels.
    prediction : array-like
        Scores or probabilities for the positive class.
    estimator_name : str, optional
        Label shown in the legend next to the AUC. The default keeps the label
        this helper has always used.
    """
    fpr, tpr, _ = roc_curve(test, prediction)
    # metrics.auc is used (not the bare name `auc`) because other cells rebind
    # `auc` to a list of per-fold scores.
    roc_auc = metrics.auc(fpr, tpr)

    # RocCurveDisplay.plot() opens a new figure when no axes are given, so the
    # axes are created here and handed over; otherwise the requested size is
    # ignored and an empty figure is left behind.
    fig, ax = plt.subplots(figsize=(7, 7))
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                  estimator_name=estimator_name)
    roc_display.plot(ax=ax)
    plt.show()

In [None]:
# Helper function
def quick_evaluate_with_dt(X_train, X_test, y_train, y_test, name, balance_weights=False):
    """Fit a 100-tree random forest and summarise its test-set performance.

    Despite the ``_dt`` suffix, the model that is fitted is a
    RandomForestClassifier (the decision tree that used to be created first was
    immediately overwritten and has been removed).

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Binary (0/1) labels for fitting and for evaluation.
    name : str
        Value written to the "Method" column of the result.
    balance_weights : bool, optional
        When true the forest is fitted with ``class_weight='balanced'``.

    Returns
    -------
    pandas.DataFrame
        One row with the confusion-matrix counts and the accuracy, recall,
        precision, F1 and ROC AUC scores. The row is also printed.
    """
    cw = 'balanced' if balance_weights else None

    clf = RandomForestClassifier(random_state=0, n_estimators=100, class_weight=cw)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # labels=[0, 1] keeps the matrix 2x2 even if one class is never seen in
    # y_test / y_pred, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    accuracy       = accuracy_score(y_test, y_pred)
    f1             = f1_score(y_test, y_pred)
    # Recall is tp / (tp + fn); the previous tn / (tn + fp) was specificity.
    recall         = recall_score(y_test, y_pred)
    precision      = precision_score(y_test, y_pred)
    # AUC is a ranking metric, so it is computed from the positive-class
    # probability rather than from the thresholded 0/1 predictions.
    roc_auc        = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    df = pd.DataFrame({"Method"    : [name],
                       "Neg"       : [tn + fn],   # rows predicted negative
                       "True Neg"  : [tn],
                       "False Neg" : [fn],
                       "Pos"       : [tp + fp],   # rows predicted positive
                       "TP"        : [tp],
                       "FP"        : [fp],
                       "Accuracy"  : [accuracy],
                       "Recall"    : [recall],
                       "Precision" : [precision],
                       "F1"        : [f1],
                       "AUC"       : [roc_auc],
                      })

    print(df)
    return df

In [None]:
# Seasonal-vaccine task: features are every column except the two targets and
# the respondent id; 20% of the rows are held out, stratified on the target.
X_seasonal = vaccine.drop(columns=['h1n1_vaccine', 'seasonal_vaccine', 'respondent_id'])
y_seasonal = vaccine['seasonal_vaccine']
(X_seasonal_train,
 X_seasonal_test,
 y_seasonal_train,
 y_seasonal_test) = train_test_split(
    X_seasonal,
    y_seasonal,
    test_size=0.2,
    stratify=y_seasonal,
    random_state=0,
)

In [None]:
# H1N1-vaccine task: same feature columns as the seasonal task, but the split
# is stratified on the H1N1 target, so the train/test rows differ from the
# seasonal split.
X_h1n1 = vaccine.drop(columns=['h1n1_vaccine', 'seasonal_vaccine', 'respondent_id'])
y_h1n1 = vaccine['h1n1_vaccine']
(X_h1n1_train,
 X_h1n1_test,
 y_h1n1_train,
 y_h1n1_test) = train_test_split(
    X_h1n1,
    y_h1n1,
    test_size=0.2,
    stratify=y_h1n1,
    random_state=0,
)

In [None]:
# Columns handled as numeric (mean imputation)
numeric_features = ['h1n1_concern', 'h1n1_knowledge', 'behavioral_face_mask',
                    'behavioral_wash_hands', 'behavioral_large_gatherings',
                    'behavioral_touch_face',
                    'doctor_recc_h1n1', 'chronic_med_condition',
                    'child_under_6_months', 'health_worker',
                    'health_insurance', 'opinion_h1n1_vacc_effective',
                    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'doctor_recc_seasonal',
                    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                    'household_adults',
                    'household_children', 'behavioral_outside_home', 'behavioral_antiviral_meds',
                    'behavioral_avoidance'
                    ]

# Columns handled as categorical (mode imputation + one-hot encoding)
categorical_features = ['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation',
                        'marital_status', 'race', 'sex', 'rent_or_own', 'age_group',
                        'employment_status', 'education', 'income_poverty']


# Numeric columns: fill missing values with the column mean.
# No scaling step is applied.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes a category that was not
# seen during fit encode as all zeros instead of raising at transform time.
# (fill_value was dropped: it is only used by strategy='constant'.)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# Apply both pipelines in one transformer; sparse_threshold=0 forces a dense
# array so it can be wrapped in a DataFrame below.
preprocessor4 = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            remainder = 'passthrough',
            sparse_threshold=0)

a = X_seasonal_train

# Fit the preprocessing on the seasonal training split and transform it
processed_data = preprocessor4.fit_transform(a)

# Names of the one-hot columns produced for the categorical features
feature_names = preprocessor4.named_transformers_['cat']\
                    .named_steps['onehot'].get_feature_names_out(categorical_features)

# Output column order is numeric columns first, then the one-hot columns.
# NOTE: any column not listed above would be passed through and is not named
# here, in which case the DataFrame construction below would fail.
all_feature_names = numeric_features + list(feature_names)

# Wrap the processed array in a DataFrame (this resets the row index to 0..n-1)
X_seasonal_train = pd.DataFrame(processed_data, columns=all_feature_names)

In [None]:
# Add two interaction features, strip every character outside [A-Za-z0-9_]
# from the column names, then keep only the first column for each name.
# NOTE(review): distinct one-hot categories can collapse to the same sanitised
# name, in which case the later column is dropped — confirm this is intended.
X_seasonal_train = (
    X_seasonal_train
    .assign(
        seasonal_vaccine_effectiveness=lambda df: (
            df["opinion_seas_vacc_effective"] * df["opinion_seas_sick_from_vacc"]
        ),
        seasonal_household=lambda df: df["household_children"] * df["household_adults"],
    )
    .rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
    .pipe(lambda df: df.loc[:, ~df.columns.duplicated()].copy())
)

In [None]:
X_seasonal_train.head()

In [None]:
# class balance of the seasonal-vaccine training labels
y_seasonal_train.value_counts()

In [None]:
# Columns handled as numeric (mean imputation)
numeric_features = ['h1n1_concern', 'h1n1_knowledge', 'behavioral_face_mask',
                    'behavioral_wash_hands', 'behavioral_large_gatherings',
                    'behavioral_touch_face',
                    'doctor_recc_h1n1', 'chronic_med_condition',
                    'child_under_6_months', 'health_worker',
                    'health_insurance', 'opinion_h1n1_vacc_effective',
                    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'doctor_recc_seasonal',
                    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                    'household_adults',
                    'household_children', 'behavioral_outside_home', 'behavioral_antiviral_meds',
                    'behavioral_avoidance'
                    ]

# Columns handled as categorical (mode imputation + one-hot encoding)
categorical_features = ['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation',
                        'marital_status', 'race', 'sex', 'rent_or_own', 'age_group',
                        'employment_status', 'education', 'income_poverty']


# Numeric columns: fill missing values with the column mean.
# No scaling step is applied.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes a category that was not
# seen during fit encode as all zeros instead of raising when this fitted
# preprocessor is later applied to held-out rows.
# (fill_value was dropped: it is only used by strategy='constant'.)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# Apply both pipelines in one transformer; sparse_threshold=0 forces a dense
# array so it can be wrapped in a DataFrame below.
# NOTE: this rebinds preprocessor4, replacing any earlier fit under that name.
preprocessor4 = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            remainder = 'passthrough',
            sparse_threshold=0)

a = X_h1n1_train

# Fit the preprocessing on the H1N1 training split and transform it
processed_data = preprocessor4.fit_transform(a)

# Names of the one-hot columns produced for the categorical features
feature_names = preprocessor4.named_transformers_['cat']\
                    .named_steps['onehot'].get_feature_names_out(categorical_features)

# Output column order is numeric columns first, then the one-hot columns.
# NOTE: any column not listed above would be passed through and is not named
# here, in which case the DataFrame construction below would fail.
all_feature_names = numeric_features + list(feature_names)

# Wrap the processed array in a DataFrame (this resets the row index to 0..n-1)
X_h1n1_train = pd.DataFrame(processed_data, columns=all_feature_names)

In [None]:
# Three pairwise-product interaction features for the H1N1 model.
X_h1n1_train = X_h1n1_train.assign(
    h1n1_flu=lambda df: df["h1n1_concern"] * df["h1n1_knowledge"],
    h1n1_vaccine_effectiveness=lambda df: (
        df["opinion_h1n1_vacc_effective"] * df["opinion_h1n1_sick_from_vacc"]
    ),
    h1n1_chronic=lambda df: df["chronic_med_condition"] * df["doctor_recc_h1n1"],
)

In [None]:
# Strip every character outside [A-Za-z0-9_] from the column names, then keep
# only the first column for each resulting name.
X_h1n1_train = (
    X_h1n1_train
    .rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
    .pipe(lambda df: df.loc[:, ~df.columns.duplicated()].copy())
)

In [None]:
X_h1n1_train.head()

In [None]:
y_h1n1_train.value_counts()

In [None]:
# Random Forest hyperparameter tuning with Optuna

In [None]:
def objective(trial, F, t):
    """Optuna objective: mean 10-fold ROC AUC of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    F : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    t : array-like
        Binary target aligned row-for-row with ``F``. It is converted to a
        NumPy array so that the fold indices are always applied by position,
        whatever index labels a Series happens to carry.

    Returns
    -------
    float
        Average ROC AUC over the stratified folds.
    """
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        # number of trees in the forest
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=25),
        # minimum number of samples a node must hold before it may be split
        "min_samples_split": trial.suggest_int("min_samples_split", 10, 100, step=5),
        # maximum depth of each tree
        "max_depth": trial.suggest_int("max_depth", 3,15),
        # fraction of the features considered at each split
        "max_features": trial.suggest_float("max_features", 0.05, 1.0, step=0.01),
        # fraction of the rows drawn to build each tree
        "max_samples": trial.suggest_float("max_samples", 0.05, 1.0, step=0.01),
        # reweight classes by inverse frequency, or leave them as they are
        "class_weight":  trial.suggest_categorical("class_weight", ["balanced", None]),
    }

    n_splits = 10
    targets = np.asarray(t)  # positional indexing regardless of the Series index

    cv = StratifiedKFold(n_splits=n_splits)  # folds keep the class proportions
    cv_scores = np.empty(n_splits)           # one AUC per fold
    for idx, (train_idx, test_idx) in enumerate(cv.split(F, targets)):
        X_train_f, X_test_f = F.iloc[train_idx], F.iloc[test_idx]        # fold features
        y_train_f, y_test_f = targets[train_idx], targets[test_idx]      # fold labels

        model = RandomForestClassifier(random_state=42, n_jobs=-1, **param_grid, verbose=0)
        model.fit(X_train_f, y_train_f)

        # AUC is scored on the predicted probability of the positive class
        preds = model.predict_proba(X_test_f)[:,1]
        cv_scores[idx] = roc_auc_score(y_test_f, preds)

    return np.mean(cv_scores)

# this code is for one trial 

In [None]:
# Give features and labels the same 0..n-1 row labels before tuning.
X_seasonal_train = X_seasonal_train.reset_index(drop=True)
y_seasonal_train = y_seasonal_train.reset_index(drop=True)

# A seeded sampler plus sequential trials makes the search repeatable.
study_seasonal = optuna.create_study(
    direction='maximize',
    study_name="Random Forest Classifier",
    sampler=optuna.samplers.TPESampler(seed=42),
)
func = lambda trial: objective(trial, X_seasonal_train, y_seasonal_train)
optuna.logging.set_verbosity(optuna.logging.WARNING)
# n_jobs=1: every forest fitted inside the objective already uses all cores
# (n_jobs=-1), so running trials in parallel as well would oversubscribe the
# CPU and make the trial order, and hence the result, non-deterministic.
study_seasonal.optimize(func, n_trials=100, n_jobs=1, show_progress_bar=True)

In [None]:
print('Best hyperparameters:', study_seasonal.best_params)
print('Best Score:', study_seasonal.best_value)

In [None]:
# Give features and labels the same 0..n-1 row labels before tuning.
X_h1n1_train = X_h1n1_train.reset_index(drop=True)
y_h1n1_train = y_h1n1_train.reset_index(drop=True)

# A seeded sampler plus sequential trials makes the search repeatable.
study_h1n1 = optuna.create_study(
    direction='maximize',
    study_name="Random Forest Classifier",
    sampler=optuna.samplers.TPESampler(seed=42),
)
func = lambda trial: objective(trial, X_h1n1_train, y_h1n1_train)
optuna.logging.set_verbosity(optuna.logging.WARNING)
# n_jobs=1: every forest fitted inside the objective already uses all cores
# (n_jobs=-1), so running trials in parallel as well would oversubscribe the
# CPU and make the trial order, and hence the result, non-deterministic.
study_h1n1.optimize(func, n_trials=100, n_jobs=1, show_progress_bar=True)

In [None]:
print('Best hyperparameters:', study_h1n1.best_params)
print('Best Score:', study_h1n1.best_value)

In [None]:
clf_seasonal = RandomForestClassifier(**study_seasonal.best_params, n_jobs=-1, random_state=42, verbose=0)

In [None]:

# K-fold cross-validation of the tuned seasonal model on the training split.

# Define the number of folds (K)
k = 10

# Per-fold results (one entry appended per fold)
confusion_matrices = []   # flipped so the layout is [[TP, FN], [FP, TN]]
accuracy_scores = []      # classification_report text
truepositive = []
truenegative = []
falsepositive = []
falsenegative = []
specificity = []
sensitivity = []
LogLoss = []
auc = []                  # NOTE: rebinds the `auc` name imported from sklearn.metrics
test = []                 # true labels of each validation fold
pred_proba = []           # positive-class probabilities of each validation fold

# Create the K-fold cross-validation object
kf = KFold(n_splits=k)


X1 = X_seasonal_train.values
y1 = y_seasonal_train.values

# Perform K-fold cross-validation
for train_index, test_index in kf.split(X1):
    # Split the data into training and validation parts for this fold
    X_train_lr, X_test_lr = X1[train_index], X1[test_index]
    y_train_lr, y_test_lr = y1[train_index], y1[test_index]

    # Refit the model on this fold's training part
    clf_seasonal.fit(X_train_lr, y_train_lr)

    # Hard predictions and class probabilities for the validation part
    y_pred = clf_seasonal.predict(X_test_lr)
    y_pred_proba = clf_seasonal.predict_proba(X_test_lr)


    # Confusion matrix; labels=[0, 1] keeps it 2x2 even if a class is absent
    cm = confusion_matrix(y_test_lr, y_pred, labels=[0, 1])
    confusion_matrices.append(np.flip(cm))

    tn, fp, fn, tp = cm.ravel()
    truenegative.append(tn)
    falsepositive.append(fp)
    falsenegative.append(fn)
    truepositive.append(tp)


    # Compute Specificity and Sensitivity
    Sensitivity = np.round(tp / (tp+fn),4)
    sensitivity.append(Sensitivity)
    Specificity = np.round(tn / (tn+fp),4)
    specificity.append(Specificity)


    # Log loss must be scored on the predicted probabilities; feeding it the
    # hard 0/1 predictions only measures clipped misclassification.
    logloss = log_loss(y_test_lr, y_pred_proba, labels=[0, 1])
    LogLoss.append(logloss)

    # Classification report (text) for this fold
    accuracy = classification_report(y_test_lr, y_pred)
    accuracy_scores.append(accuracy)

    AUC = roc_auc_score(y_test_lr, y_pred_proba[:,1], average='macro')
    print(AUC)
    auc.append(AUC)

    pred_proba.append(y_pred_proba[:,1])
    test.append(y_test_lr)

# Show the confusion matrix, report, summary metrics and ROC curve of each fold
for fold in range(k):
    confusion_matrix_plotting(confusion_matrices[fold], ("Random Forest " + f"Fold {fold+1}"))
    print("\n")
    print("Classification Report:")
    print("\n")
    print(accuracy_scores[fold])
    print("\n")
    print("Sensitivity: " + str(sensitivity[fold]))
    print("Specificity: " + str(specificity[fold]))
    print("Log Loss: " + str(LogLoss[fold]))
    print("\n")
    plot_roc_curve(test[fold],pred_proba[fold])
    print()

In [None]:
def Average(lst):
    """Return the arithmetic mean of ``lst``: its sum divided by its length.

    Raises ZeroDivisionError when ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

In [None]:
test = np.array([[Average(truepositive), Average(falsenegative)], [Average(falsepositive), Average(truenegative)]]) 
confusion_matrix_plotting(test, ("Random Forest Seasonal Average"))

In [None]:
print(Average(auc))

In [None]:
clf_h1n1 = RandomForestClassifier(**study_h1n1.best_params, n_jobs=-1, random_state=42, verbose=0)

In [None]:

# K-fold cross-validation of the tuned H1N1 model on the training split.

# Define the number of folds (K)
k = 10

# Per-fold results (one entry appended per fold)
confusion_matrices = []   # flipped so the layout is [[TP, FN], [FP, TN]]
accuracy_scores = []      # classification_report text
truepositive = []
truenegative = []
falsepositive = []
falsenegative = []
specificity = []
sensitivity = []
LogLoss = []
auc = []                  # NOTE: rebinds the `auc` name imported from sklearn.metrics
test = []                 # true labels of each validation fold
pred_proba = []           # positive-class probabilities of each validation fold

# Create the K-fold cross-validation object
kf = KFold(n_splits=k)


X1 = X_h1n1_train.values
y1 = y_h1n1_train.values

# Perform K-fold cross-validation
for train_index, test_index in kf.split(X1):
    # Split the data into training and validation parts for this fold
    X_train_lr, X_test_lr = X1[train_index], X1[test_index]
    y_train_lr, y_test_lr = y1[train_index], y1[test_index]

    # Refit the model on this fold's training part
    clf_h1n1.fit(X_train_lr, y_train_lr)

    # Hard predictions and class probabilities for the validation part
    y_pred = clf_h1n1.predict(X_test_lr)
    y_pred_proba = clf_h1n1.predict_proba(X_test_lr)


    # Confusion matrix; labels=[0, 1] keeps it 2x2 even if a class is absent
    cm = confusion_matrix(y_test_lr, y_pred, labels=[0, 1])
    confusion_matrices.append(np.flip(cm))

    tn, fp, fn, tp = cm.ravel()
    truenegative.append(tn)
    falsepositive.append(fp)
    falsenegative.append(fn)
    truepositive.append(tp)


    # Compute Specificity and Sensitivity
    Sensitivity = np.round(tp / (tp+fn),4)
    sensitivity.append(Sensitivity)
    Specificity = np.round(tn / (tn+fp),4)
    specificity.append(Specificity)


    # Log loss must be scored on the predicted probabilities; feeding it the
    # hard 0/1 predictions only measures clipped misclassification.
    logloss = log_loss(y_test_lr, y_pred_proba, labels=[0, 1])
    LogLoss.append(logloss)

    # Classification report (text) for this fold
    accuracy = classification_report(y_test_lr, y_pred)
    accuracy_scores.append(accuracy)

    AUC = roc_auc_score(y_test_lr, y_pred_proba[:,1], average='macro')
    print(AUC)
    auc.append(AUC)

    pred_proba.append(y_pred_proba[:,1])
    test.append(y_test_lr)

# Show the confusion matrix, report, summary metrics and ROC curve of each fold
for fold in range(k):
    confusion_matrix_plotting(confusion_matrices[fold], ("Random Forest " + f"Fold {fold+1}"))
    print("\n")
    print("Classification Report:")
    print("\n")
    print(accuracy_scores[fold])
    print("\n")
    print("Sensitivity: " + str(sensitivity[fold]))
    print("Specificity: " + str(specificity[fold]))
    print("Log Loss: " + str(LogLoss[fold]))
    print("\n")
    plot_roc_curve(test[fold],pred_proba[fold])
    print()

In [None]:
test_h1n1 = np.array([[Average(truepositive), Average(falsenegative)], [Average(falsepositive), Average(truenegative)]]) 
confusion_matrix_plotting(test_h1n1, ("Random Forest H1N1 Average"))

In [None]:
print(Average(auc))

In [None]:
a = X_seasonal_test

# preprocessor4 was last fitted on the H1N1 training split, which is stratified
# on a different target and therefore contains rows of the seasonal test set.
# To keep the seasonal hold-out clean, an identical preprocessor is fitted on
# the raw seasonal training rows (all rows of X_seasonal that are not in the
# test split; the imputers and encoder do not depend on row order).
preprocessor_seasonal = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            remainder = 'passthrough',
            sparse_threshold=0)
preprocessor_seasonal.fit(X_seasonal.drop(index=a.index))

# Transform the seasonal test split with statistics learned from training rows only
processed_data = preprocessor_seasonal.transform(a)

# Names of the one-hot columns produced for the categorical features
feature_names = preprocessor_seasonal.named_transformers_['cat']\
                    .named_steps['onehot'].get_feature_names_out(categorical_features)

# Output column order is numeric columns first, then the one-hot columns
all_feature_names = numeric_features + list(feature_names)

# Wrap the processed array in a DataFrame (this resets the row index to 0..n-1)
X_seasonal_test = pd.DataFrame(processed_data, columns=all_feature_names)

# Same interaction features as were added to the seasonal training frame
X_seasonal_test["seasonal_vaccine_effectiveness"] = X_seasonal_test["opinion_seas_vacc_effective"]*X_seasonal_test["opinion_seas_sick_from_vacc"]
X_seasonal_test["seasonal_household"]=X_seasonal_test["household_children"]*X_seasonal_test["household_adults"]

# Same column-name sanitising and de-duplication as for the training frame
X_seasonal_test = X_seasonal_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X_seasonal_test = X_seasonal_test.loc[:,~X_seasonal_test.columns.duplicated()].copy()

In [None]:
clf_seasonal.fit(X_seasonal_train, y_seasonal_train)

In [None]:
importances = (clf_seasonal.feature_importances_)/sum(clf_seasonal.feature_importances_)
importances

In [None]:
indices = np.argsort(importances)[-10:]
plt.figure(figsize=(7,7))
plt.style.use('dark_background')
plt.barh(range(len(indices)), importances[indices], color="cyan", height=0.2, align='center')
plt.yticks(range(len(indices)), [X_seasonal_train.columns.values.tolist()[i] for i in indices], fontsize=14)
plt.xticks(fontsize=14)
plt.xlabel("Feature Importance", fontsize=18)
plt.ylabel("Features", fontsize=18)
plt.title("Feature Importance for Random Forest - Seasonal", fontsize=18)
plt.grid()
plt.show()

In [None]:
y_seasonal_pred = clf_seasonal.predict(X_seasonal_test)

In [None]:
print(X_seasonal_test)

# Positions (0-based, within the test split) where prediction and truth differ
seasonal_true_labels = y_seasonal_test.to_list()  # built once, not once per row
false_predictions = np.where(y_seasonal_pred != y_seasonal_test.to_numpy())[0]

# Print the indices of false predictions and their corresponding true and predicted labels
for idx in false_predictions:
    print(f"Index: {idx}, True label: {seasonal_true_labels[idx]}, Predicted label: {y_seasonal_pred[idx]}")

In [None]:
y_seasonal_test_new = y_seasonal_test.to_list()
data_seasonal = pd.concat([X_seasonal_test, pd.DataFrame(data={'Prediction': y_seasonal_pred, 'Actual': y_seasonal_test_new})], axis=1)
data_seasonal

In [None]:
data_seasonal.iloc[:,[-2,-1]].to_csv("seasonalrf.csv", sep=",")

In [None]:
# False Negative
data_seasonal.query('Prediction==0 & Actual==1 ')

In [None]:
# False Positive
data_seasonal.query('Prediction==1 & Actual==0 ')

In [None]:
# True Positive
data_seasonal.query('Prediction==1 & Actual==1 ')

In [None]:
# True Negative
data_seasonal.query('Prediction==0 & Actual==0 ')

In [None]:
cm = confusion_matrix(y_seasonal_test, y_seasonal_pred)
confusion_matrix_plotting(np.flip(cm), "Random Forest Seasonal Test Set")

In [None]:
# Stacked bars: percentage of correct vs. wrong predictions for the negative
# class ("No"), the positive class ("Yes") and all test rows ("Total").
tn, fp = cm[0][0], cm[0][1]
fn, tp = cm[1][0], cm[1][1]

negative = tn + fp   # actual negatives
positive = tp + fn   # actual positives
total = tn + fp + tp + fn

# Percent wrong per group
no_weights = [
    (fp/negative)*100,
    (fn/positive)*100,
    ((fn+fp)/total)*100,
]

# Percent correct per group
yes_weights = [
    (tn/negative)*100,
    (tp/positive)*100,
    ((tn+tp)/total)*100,
]

species = ("No", "Yes", "Total")
weight_counts = {"Correct": yes_weights, "Wrong": no_weights}
colors = ['palegreen', 'tomato']

fig, ax = plt.subplots(1,1,figsize=(8,8))
bottom = np.zeros(3)   # running top of the stack for each of the three bars

for (boolean, weight_count), bar_color in zip(weight_counts.items(), colors):
    p = ax.bar(species, weight_count, label=boolean, bottom=bottom, color=bar_color)

    # Write the percentage in the middle of every segment
    for i, rect in enumerate(p):
        height = rect.get_height()
        if height < 0:
            continue
        ax.text(rect.get_x() + rect.get_width() / 2,
                bottom[i] + height / 2,
                "{:.1f}%".format(height),
                ha='center',
                va='center',
                fontsize=12,
                weight="bold")

    bottom += weight_count

ax.set_title("Total Proportion of Correct Predictions per Class - Seasonal")
ax.legend(bbox_to_anchor=(1.02, 1))
ax.set_yticks(ticks=[0, 20, 40, 60, 80, 100])
plt.show()

In [None]:
sensitivity = np.round(tp / (tp+fn),4)
print("Sensitivity: " + str(sensitivity))

In [None]:
specificity = np.round(tn / (tn+fp),4)
print("Specificity: " + str(specificity))

In [None]:
print(classification_report(y_seasonal_test, y_seasonal_pred))

In [None]:
# ROC curve of the seasonal model on the held-out test split
y_seasonal_pred_proba = clf_seasonal.predict_proba(X_seasonal_test)
plt.style.use('dark_background')
fpr1, tpr1, threshold1 = roc_curve(y_seasonal_test, y_seasonal_pred_proba[:,1])
roc_auc = metrics.auc(fpr1, tpr1)
print(roc_auc)

# RocCurveDisplay.plot() opens its own figure when no axes are passed, which
# left the 7x7 figure empty; the axes are now created here and handed over.
# The display object is not called `display`, so IPython's display() stays usable.
fig, ax = plt.subplots(figsize=(7,7))
roc_display = RocCurveDisplay(fpr=fpr1, tpr=tpr1, roc_auc=np.round(roc_auc,4))
roc_display.plot(ax=ax, color="cyan")
plt.grid(False)
plt.xlim(0,1)
plt.ylim(0,1)
plt.xlabel("False Positive Rate (FPR)", fontsize=14)
plt.ylabel("True Positive Rate (TPR)", fontsize=14)
plt.title("ROC-AUC Curve for Random Forest", fontsize=18)
plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14)
plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14)
legend_properties = {'size': 16, 'weight':'bold'}
plt.legend(loc=4, labels=['Random Forest (AUC: ' + str(np.round(roc_auc,4)) + ")"], prop=legend_properties, frameon=False)
plt.show()

In [None]:
a = X_h1n1_test

# Apply the already-fitted preprocessing to the H1N1 test split (transform
# only, nothing is refitted here).
processed_data = preprocessor4.transform(a)

# Names of the one-hot columns produced for the categorical features
onehot_step = preprocessor4.named_transformers_['cat'].named_steps['onehot']
feature_names = onehot_step.get_feature_names_out(categorical_features)

# Output column order is numeric columns first, then the one-hot columns
all_feature_names = numeric_features + list(feature_names)

# Rebuild a DataFrame, add the same interaction features as the training
# frame, strip characters outside [A-Za-z0-9_] from the column names and keep
# only the first column for each resulting name.
X_h1n1_test = (
    pd.DataFrame(processed_data, columns=all_feature_names)
    .assign(
        h1n1_flu=lambda df: df["h1n1_concern"] * df["h1n1_knowledge"],
        h1n1_vaccine_effectiveness=lambda df: (
            df["opinion_h1n1_vacc_effective"] * df["opinion_h1n1_sick_from_vacc"]
        ),
        h1n1_chronic=lambda df: df["chronic_med_condition"] * df["doctor_recc_h1n1"],
    )
    .rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
    .pipe(lambda df: df.loc[:, ~df.columns.duplicated()].copy())
)

In [None]:
clf_h1n1.fit(X_h1n1_train, y_h1n1_train)

In [None]:
importances = (clf_h1n1.feature_importances_)/sum(clf_h1n1.feature_importances_)
importances

In [None]:
indices = np.argsort(importances)[-10:]
plt.figure(figsize=(7,7))
plt.style.use('dark_background')
plt.barh(range(len(indices)), importances[indices], color="cyan", height=0.2, align='center')
plt.yticks(range(len(indices)), [X_h1n1_train.columns.values.tolist()[i] for i in indices], fontsize=14)
plt.xticks(fontsize=14)
plt.xlabel("Feature Importance", fontsize=18)
plt.ylabel("Features", fontsize=18)
plt.title("Feature Importance for Random Forest - H1N1", fontsize=18)
plt.grid()
plt.show()

In [None]:
y_h1n1_pred = clf_h1n1.predict(X_h1n1_test)

In [None]:
print(X_h1n1_test)

# Positions (0-based, within the test split) where prediction and truth differ
h1n1_true_labels = y_h1n1_test.to_list()  # built once, not once per row
false_predictions = np.where(y_h1n1_pred != y_h1n1_test.to_numpy())[0]

# Print the indices of false predictions and their corresponding true and predicted labels
for idx in false_predictions:
    print(f"Index: {idx}, True label: {h1n1_true_labels[idx]}, Predicted label: {y_h1n1_pred[idx]}")

In [None]:
y_h1n1_test_new = y_h1n1_test.to_list()
data_h1n1 = pd.concat([X_h1n1_test, pd.DataFrame(data={'Prediction': y_h1n1_pred, 'Actual': y_h1n1_test_new})], axis=1)
data_h1n1

In [None]:
data_h1n1.iloc[:,[-2,-1]].to_csv("h1n1rf.csv", sep=",")

In [None]:
# False Negative
data_h1n1.query('Prediction==0 & Actual==1 ')

In [None]:
# False Positive
data_h1n1.query('Prediction==1 & Actual==0 ')

In [None]:
# True Positive
data_h1n1.query('Prediction==1 & Actual==1 ')

In [None]:
# True Negative
data_h1n1.query('Prediction==0 & Actual==0 ')

In [None]:
cm_h1n1 = confusion_matrix(y_h1n1_test, y_h1n1_pred)
confusion_matrix_plotting(np.flip(cm_h1n1), "Random Forest H1N1 Test Set")

In [None]:
# Stacked bars: percentage of correct vs. wrong predictions for the negative
# class ("No"), the positive class ("Yes") and all test rows ("Total").
tn, fp = cm_h1n1[0][0], cm_h1n1[0][1]
fn, tp = cm_h1n1[1][0], cm_h1n1[1][1]

negative = tn + fp   # actual negatives
positive = tp + fn   # actual positives
total = tn + fp + tp + fn

# Percent wrong per group
no_weights = [
    (fp/negative)*100,
    (fn/positive)*100,
    ((fn+fp)/total)*100,
]

# Percent correct per group
yes_weights = [
    (tn/negative)*100,
    (tp/positive)*100,
    ((tn+tp)/total)*100,
]

species = ("No", "Yes", "Total")
weight_counts = {"Correct": yes_weights, "Wrong": no_weights}
colors = ['palegreen', 'tomato']

fig, ax = plt.subplots(1,1,figsize=(8,8))
bottom = np.zeros(3)   # running top of the stack for each of the three bars

for (boolean, weight_count), bar_color in zip(weight_counts.items(), colors):
    p = ax.bar(species, weight_count, label=boolean, bottom=bottom, color=bar_color)

    # Write the percentage in the middle of every segment
    for i, rect in enumerate(p):
        height = rect.get_height()
        if height < 0:
            continue
        ax.text(rect.get_x() + rect.get_width() / 2,
                bottom[i] + height / 2,
                "{:.1f}%".format(height),
                ha='center',
                va='center',
                fontsize=12,
                weight="bold")

    bottom += weight_count

ax.set_title("Total Proportion of Correct Predictions per Class - H1N1")
ax.legend(bbox_to_anchor=(1.02, 1))
ax.set_yticks(ticks=[0, 20, 40, 60, 80, 100])
plt.show()

In [None]:
# Sensitivity: share of actual positives that were predicted positive,
# rounded to four decimals.
true_positive_rate = tp / (tp + fn)
sensitivity = np.round(true_positive_rate, 4)
print(f"Sensitivity: {sensitivity}")

In [None]:
# Specificity: share of actual negatives that were predicted negative,
# rounded to four decimals.
true_negative_rate = tn / (tn + fp)
specificity = np.round(true_negative_rate, 4)
print(f"Specificity: {specificity}")

In [None]:
# Text summary of the per-class metrics for the H1N1 test-split predictions.
print(classification_report(y_h1n1_test, y_h1n1_pred))

In [None]:
# ROC curve and AUC of the H1N1 model on the test split.

# Predicted class probabilities; column 1 (the second entry of clf_h1n1.classes_)
# is used as the score of the positive class.
y_h1n1_pred_proba = clf_h1n1.predict_proba(X_h1n1_test)
fpr1, tpr1, threshold1 = roc_curve(y_h1n1_test, y_h1n1_pred_proba[:,1])
roc_auc = metrics.auc(fpr1, tpr1)
print(roc_auc)

plt.style.use('dark_background')
# Create the axes explicitly and hand them to the display. Without `ax`,
# RocCurveDisplay.plot() opens a figure of its own, which left a separately
# created 7x7 figure empty and drew the curve at the default size instead.
fig, ax = plt.subplots(figsize=(7,7))
# Not named `display`, so IPython's built-in display() stays usable in later cells.
roc_display = RocCurveDisplay(fpr=fpr1, tpr=tpr1, roc_auc=np.round(roc_auc,4))
roc_display.plot(ax=ax, color="cyan")
plt.grid(False)
plt.xlim(0,1)
plt.ylim(0,1)
plt.xlabel("False Positive Rate (FPR)", fontsize=14)
plt.ylabel("True Positive Rate (TPR)", fontsize=14)
# The model plotted here is the random forest (see the legend below), not LGBM.
plt.title("ROC-AUC Curve for Random Forest", fontsize=18)
plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14)
plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14)
legend_properties = {'size': 16, 'weight':'bold'}
plt.legend(loc=4, labels=['Random Forest (AUC: ' + str(np.round(roc_auc,4)) + ")"], prop=legend_properties, frameon=False)
plt.show()

In [None]:
# Impute and one-hot encode the full seasonal feature set, then add the
# seasonal interaction features.
# NOTE: X_seasonal is overwritten with its processed form at the end of this cell,
# so the cell cannot be re-run without first re-creating the raw X_seasonal.

# list of numeric features
numeric_features = ['h1n1_concern', 'h1n1_knowledge',  'behavioral_face_mask',
                    'behavioral_wash_hands', 'behavioral_large_gatherings',
                    'behavioral_touch_face',
                    'doctor_recc_h1n1', 'chronic_med_condition',
                    'child_under_6_months', 'health_worker',
                    'health_insurance', 'opinion_h1n1_vacc_effective',
                    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc','doctor_recc_seasonal',
                    'opinion_seas_vacc_effective','opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                    'household_children', 'behavioral_outside_home', 'behavioral_antiviral_meds',
                    'behavioral_avoidance'
                    ]

# list of categorical features
categorical_features = ['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation',
                        'marital_status', 'race', 'sex', 'rent_or_own', 'age_group',
                        'employment_status', 'education', 'income_poverty']


# numeric columns: fill missing values with the column mean (no scaling is applied)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

# categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen during
# fit encode as all zeros at transform time instead of raising an error.
# (fill_value is not passed: SimpleImputer only uses it with strategy='constant'.)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# apply both pipelines in one step; sparse_threshold=0 forces a dense array output
preprocessor4 = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            remainder = 'passthrough',
            sparse_threshold=0)


a = X_seasonal

# fit the preprocessor on the data and transform it in the same step
processed_data = preprocessor4.fit_transform(a)

# Get the one-hot column names generated for the categorical features
feature_names = preprocessor4.named_transformers_['cat']\
                    .named_steps['onehot'].get_feature_names_out(categorical_features)

# Numeric columns come first in the transformer output, followed by the one-hot columns
all_feature_names = numeric_features + list(feature_names)

# Convert the processed data array back into a DataFrame
X_seasonal = pd.DataFrame(processed_data, columns=all_feature_names)

# Interaction features for the seasonal model
X_seasonal["seasonal_vaccine_effectiveness"] = X_seasonal["opinion_seas_vacc_effective"]*X_seasonal["opinion_seas_sick_from_vacc"]
X_seasonal["seasonal_household"]=X_seasonal["household_children"]*X_seasonal["household_adults"]

# Strip every character other than letters, digits and underscore from the column names
X_seasonal = X_seasonal.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Stripping can make two names collide; keep only the first column of each name
X_seasonal = X_seasonal.loc[:,~X_seasonal.columns.duplicated()].copy()

In [None]:
# Impute and one-hot encode the full H1N1 feature set, then add the H1N1
# interaction features. The preprocessor4 fitted here replaces any earlier one.
# NOTE: X_h1n1 is overwritten with its processed form at the end of this cell,
# so the cell cannot be re-run without first re-creating the raw X_h1n1.

# list of numeric features
numeric_features = ['h1n1_concern', 'h1n1_knowledge',  'behavioral_face_mask',
                    'behavioral_wash_hands', 'behavioral_large_gatherings',
                    'behavioral_touch_face',
                    'doctor_recc_h1n1', 'chronic_med_condition',
                    'child_under_6_months', 'health_worker',
                    'health_insurance', 'opinion_h1n1_vacc_effective',
                    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc','doctor_recc_seasonal',
                    'opinion_seas_vacc_effective','opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                    'household_children', 'behavioral_outside_home', 'behavioral_antiviral_meds',
                    'behavioral_avoidance'
                    ]

# list of categorical features
categorical_features = ['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation',
                        'marital_status', 'race', 'sex', 'rent_or_own', 'age_group',
                        'employment_status', 'education', 'income_poverty']


# numeric columns: fill missing values with the column mean (no scaling is applied)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

# categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen during
# fit encode as all zeros at transform time instead of raising an error.
# (fill_value is not passed: SimpleImputer only uses it with strategy='constant'.)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# apply both pipelines in one step; sparse_threshold=0 forces a dense array output
preprocessor4 = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            remainder = 'passthrough',
            sparse_threshold=0)


a = X_h1n1

# fit the preprocessor on the data and transform it in the same step
processed_data = preprocessor4.fit_transform(a)

# Get the one-hot column names generated for the categorical features
feature_names = preprocessor4.named_transformers_['cat']\
                    .named_steps['onehot'].get_feature_names_out(categorical_features)

# Numeric columns come first in the transformer output, followed by the one-hot columns
all_feature_names = numeric_features + list(feature_names)

# Convert the processed data array back into a DataFrame
X_h1n1 = pd.DataFrame(processed_data, columns=all_feature_names)

# Interaction features for the H1N1 model
X_h1n1["h1n1_flu"] = X_h1n1["h1n1_concern"]*X_h1n1["h1n1_knowledge"]
X_h1n1["h1n1_vaccine_effectiveness"] = X_h1n1["opinion_h1n1_vacc_effective"]*X_h1n1["opinion_h1n1_sick_from_vacc"]
X_h1n1["h1n1_chronic"] = X_h1n1["chronic_med_condition"]*X_h1n1["doctor_recc_h1n1"]

# Strip every character other than letters, digits and underscore from the column names
X_h1n1 = X_h1n1.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Stripping can make two names collide; keep only the first column of each name
X_h1n1 = X_h1n1.loc[:,~X_h1n1.columns.duplicated()].copy()

In [None]:
# Refit the seasonal classifier on the complete processed seasonal data (no train/test split here).
clf_seasonal.fit(X_seasonal, y_seasonal)

In [None]:
# Show the feature importances of the refitted seasonal model.
clf_seasonal.feature_importances_

In [None]:
# Refit the H1N1 classifier on the complete processed H1N1 data (no train/test split here).
clf_h1n1.fit(X_h1n1, y_h1n1)

In [None]:
# Show the feature importances of the refitted H1N1 model.
clf_h1n1.feature_importances_

In [None]:
# Load the feature file that the final predictions are made for
# (path is relative to the notebook's working directory).
test = pd.read_csv("test_set_features.csv")

In [None]:
# Preview the first rows of the loaded test features.
test.head()

In [None]:
# Summary statistics of the test features (numeric columns by default).
test.describe()

In [None]:
# Column dtypes of the test features, to see which columns are numeric and which are strings.
test.dtypes

In [None]:
# Prepare the test features for the seasonal model: run them through the
# already-fitted preprocessor4 (transform only, no refit) and rebuild the same
# seasonal interaction columns that were added to the training data.
a = test.drop(columns='respondent_id')

processed_data = preprocessor4.transform(a)

# Column names: the numeric features first, then the one-hot names of the fitted encoder
onehot_step = preprocessor4.named_transformers_['cat'].named_steps['onehot']
feature_names = onehot_step.get_feature_names_out(categorical_features)
all_feature_names = [*numeric_features, *feature_names]

processed_df = (
    pd.DataFrame(processed_data, columns=all_feature_names)
    # seasonal interaction features
    .assign(
        seasonal_vaccine_effectiveness=lambda frame: frame["opinion_seas_vacc_effective"]*frame["opinion_seas_sick_from_vacc"],
        seasonal_household=lambda frame: frame["household_children"]*frame["household_adults"],
    )
    # keep only letters, digits and underscores in the column names
    .rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    # drop any column whose cleaned name duplicates an earlier one
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .copy()
)

In [None]:
# Class probabilities from the seasonal model for the processed test features.
# NOTE(review): processed_df is reassigned by a later cell for the H1N1 features,
# so this must run right after the seasonal preprocessing cell.
prediction = clf_seasonal.predict_proba(processed_df)

In [None]:
# Prepare the test features for the H1N1 model: run them through the
# already-fitted preprocessor4 (transform only, no refit) and rebuild the same
# H1N1 interaction columns that were added to the training data.
a = test.drop(columns='respondent_id')

processed_data = preprocessor4.transform(a)

# Column names: the numeric features first, then the one-hot names of the fitted encoder
onehot_step = preprocessor4.named_transformers_['cat'].named_steps['onehot']
feature_names = onehot_step.get_feature_names_out(categorical_features)
all_feature_names = [*numeric_features, *feature_names]

processed_df = (
    pd.DataFrame(processed_data, columns=all_feature_names)
    # H1N1 interaction features
    .assign(
        h1n1_flu=lambda frame: frame["h1n1_concern"]*frame["h1n1_knowledge"],
        h1n1_vaccine_effectiveness=lambda frame: frame["opinion_h1n1_vacc_effective"]*frame["opinion_h1n1_sick_from_vacc"],
        h1n1_chronic=lambda frame: frame["chronic_med_condition"]*frame["doctor_recc_h1n1"],
    )
    # keep only letters, digits and underscores in the column names
    .rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    # drop any column whose cleaned name duplicates an earlier one
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .copy()
)

In [None]:
# Class probabilities from the H1N1 model for the processed test features.
# NOTE(review): relies on processed_df holding the H1N1 feature set, i.e. on the
# H1N1 preprocessing cell having run most recently.
prediction_h1n1 = clf_h1n1.predict_proba(processed_df)

In [None]:
# Start the output table from the respondent ids of the test file.
d = {'respondent_id': test['respondent_id']}

In [None]:
# Output frame with a single 'respondent_id' column; prediction columns are added below.
df = pd.DataFrame(data=d)

In [None]:
# Second column of the predict_proba output (presumably the probability of class 1 —
# verify against clf_h1n1.classes_). Assigned by position, so row order must match `test`.
df['h1n1_vaccine'] = prediction_h1n1[:,1]

In [None]:
# Second column of the predict_proba output (presumably the probability of class 1 —
# verify against clf_seasonal.classes_). Assigned by position, so row order must match `test`.
df['seasonal_vaccine'] = prediction[:,1]

In [None]:
# Write the respondent ids and both probability columns to CSV, without the row index.
df.to_csv('rf.csv', sep=",", index=False)