# COVID-19's Impact on Healthcare Accessibility
### By: Tristan Call and Maria Elena Aviles-Baquero

# Classification
In this notebook we apply classifiers to the dataset and compare their performance.

**TODO:** Clarify how this notebook differs from the EDA notebook.

In [1]:
# Imports for the classification notebook.
# Each mysklearn module is imported, reloaded, and then re-bound so that edits made to
# the package source while the kernel is running are picked up without a restart.
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# MyPyTable is the table class used below to load and reshape the CSV data
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.plot_utils
importlib.reload(mysklearn.plot_utils)
import mysklearn.plot_utils as plot_utils

# Classifiers evaluated (or available for evaluation) in this notebook
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyZeroRClassifier, MyRandomClassifier, MyRandomForestClassifier

# Cross-validation fold generation and confusion-matrix helpers
import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# os: building the input file path; tabulate: rendering confusion matrices as text tables
import os
import pandas as pd
from tabulate import tabulate

In [2]:
# Re-import myutils so that edits made to the module since the imports cell ran are picked up
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

working_data_filename = os.path.join("input_data", "week21_working.csv")

# Load the data into a mypytable for future analysis
overall_table = MyPyTable()
overall_table.load_from_file(working_data_filename)
overall_table.convert_to_numeric()

# Convert birth year into bigger categorical chunks.
# Labels are "1932 to 1941", "1942 to 1951", ..., "1982 to 1991", plus a final
# "1992 to 2002" bin; cutoffs are the eight boundaries 1932, 1942, ..., 2002.
year_col = overall_table.get_column("TBIRTH_YEAR")
year_label = [str(1932 + 10 * x) + " to " + str(1941 + 10 * x) for x in range(6)]
year_label.append("1992 to 2002")
cutoffs = [1932 + 10 * x for x in range(8)]
year_col = myutils.categorize_continuous_list(year_col, cutoffs, year_label)

# Create DELAYNOTGET column: 1 when either DELAY or NOTGET equals 1, otherwise 2.
# (The evaluation cells below display 1 as "Delayed/canceled" and 2 as "Not delayed".)
delay = overall_table.get_column("DELAY")
notget = overall_table.get_column("NOTGET")
delaynotget = [1 if d == 1 or n == 1 else 2 for d, n in zip(delay, notget)]

# Combine all the above into the overall_table.
# The birth-year column is located by name instead of assuming it sits at index 1,
# so a different column order in the CSV cannot silently overwrite another attribute.
year_index = overall_table.column_names.index("TBIRTH_YEAR")
overall_table.column_names.append("DELAYNOTGET")
overall_table.data = [
    row[:year_index] + [year_bin] + row[year_index + 1:] + [flag]
    for row, year_bin, flag in zip(overall_table.data, year_col, delaynotget)
]

In [3]:
# Split the table into the feature rows (X_train) and the class labels (Y_train)
feature_names = ["TBIRTH_YEAR", "EGENDER", "RHISPANIC", "RRACE", "EEDUC", "INCOME"]
X_train = overall_table.get_columns(feature_names).data
Y_train = overall_table.get_column("DELAYNOTGET")

## Compute the Baseline
First we will compute the baseline classifiers to get an idea of how much we must improve our classifiers.

In [4]:
# Baseline: Zero-R classifier scored with stratified k-fold cross validation
print("\n".join([
    "===========================================",
    "Predictive Accuracy",
    "===========================================",
    "Stratified 10-Fold Cross Validation",
]))

k = 10
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X_train, Y_train, k)

# Predictions and actual labels pooled over every fold (reused by the confusion-matrix cell)
all_predicted_delay_zero = []
all_actual_delay_zero = []

for train_indices, test_indices in zip(train_folds, test_folds):
    # Pull this fold's rows and labels out by index
    fold_x_train = myutils.distribute_data_by_index(X_train, train_indices)
    fold_y_train = myutils.distribute_data_by_index(Y_train, train_indices)
    fold_x_test = myutils.distribute_data_by_index(X_train, test_indices)
    fold_y_test = myutils.distribute_data_by_index(Y_train, test_indices)

    # Fit on the training portion and predict the held-out portion
    zero = MyZeroRClassifier()
    zero.fit(fold_x_train, fold_y_train)
    all_predicted_delay_zero.extend(zero.predict(fold_x_test))
    all_actual_delay_zero.extend(fold_y_test)

# Accuracy over the pooled predictions; the error rate is its complement
accuracy = myutils.calculate_accuracy(all_predicted_delay_zero, all_actual_delay_zero)
error_rate = 1 - accuracy

print(f"Zero R: accuracy = {accuracy}, error rate = {error_rate}")

Predictive Accuracy
Stratified 10-Fold Cross Validation
Zero R: accuracy = 0.614735226400614, error rate = 0.38526477359938605


In [5]:
print('''===========================================
Confusion Matrices
===========================================
Zero R (Stratified 10-Fold Cross Validation):''')

# Sort the class labels so the matrix rows/columns always come out as [1, 2], matching
# the positional display names below (1 -> "Delayed/canceled", 2 -> "Not delayed").
# A bare set() makes no ordering guarantee, which could silently swap the labels.
ylabels = sorted(set(Y_train))
matrix = myevaluation.confusion_matrix(all_actual_delay_zero, all_predicted_delay_zero, ylabels)
# NOTE(review): the helper appears to add the row-label/Total/Recognition columns to
# `matrix` in place and return the header row — confirm against myutils.
header = myutils.format_confusion_matrix_into_table(matrix, ["Delayed/canceled", "Not delayed"], "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

Confusion Matrices
Zero R (Stratified 10-Fold Cross Validation):
Delayed/canceled      Delayed/canceled    Not delayed    Total    Recognition (%)
Delayed/canceled                     0           1506     1506                  0
Not delayed                          0           2403     2403                100


### Random Classifier

In [6]:
# Baseline: random classifier scored with stratified k-fold cross validation
print("\n".join([
    "===========================================",
    "Predictive Accuracy",
    "===========================================",
    "Stratified 10-Fold Cross Validation",
]))

k = 10
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X_train, Y_train, k)

# Predictions and actual labels pooled over every fold (reused by the confusion-matrix cell)
all_predicted_delay_random = []
all_actual_delay_random = []

for train_indices, test_indices in zip(train_folds, test_folds):
    # Pull this fold's rows and labels out by index
    fold_x_train = myutils.distribute_data_by_index(X_train, train_indices)
    fold_y_train = myutils.distribute_data_by_index(Y_train, train_indices)
    fold_x_test = myutils.distribute_data_by_index(X_train, test_indices)
    fold_y_test = myutils.distribute_data_by_index(Y_train, test_indices)

    # Fit on the training portion and predict the held-out portion
    random = MyRandomClassifier()
    random.fit(fold_x_train, fold_y_train)
    all_predicted_delay_random.extend(random.predict(fold_x_test))
    all_actual_delay_random.extend(fold_y_test)

# Accuracy over the pooled predictions; the error rate is its complement
accuracy = myutils.calculate_accuracy(all_predicted_delay_random, all_actual_delay_random)
error_rate = 1 - accuracy

print(f"Random: accuracy = {accuracy}, error rate = {error_rate}")

Predictive Accuracy
Stratified 10-Fold Cross Validation
Random: accuracy = 0.5154771041187004, error rate = 0.4845228958812996


In [7]:
print('''===========================================
Confusion Matrices
===========================================
Random (Stratified 10-Fold Cross Validation):''')

# Sort the class labels so the matrix rows/columns always come out as [1, 2], matching
# the positional display names below (1 -> "Delayed/canceled", 2 -> "Not delayed").
# A bare set() makes no ordering guarantee, which could silently swap the labels.
ylabels = sorted(set(Y_train))
matrix = myevaluation.confusion_matrix(all_actual_delay_random, all_predicted_delay_random, ylabels)
# NOTE(review): the helper appears to add the row-label/Total/Recognition columns to
# `matrix` in place and return the header row — confirm against myutils.
header = myutils.format_confusion_matrix_into_table(matrix, ["Delayed/canceled", "Not delayed"], "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

Confusion Matrices
Random (Stratified 10-Fold Cross Validation):
Delayed/canceled      Delayed/canceled    Not delayed    Total    Recognition (%)
Delayed/canceled                   573            933     1506              38.05
Not delayed                        961           1442     2403              60.01


## Compute Actual Classifiers
Next we will run some more intensive classifiers over our dataset and see how they compare.

### Naive Bayes

In [8]:
# Naive Bayes scored with stratified k-fold cross validation
print("\n".join([
    "===========================================",
    "Predictive Accuracy",
    "===========================================",
    "Stratified 10-Fold Cross Validation",
]))

k = 10
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X_train, Y_train, k)

# Predictions and actual labels pooled over every fold (reused by the confusion-matrix cell)
all_predicted_delay_bayes = []
all_actual_delay_bayes = []

for train_indices, test_indices in zip(train_folds, test_folds):
    # Pull this fold's rows and labels out by index
    fold_x_train = myutils.distribute_data_by_index(X_train, train_indices)
    fold_y_train = myutils.distribute_data_by_index(Y_train, train_indices)
    fold_x_test = myutils.distribute_data_by_index(X_train, test_indices)
    fold_y_test = myutils.distribute_data_by_index(Y_train, test_indices)

    # Fit on the training portion and predict the held-out portion
    bayes = MyNaiveBayesClassifier()
    bayes.fit(fold_x_train, fold_y_train)
    all_predicted_delay_bayes.extend(bayes.predict(fold_x_test))
    all_actual_delay_bayes.extend(fold_y_test)

# Accuracy over the pooled predictions; the error rate is its complement
accuracy = myutils.calculate_accuracy(all_predicted_delay_bayes, all_actual_delay_bayes)
error_rate = 1 - accuracy

print(f"Naive bayes: accuracy = {accuracy}, error rate = {error_rate}")

Predictive Accuracy
Stratified 10-Fold Cross Validation
Naive bayes: accuracy = 0.6144794064978255, error rate = 0.3855205935021745


In [9]:
print('''===========================================
Confusion Matrices
===========================================
Naive bayes (Stratified 10-Fold Cross Validation):''')

# Sort the class labels so the matrix rows/columns always come out as [1, 2], matching
# the positional display names below (1 -> "Delayed/canceled", 2 -> "Not delayed").
# A bare set() makes no ordering guarantee, which could silently swap the labels.
ylabels = sorted(set(Y_train))
matrix = myevaluation.confusion_matrix(all_actual_delay_bayes, all_predicted_delay_bayes, ylabels)
# NOTE(review): the helper appears to add the row-label/Total/Recognition columns to
# `matrix` in place and return the header row — confirm against myutils.
header = myutils.format_confusion_matrix_into_table(matrix, ["Delayed/canceled", "Not delayed"], "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

Confusion Matrices
Naive bayes (Stratified 10-Fold Cross Validation):
Delayed/canceled      Delayed/canceled    Not delayed    Total    Recognition (%)
Delayed/canceled                   221           1285     1506              14.67
Not delayed                        222           2181     2403              90.76


### Decision Tree

In [10]:
# Decision tree scored with stratified k-fold cross validation
print("\n".join([
    "===========================================",
    "Predictive Accuracy",
    "===========================================",
    "Stratified 10-Fold Cross Validation",
]))

k = 10
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X_train, Y_train, k)

# Predictions and actual labels pooled over every fold (reused by the confusion-matrix cell)
all_predicted_delay_tree = []
all_actual_delay_tree = []

for train_indices, test_indices in zip(train_folds, test_folds):
    # Pull this fold's rows and labels out by index
    fold_x_train = myutils.distribute_data_by_index(X_train, train_indices)
    fold_y_train = myutils.distribute_data_by_index(Y_train, train_indices)
    fold_x_test = myutils.distribute_data_by_index(X_train, test_indices)
    fold_y_test = myutils.distribute_data_by_index(Y_train, test_indices)

    # Fit on the training portion and predict the held-out portion
    tree = MyDecisionTreeClassifier()
    tree.fit(fold_x_train, fold_y_train)
    all_predicted_delay_tree.extend(tree.predict(fold_x_test))
    all_actual_delay_tree.extend(fold_y_test)

# Accuracy over the pooled predictions; the error rate is its complement
accuracy = myutils.calculate_accuracy(all_predicted_delay_tree, all_actual_delay_tree)
error_rate = 1 - accuracy

print(f"Decision Tree: accuracy = {accuracy}, error rate = {error_rate}")

Predictive Accuracy
Stratified 10-Fold Cross Validation
Decision Tree tree: accuracy = 0.5817344589409056, error rate = 0.4182655410590944


In [11]:
print('''===========================================
Confusion Matrices
===========================================
Decision Tree (Stratified 10-Fold Cross Validation):''')

# Sort the class labels so the matrix rows/columns always come out as [1, 2], matching
# the positional display names below (1 -> "Delayed/canceled", 2 -> "Not delayed").
# A bare set() makes no ordering guarantee, which could silently swap the labels.
ylabels = sorted(set(Y_train))
matrix = myevaluation.confusion_matrix(all_actual_delay_tree, all_predicted_delay_tree, ylabels)
# NOTE(review): the helper appears to add the row-label/Total/Recognition columns to
# `matrix` in place and return the header row — confirm against myutils.
header = myutils.format_confusion_matrix_into_table(matrix, ["Delayed/canceled", "Not delayed"], "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

Confusion Matrices
Decision Tree (Stratified 10-Fold Cross Validation):
Delayed/canceled      Delayed/canceled    Not delayed    Total    Recognition (%)
Delayed/canceled                   333           1173     1506              22.11
Not delayed                        462           1941     2403              80.77


As you can see, the decision tree did not perform very well: it was outperformed by the Zero R classifier in terms of accuracy.

In [75]:
# tree = MyDecisionTreeClassifier()
# tree.fit(X_train, Y_train)
# tree.print_decision_rules(attribute_names=["TBIRTH_YEAR", "EGENDER", "RHISPANIC", "RRACE", "EEDUC", "INCOME"], class_name="delayed/canceled (1 = yes)")

### Forest 

In [22]:
# Reload the classifiers module so recent edits to MyRandomForestClassifier are picked up
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyZeroRClassifier, MyRandomClassifier, MyRandomForestClassifier

# Forest hyper-parameters, passed to the constructor in the order (N, F, M).
# NOTE(review): presumably N = trees generated, M = trees kept, F = attributes sampled
# per tree — confirm against MyRandomForestClassifier's constructor.
N = 5
M = 3
F = 2

# Predictions and actual labels pooled over every repetition
all_predicted_forest = []
all_actual_forest = []
# Run tests of each parameter 5 times
for run in range(5):
    forest = MyRandomForestClassifier(N, F, M)
    # The per-run accuracy is not needed: overall accuracy is recomputed from the pooled lists
    run_accuracy, predicted, actual = forest.test_tree_stratified_kfold(X_train, Y_train)
    all_predicted_forest += predicted
    all_actual_forest += actual

accuracy = myutils.calculate_accuracy(all_predicted_forest, all_actual_forest)
error_rate = 1 - accuracy
# NOTE(review): the fold count in both banners comes from the original accuracy banner;
# confirm it matches what test_tree_stratified_kfold actually uses.
print('''===========================================
Predictive Accuracy
===========================================
Stratified 3-Fold Cross Validation''')
print("Forest: accuracy = " + str(accuracy) + ", error rate = " + str(error_rate))

print('''===========================================
Confusion Matrices
===========================================
Forest (Stratified 3-Fold Cross Validation):''')

# Build the matrix from the forest's pooled results. The cell previously reused the
# decision-tree lists here, so it reprinted the decision tree's confusion matrix.
# Labels are sorted so rows/columns match the positional display names
# (1 -> "Delayed/canceled", 2 -> "Not delayed"); a bare set() has no guaranteed order.
ylabels = sorted(set(Y_train))
matrix = myevaluation.confusion_matrix(all_actual_forest, all_predicted_forest, ylabels)
header = myutils.format_confusion_matrix_into_table(matrix, ["Delayed/canceled", "Not delayed"], "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

[{'attributes': ['a3', 'a5'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d42c28b0>, 'accuracy': 0.6182795698924731}, {'attributes': ['a4', 'a2'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d410bcd0>, 'accuracy': 0.6137071651090342}, {'attributes': ['a0', 'a3'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d410bc70>, 'accuracy': 0.6116207951070336}, {'attributes': ['a0', 'a3'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d428ceb0>, 'accuracy': 0.6092783505154639}]
[{'attributes': ['a4', 'a5'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d428c970>, 'accuracy': 0.6370143149284253}, {'attributes': ['a3', 'a0'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d46c69a0>, 'accuracy': 0.619979402677652}, {'attributes': ['a0', 'a2'], 'tree': <mysklearn.myclassifiers.MyDecisionTreeClassifier object at 0x7f29d46c6a00>, 'ac