# COVID-19's Impact on Healthcare Accessibility
### By: Tristan Call and Maria Elena Aviles-Baquero

# Classification
In this document we will aim to apply classifiers to the dataset

(question about this notebook: what is the difference b/w this one and the EDA?)

In [26]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.plot_utils
importlib.reload(mysklearn.plot_utils)
import mysklearn.plot_utils as plot_utils

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import os
import pandas as pd
from tabulate import tabulate

In [20]:
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

working_data_filename = os.path.join("input_data", "week21_working.csv")

# Load the data into a mypytable for future analysis
overall_table = MyPyTable()
overall_table.load_from_file(working_data_filename)
overall_table.convert_to_numeric()

# Convert year into bigger categorical chunks
year_col = overall_table.get_column("TBIRTH_YEAR")
year_label = [str(1932 + 10 * x) + " to " + str(1941 + 10 * x) for x in range(6)]
year_label.append("1992 to 2002")
cutoffs = [1932 + 10 * x for x in range(8)]
year_col = myutils.categorize_continuous_list(year_col, cutoffs, year_label)

# Create DELAYNOTGET column
delay = overall_table.get_column("DELAY")
notget = overall_table.get_column("NOTGET")
delaynotget = []
for i in range(len(delay)):
    if delay[i] == 1 or notget[i] == 1:
        delaynotget.append(1)
    else:
        delaynotget.append(2)
        
# Combine all the above into the overall_table
overall_table.column_names.append("DELAYNOTGET")
overall_table.data = [[overall_table.data[i][0]] + [year_col[i]] + overall_table.data[i][2:] + [delaynotget[i]] for i in range(len(year_col))]

In [21]:
# Break information into X_train and class_label
X_train = overall_table.get_columns(["TBIRTH_YEAR", "EGENDER", "RHISPANIC", "RRACE", "EEDUC", "INCOME"])
X_train = X_train.data
Y_train = overall_table.get_column("DELAYNOTGET")

In [31]:
print('''===========================================
Predictive Accuracy
===========================================
Stratified 10-Fold Cross Validation''')
k = 10
all_predicted_delay_tree = []
all_actual_delay_tree = []

# Get training data
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X_train, Y_train, k)
for i in range(k):
    # Sort training data
    xtrain = myutils.distribute_data_by_index(X_train, train_folds[i])
    ytrain = myutils.distribute_data_by_index(Y_train, train_folds[i])
    xtest = myutils.distribute_data_by_index(X_train, test_folds[i])
    ytest = myutils.distribute_data_by_index(Y_train, test_folds[i])

    # Compute prediction and convert
    tree = MyDecisionTreeClassifier()
    tree.fit(xtrain, ytrain)
    predicted_mpg = tree.predict(xtest)
    all_predicted_delay_tree += predicted_mpg
    all_actual_delay_tree += ytest
    
# Calculate overall accuracy
accuracy = myutils.calculate_accuracy(all_predicted_delay_tree, all_actual_delay_tree)
error_rate = 1- accuracy

print("Decision Tree tree: accuracy = " + str(accuracy) + ", error rate = " + str(error_rate))

Predictive Accuracy
Stratified 10-Fold Cross Validation
Decision Tree tree: accuracy = 0.5817344589409056, error rate = 0.4182655410590944


In [30]:
print('''===========================================
Confusion Matrices
===========================================
Decision Tree (Stratified 10-Fold Cross Validation):''')

matrix = myevaluation.confusion_matrix(ytest, predicted_mpg, Y_train)
header = myutils.format_confusion_matrix_into_table(matrix, Y_train, "Delayed/canceled")

print(tabulate(matrix, headers=header, tablefmt="rst", numalign="right"))

Confusion Matrices
Decision Tree (Stratified 10-Fold Cross Validation):


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
tree = MyDecisionTreeClassifier()
tree.fit(X_train, Y_train)
tree.print_decision_rules(attribute_names=["TBIRTH_YEAR", "EGENDER", "RHISPANIC", "RRACE", "EEDUC", "INCOME"], class_name="delayed/canceled (1 = yes)")

IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 6.0 THEN delayed/canceled (1 = yes) == 2
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 4.0 THEN delayed/canceled (1 = yes) == 1
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 7.0 AND IF EGENDER == 2.0 AND IF RHISPANIC == 1.0 THEN delayed/canceled (1 = yes) == 1
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 7.0 AND IF EGENDER == 2.0 AND IF RHISPANIC == 2.0 THEN delayed/canceled (1 = yes) == 2
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 7.0 AND IF EGENDER == 1.0 THEN delayed/canceled (1 = yes) == 2
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 4.0 THEN delayed/canceled (1 = yes) == 2
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 3.0 THEN delayed/canceled (1 = yes) == 1
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AND IF EEDUC == 5.0 THEN delayed/canceled (1 = yes) == 2
IF TBIRTH_YEAR == 1962 to 1971 AND IF INCOME == 1.0 AN