# Assignment 3
## JuPyter Notebook - Verschuur L. 1811053, Kolenbrander M. 1653415

In [1]:
## Parameters

# Convert numeric values into ranged representations
convert_to_range = True
# Represent ranges as categorical values or rounded to nearest base value: 12, b=5 -> 10-14
range_categorical = False
# Convert categorical values (not from ranged values) into one hot representations:
# {'smoking': ['sometimes', 'regularly', 'sometimes', 'never']} -> 
# {'smoking_sometimes': [1, 0, 1, 0], 'smoking_regularly': [0, 1, 0, 0], 'smoking_never': [0, 0, 0, 1]}
convert_categorical_to_one_hot = True

## Utility functions

In [2]:
import math

def round_to_base(in_val, base):
    """
    Round the in_val value to the nearest base.
    E.g. base 5
        2  -> 0
        9  -> 10
        12 -> 10
        13 -> 15
    
    :param in_val: Value to round
    :param base: Base to round to
    :return: Rounded value
    """
    
    return round(in_val/base) * base

def floor_range(in_val, range_val):
    """
    Floor the in_val value to the nearest base (range_val). 
    And then indicate a range.
    E.g. base 5
        2  -> 0 - 4
        9  -> 5 - 9
        12 -> 10 - 14
        13 -> 10 - 14
    
    :param in_val: Value to round
    :param range_val: Range to floor to
    :return: Rounded value range
    """
    
    floored_val = math.floor(in_val/range_val) * range_val
    
    return f"{floored_val} - {floored_val + range_val - 1}"


In [3]:
def find_column_names(df, find_column_name):
    """
    Find all columns in a dataframe based on a prefix (find_column_name) and return as a list.
    E.g. columns: ["a_a", "a_b", "b_a", "c"] and find_column_name: "a"
        ["a_a", "a_b"]
    
    :param df: Input data frame
    :param find_column_name: Prefix of column name
    :return: Rounded value range
    """
    
    return [column_name for column_name in df.columns if f"{find_column_name}_" in column_name ]

def fetch_columns_on_name_list(df, find_column_name_list):
    """
    Find all subsets of the prefixes provided in find_column_name_list and form a list of these.
    The columns are found with find_column_names.
    
    :param df: Input data frame
    :param find_column_name_list: List of prefix column names
    :return: Rounded value range
    """
    
    return [name for name_group in [find_column_names(df, column_name) for column_name in find_column_name_list] for name in name_group]

## Data fetching & Data pre-processing

In [4]:
import pandas as pd
import numpy as np

file_path = "healthcare-dataset-stroke-data.csv"

# Fetching CSV and converting to data frame
data_file = pd.read_csv(file_path, header=0)

# Drop all entries with nan values
data_file = data_file.dropna()

# Convert interval and ratio variables into ranges
if convert_to_range and range_categorical:
    data_file.insert(data_file.columns.get_loc("age"), "ranged_age", [floor_range(age, 10) for age in data_file["age"]])
    data_file.insert(data_file.columns.get_loc("avg_glucose_level"), "ranged_avg_glucose_level", [floor_range(avg_glucose_level, 25) for avg_glucose_level in data_file["avg_glucose_level"]])
    data_file.insert(data_file.columns.get_loc("bmi"), "ranged_bmi", [floor_range(bmi, 2) for bmi in data_file["bmi"]])
elif convert_to_range and not range_categorical:
    data_file.insert(data_file.columns.get_loc("age"), "ranged_age", [round_to_base(age, 10) for age in data_file["age"]])
    data_file.insert(data_file.columns.get_loc("avg_glucose_level"), "ranged_avg_glucose_level", [round_to_base(avg_glucose_level, 25) for avg_glucose_level in data_file["avg_glucose_level"]])
    data_file.insert(data_file.columns.get_loc("bmi"), "ranged_bmi", [round_to_base(bmi, 2) for bmi in data_file["bmi"]])

data_file.insert(data_file.columns.get_loc("bmi"), "rounded_bmi", data_file["bmi"].round(0))
    
# Convert "boolean" variables into true boolean variables
data_file["hypertension"] = data_file["hypertension"].astype("bool")
data_file["heart_disease"] = data_file["heart_disease"].astype("bool")
data_file["stroke"] = data_file["stroke"].astype("bool")
data_file["ever_married"] = np.where(data_file["ever_married"] == "Yes", True, False).astype("bool")
    
# Convert categorical variables into a one-hot representation
if convert_categorical_to_one_hot:
    data_file = pd.concat([data_file, pd.get_dummies(data_file["gender"], prefix="gender")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["work_type"], prefix="work_type")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["Residence_type"], prefix="Residence_type")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["smoking_status"], prefix="smoking_status")], axis=1)

## Classification

### Test & Train set generation
**Applied algorithms for this test and training set**
- SKLearn Random Forest Classifier
- SKLearn Support Vector Classification Classifier

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Decision Variables
X = data_file.loc[:,["ranged_age", "hypertension", "heart_disease", "ever_married", "ranged_avg_glucose_level", "rounded_bmi", *fetch_columns_on_name_list(data_file, ["work_type"])]].values
# Target Variable
y = data_file.loc[:,"stroke"].values

# Split datasets into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1)

sc = StandardScaler()

# Scale data for classifiers
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test)

### Random Forest training & testing

In [65]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=150, n_jobs=-1)
rf_classifier.fit(X_train_scaled, Y_train)

RandomForestClassifier(n_estimators=150, n_jobs=-1)

In [70]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf_y_prediction = rf_classifier.predict(X_test_scaled)

print("# # # Classification report # # #")
print(classification_report(Y_test, rf_y_prediction))

print("# # # Confusion matrix # # #")
print(confusion_matrix(Y_test, rf_y_prediction), "\n")

print("# # # Accuracy score # # #")
print(accuracy_score(Y_test, rf_y_prediction), "\n")

# # # Classification report # # #
              precision    recall  f1-score   support

       False       0.95      0.98      0.97       469
        True       0.00      0.00      0.00        22

    accuracy                           0.94       491
   macro avg       0.48      0.49      0.48       491
weighted avg       0.91      0.94      0.93       491

# # # Confusion matrix # # #
[[461   8]
 [ 22   0]] 

# # # Accuracy score # # #
0.9389002036659878




### Support Vector Classification training & testing

In [77]:
from sklearn.svm import SVC

SV_classifier = SVC()
SV_classifier.fit(X_train_scaled, Y_train)

SVC()

In [78]:
SV_y_prediction = SV_classifier.predict(X_test_scaled)

print("# # # Classification report # # #")
print(classification_report(Y_test, SV_y_prediction))

print("# # # Confusion matrix # # #")
print(confusion_matrix(Y_test, SV_y_prediction), "\n")

print("# # # Accuracy score # # #")
print(accuracy_score(Y_test, SV_y_prediction), "\n")

# # # Classification report # # #
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       469
        True       0.00      0.00      0.00        22

    accuracy                           0.96       491
   macro avg       0.48      0.50      0.49       491
weighted avg       0.91      0.96      0.93       491

# # # Confusion matrix # # #
[[469   0]
 [ 22   0]] 

# # # Accuracy score # # #
0.955193482688391 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
