# Import Required Libraries
#### Side note: if importing `graphviz` fails, run `pip install graphviz` and restart the kernel

In [None]:
conda install python-graphviz

In [None]:
pip install graphviz

In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Source
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Set Up: Import the dataset

In [None]:
# Read the mushroom CSV into a DataFrame and show it
DATASET_CSV = "mushrooms_imputed.csv"
dataset = pd.read_csv(DATASET_CSV)
display(dataset)

# Dataset Preparation: Data Visualization

In [None]:
## Summary statistics for every column of the dataset
dataset_summary = dataset.describe(include='all')
display(dataset_summary)

def visualize_column(col_n):
    """Draw two seaborn count plots for one column of the global `dataset`.

    The left plot counts each value of `col_n`; the right plot shows the same
    counts split by the 'class' column. Both plots share the bar order given by
    `dataset[col_n].value_counts()`.

    col_n - name of the column in `dataset` to plot
    """
    readable_name = col_n.replace('-', ' ')
    axis_label = readable_name.capitalize()
    bar_order = dataset[col_n].value_counts().index

    plt.figure(figsize=(22, 10))

    ## Left graph: overall frequency of each value
    plt.subplot(221)
    sns.countplot(x=col_n, data=dataset, order=bar_order, palette='rocket')
    plt.title('Frequency of mushroom by {}'.format(readable_name), fontsize=22, pad=10)
    plt.xlabel(axis_label, fontsize=16)
    plt.ylabel('')

    ## Right graph: frequency of each value, split by class
    plt.subplot(222)
    sns.countplot(x=col_n, data=dataset, hue='class', order=bar_order, palette='rocket')
    plt.title('Frequency of {} by class'.format(readable_name), fontsize=22, pad=10)
    plt.legend(bbox_to_anchor=(1.15, 1), loc='upper right', fontsize=12)
    plt.xlabel(axis_label, fontsize=16)
    plt.ylabel('')
    plt.show()

# The first column is the class label, so plot every column after it
feature_columns = list(dataset.columns)[1:]
for col_names in feature_columns:
    print(col_names)
    visualize_column(col_names)

# Dataset Preparation: Data Checking

In [None]:
## Report the dataset dimensions and the number of null values per column
shape = dataset.shape
n_rows, n_cols = shape
print(f'The dataset has {n_rows} rows and {n_cols} columns\n')
print("Check if there are any missing values:")
print(dataset.isnull().sum())

## List the distinct values found in each column
unique_df = pd.DataFrame()
print(f"\n{'Column':25} {'# Unique Types':25} {'Unique Types'}")
gap = ' ' * 15
for col in dataset:
    column_values = dataset[col]
    print(f'{col:32} {column_values.nunique()} {gap} {column_values.unique()}')

## Count how many 'stalk-root' entries still hold the '?' placeholder
stalk_root = dataset['stalk-root']
num_missing_entries = (stalk_root == '?').sum()
total_entries = stalk_root.count()
print(f"\nNumber of data entries w/ stalk-root = '?': {num_missing_entries}")
print(f"Total number of data entries: {total_entries} ")
print(f"Percentage of data entries w/ stalk-root = '?': {(num_missing_entries/total_entries)*100}% ")

# Dataset Preparation: Data Cleaning (Fix missing value)
> This was our initial way of handling the missing feature, but we modified the dataset externally

In [None]:
# Disabled code kept for reference: the whole cell body is one string literal, so none of it runs
# and `clean_dataset_by_most_freq_feature` is never created.
"""
# Create a deep copy of the dataset
clean_dataset_by_most_freq_feature = dataset.copy()

# Replacing all feature values '?' w/ the most frequent feature == replacing all feature values '?' w/ the most frequent feature of the class
# From the data visualization above 'b' is the most frequent feature
clean_dataset_by_most_freq_feature = clean_dataset_by_most_freq_feature.replace('?', 'b')
dataset = clean_dataset_by_most_freq_feature
print(dataset['stalk-root'].unique())
"""

# Feature Engineering: Removing Veil Type
#### Veil type is a useless feature: it has only one attribute value, which both classes share (a zero-variance predictor). The goal is to examine whether removing such a feature will increase performance. Our prediction is that it won't do much to the performance.

In [None]:
# Copy of the dataset without the 'veil-type' column; performance is tested on this dataset as well
r_col_dataset = dataset.drop(columns='veil-type')

# Preprocess Data: Label encode the dataset
#### Side note: Label encoding converts each feature value into a numeric form; this is needed because sklearn libraries work only on numeric labels

In [None]:
## Work on a deep copy so `dataset` itself is left untouched
c_dataset = dataset.copy()

# Separate the class label from the features
# c_X : a dataframe; c_y : a vector
c_y = c_dataset['class']
c_X = c_dataset.drop(columns='class')

# One-hot encode the categorical features
hot_encoder = OneHotEncoder()
enc_X = hot_encoder.fit_transform(c_X)

# Binarize the class labels
binarizer_encoder = LabelBinarizer()
enc_y = binarizer_encoder.fit_transform(c_y)

# print(enc_X)

# Round-trip check of both encoders (disabled)
# inv_X = hot_encoder.inverse_transform(enc_X)
# inv_y = binarizer_encoder.inverse_transform(enc_y)
# inv_dataset = pd.DataFrame(inv_X, columns = list(c_X.columns))
# inv_dataset.insert(loc = 0, column = 'class', value = inv_y)
# display(inv_dataset.head())
# display(c_dataset.head())

## Divide dataset to training and validation set
#### Side note: The row # and columns # is a bit off from the actual dataset because of the encoders

In [None]:
# Hold out 30% of the encoded rows for validation; the remaining 70% are used for training
X_train, X_validation, y_train, y_validation = train_test_split(
    enc_X, enc_y, test_size=0.3, random_state=0)

# print(y_train)
print(f'Dataset contains {enc_X.shape[0]} rows and {enc_X.shape[1]} columns')

# Report the shape of each split: features first, then labels
for split_name, X_part, y_part in (("training", X_train, y_train),
                                   ("test", X_validation, y_validation)):
    print(f'Attributes & values {split_name} data contains {X_part.shape[0]} rows and {X_part.shape[1]} columns')
    print(f'Class label {split_name} data contains {y_part.shape[0]} rows')

## Subroutine function to label encode & split dataset
#### Used to test any other datasets after feature engineering

In [None]:
def generate_random_split(dataset, test_size_val):
    """One-hot encode a dataset and split it into training and validation sets.

    dataset - DataFrame holding a 'class' column plus the feature columns
    test_size_val - forwarded to train_test_split as test_size
                    (a float in [0, 1]: the proportion of rows held out)

    returns, in this order:
        X_train, X_validation, y_train, y_validation,
        the fitted OneHotEncoder and LabelBinarizer (allow the data to be decoded if wanted),
        enc_X, enc_y (the full encoded features and labels before splitting)
    """
    # Work on a deep copy of the caller's frame
    working_copy = dataset.copy()
    labels = working_copy['class']
    features = working_copy.drop(columns='class')

    # OneHotEncoder for the categorical features, LabelBinarizer for the labels
    feature_encoder = OneHotEncoder()
    label_encoder = LabelBinarizer()
    encoded_features = feature_encoder.fit_transform(features)
    encoded_labels = label_encoder.fit_transform(labels)

    # random_state is fixed at 0 for every call
    split_parts = train_test_split(
        encoded_features, encoded_labels, test_size=test_size_val, random_state=0)

    return (*split_parts, feature_encoder, label_encoder, encoded_features, encoded_labels)

## Label Encode Dataset that Removed the 'veil-type' Feature 
#### Explore whether removing the column will provide better performance than not removing the column

In [None]:
# Encode and split the dataset that has the 'veil-type' column removed (30% held out)
(r_X_train, r_X_validation, r_y_train, r_y_validation,
 r_hot_encoder, r_binarizer_encoder, r_enc_X, r_enc_y) = generate_random_split(r_col_dataset, 0.3)

# Report the shape of each split: features first, then labels
for split_name, X_part, y_part in (("training", r_X_train, r_y_train),
                                   ("test", r_X_validation, r_y_validation)):
    print(f'Attributes & values {split_name} data contains {X_part.shape[0]} rows and {X_part.shape[1]} columns')
    print(f'Class label {split_name} data contains {y_part.shape[0]} rows')

## Label Encode Dataset that replaced '?' w/ Most Frequent 'stalk-root' Feature 
#### Explore whether replacing '?' with the most frequent 'stalk-root' value will provide better performance than leaving it as is

In [None]:
## Again we found a different way of cleaning the data, this is not needed
# The code below is disabled by wrapping it in a string literal, so the c_* split variables are never created.
# NOTE(review): the print statements inside report the r_* shapes rather than the c_* ones — fix if this is re-enabled.
"""
c_X_train, c_X_validation, c_y_train, c_y_validation, c_hot_encoder, c_binarizer_encoder, c_enc_X, c_enc_y = generate_random_split(
    clean_dataset_by_most_freq_feature, 0.3)

print(f'Attributes & values training data contains {r_X_train.shape[0]} rows and {r_X_train.shape[1]} columns')
print(f'Class label training data contains {r_y_train.shape[0]} rows')

print(f'Attributes & values test data contains {r_X_validation.shape[0]} rows and {r_X_validation.shape[1]} columns')
print(f'Class label test data contains {r_y_validation.shape[0]} rows')
"""

## Helper Functions for Evaluating Models

In [None]:
def measure_accuracy(model, feature_data, expected_output):
    """Return the accuracy score of `model`'s predictions on `feature_data`.

    model - a fitted estimator exposing predict()
    feature_data - a matrix (2-d vector) of feature values
    expected_output - a vector of the corresponding true classes
    """
    # accuracy_score's signature is (y_true, y_pred): pass the ground truth first
    return accuracy_score(expected_output, model.predict(feature_data))

def evaluate_model_n_print_info(model, X_train, y_train, X_validation, y_validation):
    """Score a fitted model on the training and validation splits and print the results.

    Each split is scored with measure_accuracy, and the wall-clock time of that
    scoring call is measured with time.time().

    returns: (train accuracy, validation accuracy,
              seconds spent scoring the training split,
              seconds spent scoring the validation split)
    """
    def _timed_accuracy(features, labels):
        # Returns (accuracy, elapsed seconds) for one split
        began = time.time()
        score = measure_accuracy(model, features, labels)
        return score, time.time() - began

    model_train_split_accuracy, model_testing_training_time = _timed_accuracy(X_train, y_train)
    model_validation_split_accuracy, model_testing_validation_time = _timed_accuracy(X_validation, y_validation)

    print(f'Train Accuracy = {model_train_split_accuracy}')
    print(f'Validation Accuracy = {model_validation_split_accuracy}')
    print(f'Time elapse for testing on training data: {model_testing_training_time}')
    print(f'Time elapsed for testing on validation data: {model_testing_validation_time}\n')

    return (model_train_split_accuracy, model_validation_split_accuracy,
            model_testing_training_time, model_testing_validation_time)

def df_k_fold_cross_validation(k_fold_results_arr, k_fold_time_arr):
    """Package per-fold accuracies and times into a two-column DataFrame for easier display."""
    return pd.DataFrame(data={
        "Accuracy for kth-fold": k_fold_results_arr,
        "Time for kth-fold": k_fold_time_arr,
    })

def k_fold_cross_validation(model, X_train, y_train):
    """Run 10-fold cross-validation, display the per-fold table and print the averages.

    The displayed table and the printed averages use the 'test_score' and
    'fit_time' entries of the cross_validate result.

    returns: the dictionary produced by cross_validate
    """
    # Recommended to do 10 folds
    result = cross_validate(model, X_train, y_train, cv=10)
    fold_scores = result['test_score']
    fold_fit_times = result['fit_time']
    display(df_k_fold_cross_validation(fold_scores, fold_fit_times))

    model_mean_accuracy = round(sum(fold_scores) / len(fold_scores), 4)
    model_mean_fit_time = round(sum(fold_fit_times) / len(fold_fit_times), 4)
    print(f'Average accuracy for all K-fold cross-validations: {model_mean_accuracy}')
    print(f'Time elapse for K-fold cross-validations: {model_mean_fit_time}.\n')
    return result

def grid_serach_helper(model, parameters, X_train, y_train):
    """Grid-search `parameters` for `model` with 10-fold cross-validation.

    Used to capture the best hyperparameters for the model.

    returns: (the best estimator found, the fitted GridSearchCV object)
    """
    search = GridSearchCV(model, parameters, cv=10).fit(X_train, y_train)
    return search.best_estimator_, search

# Decision Tree Models
> _Self-note: There are many other metrics we can play around w/ if we run out of things. I commented out decision trees that can split on the best random attribute_. We can place a range on many of the parameters, then graph them and see which one gives the best results like this: https://www.kaggle.com/tosinabase/mushroom-classification-tree-methods-comparison

In [None]:
####################### DT Models w/ 'veil-type' #######################
## Decision tree built with the default constructor (gini index metric / gini impurity)
# Each timing below covers both construction and fitting
start = time.time()
gini_decision_tree_best = DecisionTreeClassifier().fit(X_train, y_train)
train_gini_DT = time.time() - start
print(f"Time elpased to build/train gini DT: {train_gini_DT}\n")

## Decision tree using the entropy metric (info gain) (Learned in class)
start = time.time()
entropy_decision_tree_best = DecisionTreeClassifier(criterion="entropy").fit(X_train, y_train)
train_entropy_DT = time.time() - start
print(f"Time elpased to build/train entropy DT: {train_entropy_DT}\n")

####################### DT Models w/o 'veil-type' #######################
## Gini decision tree trained on the data w/o the 'veil-type' feature
start = time.time()
r_gini_decision_tree_best = DecisionTreeClassifier().fit(r_X_train, r_y_train)
r_train_gini_DT = time.time() - start
print(f"Time elpased to build/train gini DT w/o 'veil-type' feature: {r_train_gini_DT}\n")

## Entropy decision tree trained on the data w/o the 'veil-type' feature
start = time.time()
r_entropy_decision_tree_best = DecisionTreeClassifier(criterion="entropy").fit(r_X_train, r_y_train)
r_train_entropy_DT = time.time() - start
print(f"Time elpased to build/train entropy DT w/o 'veil-type' feature: {r_train_entropy_DT}\n")

### This not needed anymore
# The code below is disabled by wrapping it in a string literal: the c_* decision trees are never built.
"""
####################### DT Models w/ clean data #######################
## Creating a decision tree w/ gini index metric (gini impurity) & w/ clean data
# splitter = best (on the best feature)
start = time.time()
c_gini_decision_tree_best = DecisionTreeClassifier()
c_gini_decision_tree_best.fit(c_X_train, c_y_train)
c_train_gini_DT = time.time() - start
print(f"Time elpased to build/train gini DT w/ clean data: {c_train_gini_DT}\n")

## Creating a decision tree w/ entropy metric (info gain) (Learned in class) & w/ clean data
# splitter = best (on the best feature)
start = time.time()
c_entropy_decision_tree_best = DecisionTreeClassifier(criterion = "entropy") 
c_entropy_decision_tree_best.fit(c_X_train, c_y_train)
c_train_entropy_DT = time.time() - start
print(f"Time elpased to build/train entropy DT w/ clean data: {c_train_entropy_DT}")
"""

### Testing Decision Tree Models on Datasets w/ and w/o 'Veil-type'

In [None]:
####################### Testing DT Models w/ 'veil-type' #######################
## Gini tree: accuracy and scoring time on the train and validation splits
print("Performance for Decision Tree w/ gini index using train test split:")
(DT_gini_train_split,
 DT_gini_validation_split,
 DT_gini_testing_training_time,
 DT_gini_testing_validation_time) = evaluate_model_n_print_info(
    gini_decision_tree_best, X_train, y_train, X_validation, y_validation)

## Information-gain tree: accuracy and scoring time on the train and validation splits
print("Performance for Decision Tree w/ information gain using train test split:")
(DT_entropy_train_split,
 DT_entropy_validation_split,
 DT_entropy_testing_training_time,
 DT_entropy_testing_validation_time) = evaluate_model_n_print_info(
    entropy_decision_tree_best, X_train, y_train, X_validation, y_validation)

####################### K-fold cross-validation #######################
## Fresh, unfitted trees for k-fold cross-validation
DT_gini_k_fold = DecisionTreeClassifier()
DT_entropy_k_fold = DecisionTreeClassifier(criterion="entropy")

## K-fold cross-validation (k = 10) on the gini index decision tree
print("Performance for Decision Tree w/ gini index using K-fold cross-validation w/ K=10:")
DT_gini_k_fold_result = k_fold_cross_validation(DT_gini_k_fold, X_train, y_train)

## K-fold cross-validation (k = 10) on the info gain decision tree
print("Performance for Decision Tree w/ information gain using K-fold cross-validation w/ K=10:")
DT_entropy_k_fold_result = k_fold_cross_validation(DT_entropy_k_fold, X_train, y_train)

####################### Grid-Search that uses K-fold cross-validation to fine tune parameters #######################
# This will take a while..
# Hyperparameter grid searched for the decision tree: both criteria, depths 1-10 plus unlimited depth
print("Performance or Grid-Search Decision Tree:")
DT_parameters = dict(
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 11), None],
)
DT_best_grid_search, DT_grid_search_info = grid_serach_helper(
    DecisionTreeClassifier(), DT_parameters, X_train, y_train)
evaluate_model_n_print_info(DT_best_grid_search, X_train, y_train, X_validation, y_validation)

# Compare the depth the un-tuned trees reached with the parameters the grid search picked
print("Compare parameters:")
print(f"Max depth of DT w/ gini: {gini_decision_tree_best.tree_.max_depth}")
print(f"Max depth of DT w/ entropy: {entropy_decision_tree_best.tree_.max_depth}")
print(f'Grid-Search DT Best paramters: {DT_grid_search_info.best_params_}')

In [None]:
####################### Testing DT Models w/o 'veil-type' #######################
## Gini tree w/o 'veil-type': accuracy and scoring time on the train and validation splits
print("Performance for Decision Tree w/ gini index & w/o 'veil-type' using train test split:")
(r_DT_gini_train_split,
 r_DT_gini_validation_split,
 r_DT_gini_testing_training_time,
 r_DT_gini_testing_validation_time) = evaluate_model_n_print_info(
    r_gini_decision_tree_best, r_X_train, r_y_train, r_X_validation, r_y_validation)

## Information-gain tree w/o 'veil-type': accuracy and scoring time on the train and validation splits
print("Performance for Decision Tree w/ information gain & w/o 'veil-type' using train test split:")
(r_DT_entropy_train_split,
 r_DT_entropy_validation_split,
 r_DT_entropy_testing_training_time,
 r_DT_entropy_testing_validation_time) = evaluate_model_n_print_info(
    r_entropy_decision_tree_best, r_X_train, r_y_train, r_X_validation, r_y_validation)

########## K-fold cross-validation to fine tune parameters ##########
## Fresh, unfitted trees for k-fold cross-validation
r_DT_gini_k_fold = DecisionTreeClassifier()
r_DT_entropy_k_fold = DecisionTreeClassifier(criterion="entropy")

## K-fold cross-validation (k = 10) on the gini index decision tree
print("Performance for Decision Tree w/ gini index & w/o 'veil-type' using K-fold cross-validation w/ K=10:")
r_DT_gini_k_fold_result = k_fold_cross_validation(r_DT_gini_k_fold, r_X_train, r_y_train)

## K-fold cross-validation (k = 10) on the info gain decision tree
print("Performance for Decision Tree w/ information gain & w/o 'veil-type' using K-fold cross-validation w/ K=10:")
r_DT_entropy_k_fold_result = k_fold_cross_validation(r_DT_entropy_k_fold, r_X_train, r_y_train)

####################### Grid-Search that uses K-fold cross-validation to fine tune parameters #######################
## No grid search is run here: it would just repeat the results from above, since we realized
## that removing the 'veil-type' produces the same model
## So we will continue to develop w/o 'veil-type'

In [None]:
### This can be removed; it was kept for testing a way to clean the data ###
# The code below is disabled by wrapping it in a string literal, so none of the c_* results are produced.
"""
####################### Testing DT Models w/ clean data #######################
## Testing gini index decision tree w/ train test split
print("Performance for Decision Tree w/ gini index & w/ clean data using train test split:")
c_DT_gini_train_split, c_DT_gini_validation_split, c_DT_gini_testing_training_time, c_DT_gini_testing_validation_time = evaluate_model_n_print_info(c_gini_decision_tree_best, c_X_train, c_y_train, c_X_validation, c_y_validation)

## Testing information gain decision tree w/ train test split
print("Performance for Decision Tree w/ information gain & w/ clean data using train test split:")
c_DT_entropy_train_split, c_DT_entropy_validation_split, c_DT_entropy_testing_training_time, c_DT_entropy_testing_validation_time = evaluate_model_n_print_info(c_entropy_decision_tree_best, c_X_train, c_y_train, c_X_validation, c_y_validation)

########## K-fold cross-validation to fine tune parameters ##########
## Create new DT for k-fold cross-validation
c_DT_gini_k_fold = DecisionTreeClassifier()
c_DT_entropy_k_fold = DecisionTreeClassifier(criterion = "entropy") 

## K-fold cross-validation (k = 10) on gini index decision tree
print("Performance for Decision Tree w/ gini index & w/ clean data using K-fold cross-validation w/ K=10:")
c_DT_gini_k_fold_result = k_fold_cross_validation(c_DT_gini_k_fold, c_X_train, c_y_train)

## K-fold cross-validation (k = 10) on info gain decision tree
print("Performance for Decision Tree w/ information gain & w/ clean data using K-fold cross-validation w/ K=10:")
c_DT_entropy_k_fold_result = k_fold_cross_validation(c_DT_entropy_k_fold, c_X_train, c_y_train)
"""


### Display Decision Tree Models
#### Note: The graph that gets printed has misleading splitting attributes (it shows that it is splitting on numbers, but all of the attribute values are strings; this is due to label encoding)

In [None]:
######### Display Decision Tree #############
# OneHotEncoder.get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever one this installation provides so the cell runs on both.
_encoded_name_getter = getattr(hot_encoder, "get_feature_names_out", None) or hot_encoder.get_feature_names
# Encoded column names, computed once and shared by all three exports below
tree_feature_names = list(_encoded_name_getter(list(dataset.columns[1:])))

## Display decision tree w/ gini metric (gini impurity)
# Generate a .dot file
export_graphviz(gini_decision_tree_best, out_file = "giniDecisionTree.dot",
                feature_names = tree_feature_names,
                class_names = ['e', 'p'], filled = True, rounded = True)
# Generate a .png of the .dot file
giniDecisionTreePNG = Source.from_file("giniDecisionTree.dot", format='png')
giniDecisionTreePNG.view()

## Display decision tree w/ entropy metric (info gain)
export_graphviz(entropy_decision_tree_best, out_file = "entropyDecisionTree.dot",
                feature_names = tree_feature_names,
                class_names = ['e', 'p'], filled = True, rounded = True)
# Generate a .png of the .dot file
entropyDecisionTreePNG = Source.from_file("entropyDecisionTree.dot", format='png')
entropyDecisionTreePNG.view()

######### Display Grid-Search Decision Tree #############
export_graphviz(DT_best_grid_search, out_file = "GridSearchDecisionTree.dot",
                feature_names = tree_feature_names,
                class_names = ['e', 'p'], filled = True, rounded = True)
# Generate a .png of the .dot file
GridSearchDecisionTreePNG = Source.from_file("GridSearchDecisionTree.dot", format='png')
GridSearchDecisionTreePNG.view()

### Analyze Decision Tree: Feature Importance

In [None]:
# Display the numerical values of the Importance Scores
# OneHotEncoder.get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever one this installation provides so the cell runs on both.
_encoded_name_getter = getattr(hot_encoder, "get_feature_names_out", None) or hot_encoder.get_feature_names
encoded_columns = _encoded_name_getter(list(dataset.columns[1:]))

# One row per encoded column: gini tree score next to entropy tree score
print("Decision Trees Feature Importance Scores:")
print("Feature {0:25} Gini DT {0:25} Entropy DT ".format(" "))
for column_name, gini_score, entropy_score in zip(
        encoded_columns,
        gini_decision_tree_best.feature_importances_,
        entropy_decision_tree_best.feature_importances_):
    print("{0:<25} {1:^25} {2:>25}".format(column_name, gini_score, entropy_score))

print("\n")

# Same table for the tree selected by the grid search
print("Grid Search Decision Tree Feature Importance Scores:")
print("Feature {0:18} Feature Importance Score ".format(" "))
for column_name, grid_score in zip(encoded_columns, DT_best_grid_search.feature_importances_):
    print("{0:<25} {1:^25}".format(column_name, grid_score))

In [None]:
# Collect the importance scores into frames indexed by encoded column name, highest score first
DT_important_features_df = pd.DataFrame(
    data = {"Gini DT" : gini_decision_tree_best.feature_importances_,
            "Entropy DT" : entropy_decision_tree_best.feature_importances_},
    index = encoded_columns,
).sort_values(by=["Gini DT"], ascending = False)

DT_grid_search_important_features_df = pd.DataFrame(
    data = {"Grid Search DT" : DT_best_grid_search.feature_importances_},
    index = encoded_columns,
).sort_values(by=["Grid Search DT"], ascending = False)

# A feature with a non-zero importance score was used in at least one split
gini_used = DT_important_features_df['Gini DT'] != 0
entropy_used = DT_important_features_df['Entropy DT'] != 0
grid_used = DT_grid_search_important_features_df['Grid Search DT'] != 0
print(f"Number of features split on for Gini DT: {int(gini_used.sum())}")
print(f"Number of features split on for Entropy DT: {int(entropy_used.sum())}")
print(f"Number of features split on for Grid Search DT: {int(grid_used.sum())}")

# Keep only the features that at least one of the plotted trees split on
DT_important_features_df = DT_important_features_df.loc[gini_used | entropy_used]
DT_grid_search_important_features_df = DT_grid_search_important_features_df.loc[grid_used]
# display(DT_grid_search_important_features_df)
# display(DT_important_features_df)
bar_DT_important_feat = DT_important_features_df.plot.bar(rot=0, fontsize=15, figsize = (50, 10))
bar_DT_grid_search_important_feat = DT_grid_search_important_features_df.plot.bar(rot=0, fontsize=15, figsize = (50, 10))


# Random Forest Models

In [None]:
## Flatten the label arrays into 1-d vectors for the random forests.
# ravel() adapts to whatever number of rows the split produced, so the cell keeps
# working if the dataset or the test_size changes.
y_train_vect = y_train.ravel()
y_validation_vect = y_validation.ravel()

## Creating a random forest w/ gini index metric (gini impurity)
n_trees_gini = 100 # default amount
start = time.time()
gini_random_forest = RandomForestClassifier(n_estimators = n_trees_gini)
gini_random_forest.fit(X_train, y_train_vect)
train_gini_RF = time.time() - start
print(f"Time elpased to build/train gini RF: {train_gini_RF}\n")

## Creating a random forest w/ entropy metric (info gain) (Learned in class)
n_trees_entropy = 100 # default amount
start = time.time()
entropy_random_forest = RandomForestClassifier(n_estimators = n_trees_entropy, criterion = "entropy")
entropy_random_forest.fit(X_train, y_train_vect)
train_entropy_RF = time.time() - start
print(f"Time elpased to build/train entropy RF: {train_entropy_RF}\n")

### Testing Random Forest Models

In [None]:
## Gini random forest: accuracy and scoring time on the train and validation splits
print("Performance for Random Forest w/ gini index using train test split:")
(RT_gini_train_split,
 RT_gini_validation_split,
 RT_gini_testing_training_time,
 RT_gini_testing_validation_time) = evaluate_model_n_print_info(
    gini_random_forest, X_train, y_train_vect, X_validation, y_validation_vect)

## Information-gain random forest: accuracy and scoring time on the train and validation splits
print("Performance for Random Forest w/ information gain using train test split:")
(RT_entropy_train_split,
 RT_entropy_validation_split,
 RT_entropy_testing_training_time,
 RT_entropy_testing_validation_time) = evaluate_model_n_print_info(
    entropy_random_forest, X_train, y_train_vect, X_validation, y_validation_vect)


########## K-fold cross-validation to fine tune parameters ##########
## Create new, unfitted random forests for k-fold cross-validation
RF_gini_k_fold = RandomForestClassifier()
RF_entropy_k_fold = RandomForestClassifier(criterion = "entropy")

## K-fold cross-validation (k = 10) on gini index random forest
print("Performance for Random Forest w/ gini index using K-fold cross-validation w/ K=10:")
RF_gini_k_fold_result = k_fold_cross_validation(RF_gini_k_fold, X_train, y_train_vect)

## K-fold cross-validation (k = 10) on info gain random forest
# Stored under its own name so the gini result above is not overwritten
print("Performance for Random Forest w/ information gain using K-fold cross-validation w/ K=10:")
RF_entropy_k_fold_result = k_fold_cross_validation(RF_entropy_k_fold, X_train, y_train_vect)

####################### Grid-Search that uses K-fold cross-validation to fine tune parameters #######################
## Note this takes a while to run, but I do graph this later on so might have an error when displaying RF, but works
# The grid search below is disabled by wrapping it in a string literal, so RF_best_search_grid and
# RF_search_grid_info are NOT defined unless this code is re-enabled and run.
"""
RF_parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 11)) + [None],
    'n_estimators': [15, 25, 50, 75, 100] # the default is 100
}
RF_best_search_grid, RF_search_grid_info = grid_serach_helper(RandomForestClassifier(), RF_parameters, X_train, y_train_vect)
evaluate_model_n_print_info(RF_best_search_grid, X_train, y_train_vect, X_validation, y_validation_vect)
print(f'Best paramters: {RF_search_grid_info.best_params_}')
"""

### Display Random Forest Models 

In [None]:
#### These will take a while to run & might be laggy
# OneHotEncoder.get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever one this installation provides so the cell runs on both.
_encoded_name_getter = getattr(hot_encoder, "get_feature_names_out", None) or hot_encoder.get_feature_names
# Encoded column names, computed once instead of once per plotted tree
rf_plot_feature_names = list(_encoded_name_getter(list(dataset.columns[1:])))

## Display random forest w/ gini metric (gini impurity)
# Since we are not able to display all trees, we display the first 5 estimators of the forest
fig, axes = plt.subplots(nrows = 1, ncols = 5, figsize = (10,2), dpi=3000)
for index in range(0, 5):
    tree.plot_tree(gini_random_forest.estimators_[index], feature_names = rf_plot_feature_names,
                   class_names = ['e', 'p'], filled = True, ax = axes[index])
    axes[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig.savefig('giniRandomForestPNG.png')
fig.show()


## Display random forest w/ entropy metric (info gain)
# Since we are not able to display all trees, we display the first 5 estimators of the forest
fig2, axes2 = plt.subplots(nrows = 1, ncols = 5, figsize = (10,2), dpi=3000)
for index in range(0, 5):
    tree.plot_tree(entropy_random_forest.estimators_[index], feature_names = rf_plot_feature_names,
                   class_names = ['e', 'p'], filled = True, ax = axes2[index])
    axes2[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig2.savefig('entropyRandomForestPNG.png')
fig2.show()

# Disabled code kept for reference: it is wrapped in a string literal, so none of it runs.
# NOTE(review): the last line inside saves `fig2`, although the figure built in this snippet is `fig3` —
# fix if this is re-enabled.
"""
## Display random forest w/ Grid Search
## Note: You need to run the GridSearchCSV from above to run this code
fig3, axes3 = plt.subplots(nrows = 1,ncols = 5,figsize = (10,2), dpi=3000)
for index in range(0, 5):
    tree.plot_tree(RF_best_search_grid.estimators_[index], feature_names = list(hot_encoder.get_feature_names(list(dataset.columns[1:]))),
                class_names = ['e', 'p'], filled = True, ax = axes3[index])
    axes3[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig2.savefig('gridSearchRandomForestPNG.png')
"""


## Possibly add this instead of .view()
# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')

### Analyze Random Forest: Feature Importance

In [None]:
# Display the numerical values of the Importance Scores
# OneHotEncoder.get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever one this installation provides so the cell runs on both.
_encoded_name_getter = getattr(hot_encoder, "get_feature_names_out", None) or hot_encoder.get_feature_names
encoded_columns = _encoded_name_getter(list(dataset.columns[1:]))

# One row per encoded column: gini forest score next to entropy forest score
print("Random Forest Feature Importance Scores:")
print("Feature {0:25} Gini RF {0:20} Entropy RF ".format(" "))
for column_name, gini_score, entropy_score in zip(
        encoded_columns,
        gini_random_forest.feature_importances_,
        entropy_random_forest.feature_importances_):
    print("{0:<25} {1:^25} {2:>25}".format(column_name, gini_score, entropy_score))

# Display the numerical values of the Importance Scores of Grid Search
# RF_best_search_grid exists only when the Random Forest grid search has been run in this kernel;
# skip its table instead of raising NameError when it has not.
if "RF_best_search_grid" in globals():
    print("Random Forest Feature Importance Scores:")
    print("Feature {0:25} Grid Search RF ".format(" "))
    for column_name, grid_score in zip(encoded_columns, RF_best_search_grid.feature_importances_):
        print("{0:<25} {1:^25}".format(column_name, grid_score))
else:
    print("Grid Search RF feature importance scores skipped: RF_best_search_grid is not defined.")

In [None]:
## Bar graph of the Importance Scores of both random forests
rf_importance_all = pd.DataFrame(
    data = {"Gini RF" : gini_random_forest.feature_importances_,
            "Entropy RF" : entropy_random_forest.feature_importances_},
    index = encoded_columns,
)

# Keep only the features with a non-zero score in at least one forest, highest gini score first
rf_has_importance = (rf_importance_all['Gini RF'] != 0) | (rf_importance_all['Entropy RF'] != 0)
RF_important_features_df = rf_importance_all.loc[rf_has_importance].sort_values(
    by=["Gini RF"], ascending = False)

# display(RF_important_features_df)
bar_RF_important_feat = RF_important_features_df.plot.bar(rot=0, fontsize=15, figsize = (500, 10))


# Disabled code kept for reference: it is wrapped in a string literal, so none of it runs.
# It reads RF_best_search_grid, which must exist before this is re-enabled.
"""
##### THIS WILL GIVE YOU AN ERROR IF YOU DO NOT FINISH RUNNING THE RF FROM ABOVE #####
## Display the numerical values of the Importance Scores  of Grid Search in a bar graph
Grid_Search_RF_important_features_df = pd.DataFrame(
    data = {"Grid Search RF" : RF_best_search_grid.feature_importances_}, index = encoded_columns)
Grid_Search_RF_important_features_df.drop(Grid_Search_RF_important_features_df[(Grid_Search_RF_important_features_df['Grid Search RF'] == 0)].index, inplace = True)

Grid_Search_RF_important_features_df.sort_values(by=["Grid Search RF"], ascending = False, inplace = True)

# display(RF_important_features_df)
bar_grid_search_RF_important_feat = Grid_Search_RF_important_features_df.plot.bar(rot=0, fontsize=15, figsize = (500, 10))
"""


# Logistic Regression

In [None]:
# Create and fit a logistic regression with default settings; the timing covers construction and fit
start = time.time()
log_reg = LogisticRegression()
# ravel() hands the labels to fit() as a 1-d vector
log_reg.fit(X_train, y_train.ravel())
train_log_reg = time.time() - start
# The message names the model that was actually timed (logistic regression)
print(f"Time elapsed to build/train logistic regression: {train_log_reg}\n")

### Testing Logistic Regression Model

In [None]:
## Logistic regression: accuracy and scoring time on the train and validation splits
print("Performance for Logistic Regression Model using train test split:")
(log_reg_train_split,
 log_reg_validation_split,
 log_reg_testing_training_time,
 log_reg_testing_validation_time) = evaluate_model_n_print_info(
    log_reg, X_train, y_train, X_validation, y_validation)

########## K-fold cross-validation to fine tune parameters ##########
## Fresh, unfitted logistic regression for k-fold cross-validation
log_reg_k_fold = LogisticRegression()

## K-fold cross-validation (k = 10) on the logistic regression model
print("Performance for Logistic Regression Model using K-fold cross-validation w/ K=10:")
log_reg_k_fold_result = k_fold_cross_validation(log_reg_k_fold, X_train, y_train.ravel())


### Analyze Logistic Regression: Feature Importance
> This is one of the challenges w/ using a model such as logistic regression: unlike a DT, it is not easy to visualize and find the important features

In [None]:
# coefs = np.abs(log_reg.coef_[0])
# Display the numerical values of the Importance Scores
# Positive coefficients indicate a feature that predicts class 1 and negative coefficients indicate a feature that predicts class 0
# OneHotEncoder.get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever one this installation provides so the cell runs on both.
_encoded_name_getter = getattr(hot_encoder, "get_feature_names_out", None) or hot_encoder.get_feature_names
encoded_columns = _encoded_name_getter(list(dataset.columns[1:]))

# One row per encoded column: signed coefficient next to its absolute value
print("Logistic Regression Feature Importance Scores:")
print("Feature {0:20} Coefficient {0:20} Abs(coefficient)".format(" "))
for column_name, coefficient in zip(encoded_columns, log_reg.coef_[0]):
    print("{0:<25} {1:^20} {2:>33}".format(column_name, coefficient, abs(coefficient)))

In [None]:
## Bar graphs of the logistic regression coefficients: signed values and absolute values
log_reg_coefficients = log_reg.coef_[0]

# Each frame is indexed by encoded column name and sorted from largest to smallest value
log_reg_coefficient_df = pd.DataFrame(
    data = {"Coefficient" : log_reg_coefficients}, index = encoded_columns,
).sort_values(by=["Coefficient"], ascending = False)
log_reg_abs_coefficient_df = pd.DataFrame(
    data = {"Abs(Coefficient)" : abs(log_reg_coefficients)}, index = encoded_columns,
).sort_values(by=["Abs(Coefficient)"], ascending = False)

bar_log_reg_coefficient = log_reg_coefficient_df.plot.bar(
    title = "Coefficient val vs. Features Type", rot=0, fontsize=15, figsize = (500, 10))
bar_log_reg_abs_coefficient = log_reg_abs_coefficient_df.plot.bar(
    title = "Abs(Coefficient) val vs. Feature Type", rot=0, fontsize=15, figsize = (500, 10))

# Summary of Results

In [None]:
def display_summary(dt1, dt2, rf1, rf2, log, label, title, zoom=0):
    """Plot one metric for the five models as a bar chart and display it as a table.

    Parameters
    ----------
    dt1, dt2 : metric value for the gini / entropy decision tree.
    rf1, rf2 : metric value for the gini / entropy random forest.
    log : metric value for logistic regression.
    label : name of the metric; used as the y-axis label and as the table column name.
    title : title of the bar chart.
    zoom : when truthy, the y-axis is limited to 0.95-1.005 so that values
        close to 1 can be told apart.

    Returns
    -------
    None. The chart is shown with ``plt.show()`` and the table with ``display``.
    """
    model_names = [
        "Decision Tree gini",
        "Decision Tree entropy",
        "Random Forest gini",
        "Random Forest entropy",
        "Logistic Regression",
    ]
    metric_values = [dt1, dt2, rf1, rf2, log]
    summary_df = pd.DataFrame(data={"Models": model_names, label: metric_values})

    plt.figure(figsize=(10, 5))
    sns.barplot(x="Models", y=label, data=summary_df, palette='Set2')
    plt.title(title, fontsize=25)
    plt.xlabel('Models', fontsize=15)
    plt.ylabel(label, fontsize=15)
    plt.xticks(rotation=90, fontsize=15)
    if zoom:
        # Narrow window near 1 (callers pass zoom=1 for the accuracy charts).
        plt.ylim(0.95, 1.005)
    plt.show()

    display(summary_df)
    
def cal_avg_kfold_accuracy(result):
    """Return the mean of ``result['test_score']`` rounded to 4 decimal places.

    ``result`` is any mapping with a non-empty ``'test_score'`` sequence
    (presumably the per-fold scores from the notebook's k-fold helper -- confirm).
    An empty sequence raises ``ZeroDivisionError``.
    """
    fold_scores = result['test_score']
    mean_score = sum(fold_scores) / len(fold_scores)
    return round(mean_score, 4)

def cal_avg_kfold_time(result):
    """Return the mean of ``result['fit_time']`` rounded to 4 decimal places.

    ``result`` is any mapping with a non-empty ``'fit_time'`` sequence
    (presumably the per-fold fit times from the notebook's k-fold helper -- confirm).
    An empty sequence raises ``ZeroDivisionError``.
    """
    fit_times = result['fit_time']
    mean_fit_time = sum(fit_times) / len(fit_times)
    return round(mean_fit_time, 4)

# Accuracy summaries. The trailing 1 is display_summary's zoom flag, which limits the
# y-axis to 0.95-1.005 so the small differences between models stay visible.
print("Training Set Accuracy by Model")
display_summary(DT_gini_train_split, DT_entropy_train_split, RT_gini_train_split, RT_entropy_train_split, 
                log_reg_train_split, "Accuracy (%)", "Training Set Accuracy by Model", 1)

print("Validation Set Accuracy by Model")
display_summary(DT_gini_validation_split, DT_entropy_validation_split, RT_gini_validation_split, 
                RT_entropy_validation_split, log_reg_validation_split, "Accuracy (%)", "Validation Set Accuracy by Model", 1)

# The fourth argument fills the "Random Forest entropy" bar, so it must receive the entropy
# forest's k-fold result rather than the gini result a second time.
# NOTE(review): assumes that result is stored as RF_entropy_k_fold_result (mirroring the
# DT_gini/DT_entropy naming) -- confirm against the cell that runs the entropy forest k-fold.
print("Average K-Fold Accuracy (k = 10) by Model")
display_summary(cal_avg_kfold_accuracy(DT_gini_k_fold_result), 
                cal_avg_kfold_accuracy(DT_entropy_k_fold_result), 
                cal_avg_kfold_accuracy(RF_gini_k_fold_result),
                cal_avg_kfold_accuracy(RF_entropy_k_fold_result), 
                cal_avg_kfold_accuracy(log_reg_k_fold_result), "Accuracy (%)", "Average K-Fold Accuracy (k = 10) by Model", 1)

# Timing summaries. zoom is left at its default (0), so the y-axis is auto-scaled.
print("Time to Train Model")
display_summary(DT_gini_testing_training_time, DT_entropy_testing_training_time, RT_gini_testing_training_time, 
                RT_entropy_testing_training_time, log_reg_testing_training_time, "Time (sec)", "Time to Train Model")

print("Time to Run Training Set on Model")
display_summary(train_gini_DT, train_entropy_DT, train_gini_RF, train_entropy_RF, train_log_reg, "Time (sec)", 
                "Time to Run Training Set on Model")

print("Time to Run Validate Set on Model")
display_summary(DT_gini_testing_validation_time, DT_entropy_testing_validation_time, RT_gini_testing_validation_time, 
                RT_entropy_testing_validation_time, log_reg_testing_validation_time, "Time (sec)", 
                "Time to Run Validate Set on Model")

# Same slot rule as the k-fold accuracy chart: the fourth argument is the entropy forest.
# NOTE(review): RF_entropy_k_fold_result is an assumed name -- confirm it exists.
print("Average Time to Run K-Fold")
display_summary(cal_avg_kfold_time(DT_gini_k_fold_result), 
                cal_avg_kfold_time(DT_entropy_k_fold_result), 
                cal_avg_kfold_time(RF_gini_k_fold_result),
                cal_avg_kfold_time(RF_entropy_k_fold_result),
                cal_avg_kfold_time(log_reg_k_fold_result),"Time (sec)", "Average Time to Run K-Fold")