# Part 1: Decision Trees

In [None]:
import numpy as np

from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, max_error, r2_score

### Task 1: Hyperparameter Tuning

In [None]:
'''
Load the Iris dataset.
'''
iris = load_iris()

'''
Split the data into training and testing sets.
'''
iris_X = iris.data
iris_y = iris.target
# test_size = 0.2 holds out 20% of the samples; the fixed random_state makes
# the split repeatable between runs.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, test_size = 0.2, random_state = 42)

'''
Implement a DT classifier.
'''
# Untuned baseline tree, kept so the tuned model below has something to be
# compared against.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(iris_X_train, iris_y_train)

'''
Perform a Random Search to find the best hyperparameters for the DT classifier.
Search for hyperparameters like max depth, min samples split, min samples leaf,
and criterion. HInt: Use the RandomizedSearchCV function from scikit-learn.
'''
# Candidate values per hyperparameter; max_depth covers the integers 2..10.
max_depth = [int(x) for x in np.linspace(2, 10, 9)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
criterion = ["gini", "entropy", "log_loss"]

random_grid = {
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "criterion" : criterion
}

# Sample 50 combinations from the grid and score each one with 5-fold
# cross-validation; only the training split is passed to the search.
optimal_clf = RandomizedSearchCV(
    estimator = clf,
    param_distributions = random_grid,
    n_iter = 50,
    cv = 5,
    random_state = 42
)

optimal_clf.fit(iris_X_train, iris_y_train)

'''
Print the best hyperparameters and the model’s accuracy with these
hyperparameters.
'''
print("Base model hyperparameters: ", clf.get_params())
print("Accuracy using base model on training data: %.2f" % (accuracy_score(iris_y_train, clf.predict(iris_X_train)) * 100), "%")
print("Accuracy using base model on testing data: %.2f" % (accuracy_score(iris_y_test, clf.predict(iris_X_test)) * 100), "%")
print()
print("Best hyperparameters: ", optimal_clf.best_params_)
print("Accuracy using model with best hyperparameters on training data: %.2f" % (accuracy_score(iris_y_train, optimal_clf.predict(iris_X_train)) * 100), "%")
# This line reports the tuned model, so it must not be labelled "base model".
print("Accuracy using model with best hyperparameters on testing data: %.2f" % (accuracy_score(iris_y_test, optimal_clf.predict(iris_X_test)) * 100), "%")

Base model hyperparameters:  {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}
Accuracy using base model on training data: 100.00 %
Accuracy using base model on testing data: 100.00 %

Best hyperparameters:  {'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 9, 'criterion': 'log_loss'}
Accuracy using model with best hyperparameters on training data: 96.67 %
Accuracy using base model on testing data: 100.00 %


### Task 2: Error Analysis

In [None]:
'''
After training the DT model with the best hyperparameters from Task 1, use this
model to make predictions on the test data.
'''
# Test-set accuracy, as a percentage, for the untuned tree and the tuned tree.
base_test_accuracy = accuracy_score(iris_y_test, clf.predict(iris_X_test)) * 100
tuned_test_accuracy = accuracy_score(iris_y_test, optimal_clf.predict(iris_X_test)) * 100

print("Accuracy using base model on testing data: ", base_test_accuracy, "%")
print("Accuracy using model with best hyperparameters on testing data: ", tuned_test_accuracy, "%")
print()

'''
Identify and print the indices of misclassified instances (where the true class
is not equal to the predicted class).
'''
def misclassified_indices(true_classes, predicted_classes, model, data):
  """Print the positions where the predicted class differs from the true class.

  Args:
    true_classes: Sequence or array of ground-truth labels.
    predicted_classes: Sequence or array of predicted labels, aligned
      position-by-position with true_classes.
    model: Model description, used only inside the printed message.
    data: Data set description, used only inside the printed message.

  Returns:
    None. One line is written to stdout: the message followed by the
    comma-separated indices, or by nothing when every prediction is correct.
  """
  # Coerce to arrays so plain Python lists are compared element-wise; comparing
  # two lists with != yields a single bool rather than a per-element mask.
  mismatches = np.asarray(true_classes) != np.asarray(predicted_classes)
  # argwhere returns one row per mismatch; column 0 is the sample position.
  indices = np.argwhere(mismatches)[:, 0]
  print("Using the %s on the %s, the following indices are misclassified: " % (model, data), end = "")
  print(", ".join(str(index) for index in indices))

# Report misclassified positions for each model on both splits; a blank line
# separates the two models' reports.
for report_number, (model_label, model) in enumerate((
  ("base model", clf),
  ("model with the best hyperparameters", optimal_clf),
)):
  if report_number > 0:
    print()
  misclassified_indices(iris_y_train, model.predict(iris_X_train), model_label, "training data")
  misclassified_indices(iris_y_test, model.predict(iris_X_test), model_label, "testing data")

Accuracy using base model on testing data:  100.0 %
Accuracy using model with best hyperparameters on testing data:  100.0 %

Using the base model on the training data, the following indices are misclassified: 
Using the base model on the testing data, the following indices are misclassified: 

Using the model with the best hyperparameters on the training data, the following indices are misclassified: 59, 62, 68, 116
Using the model with the best hyperparameters on the testing data, the following indices are misclassified: 


### Task 3: Confusion Matrix

In [None]:
'''
Calculate the confusion matrix for the model’s predictions on the test data.
'''
# pregnant=P=1; not pregnant=N=0 (using pregnancy to think through T/F N/P)
# TP: pregnancy test predicts pregnancy, woman is pregnant; predicted=1, actual=1
# TN: pregnancy test predicts not pregnant, woman is not pregnant; predicted=0, actual=0
# FP: pregnancy test predicts pregnant, woman is not pregnant; predicted=1, actual=0
# FN: pregnancy test predicts not pregnant, woman is pregnant; predicted=0, actual=1

def confusion_matrix_values(true_classes, predicted_classes, positive_class):
  """Count one-vs-rest confusion-matrix cells for a single positive class.

  Every label equal to positive_class counts as positive; any other label
  counts as negative.

  Args:
    true_classes: Iterable of ground-truth labels.
    predicted_classes: Iterable of predicted labels, aligned
      position-by-position with true_classes.
    positive_class: The label to treat as the positive class.

  Returns:
    A list [positive_class, TP, TN, FP, FN].

  Raises:
    ValueError: If the two label collections differ in length.
  """
  true_classes = list(true_classes)
  predicted_classes = list(predicted_classes)
  # Refuse misaligned inputs instead of silently ignoring the surplus labels.
  if len(true_classes) != len(predicted_classes):
    raise ValueError(
      "true_classes and predicted_classes must have the same length "
      "(got %d and %d)" % (len(true_classes), len(predicted_classes))
    )

  TP = TN = FP = FN = 0
  for actual, predicted in zip(true_classes, predicted_classes):
    actually_positive = actual == positive_class
    if predicted == positive_class:
      if actually_positive:
        TP += 1
      else:
        FP += 1
    elif actually_positive:
      FN += 1
    else:
      TN += 1
  return [positive_class, TP, TN, FP, FN]

def print_confusion_matrix(values, key):
  """Print a 2x2 one-vs-rest confusion matrix as a text table.

  Args:
    values: [positive_class, TP, TN, FP, FN]; positive_class is an index
      into key.
    key: Sequence or array mapping a class index to its display name.

  Returns:
    None. The class legend and the table are written to stdout.
  """
  positive_class = key[int(values[0])]
  # Filter the names rather than masking them, so no empty placeholder (and the
  # stray whitespace it causes) ends up in the joined string. This also works
  # when key is a plain list instead of an array.
  negative_classes = " ".join(str(name) for name in key if name != positive_class)
  print("positive (1) class: ", positive_class)
  print("negative (0) classes: ", negative_classes)
  # Rows are the actual class, columns are the predicted class.
  print("\t\t    predicted")
  print("\t\t1\t\t0")
  print("actual\t -------------------------------")
  print("  1\t|    TP:", values[1], "\t|    FN:", values[4], "\t|")
  print("\t|---------------|---------------|")
  print("  0\t|    FP:", values[3], "\t|    TN:", values[2], "\t|")
  print("\t -------------------------------")
  print()
  print()

'''
Print the confusion matrix values (True Positives, True Negatives, False
Positives, False Negatives).
'''
# One entry per report, in print order: (heading, true labels, features, model).
confusion_reports = (
  ("Base Classifier Model Using Training Data\n", iris_y_train, iris_X_train, clf),
  ("Classifier Model With Best Hyperparameters Using Training Data\n", iris_y_train, iris_X_train, optimal_clf),
  ("Base Classifier Model Using Testing Data\n", iris_y_test, iris_X_test, clf),
  ("Classifier Model With Best Hyperparameters Using Testing Data\n", iris_y_test, iris_X_test, optimal_clf),
)

for report_number, (heading, y_true, X, model) in enumerate(confusion_reports):
  # A starred rule separates consecutive reports; nothing follows the last one.
  if report_number > 0:
    print("****************************************************************************")
    print()
  print(heading)
  # Predict once and reuse the result for every one-vs-rest matrix.
  y_pred = model.predict(X)
  for class_index in range(len(iris.target_names)):
    print_confusion_matrix(confusion_matrix_values(y_true, y_pred, class_index), iris.target_names)


Base Classifier Model Using Training Data

positive (1) class:  setosa
negative (0) classes:   versicolor virginica
		    predicted
		1		0
actual	 -------------------------------
  1	|    TP: 40 	|    FN: 0 	|
	|---------------|---------------|
  0	|    FP: 0 	|    TN: 80 	|
	 -------------------------------


positive (1) class:  versicolor
negative (0) classes:  setosa  virginica
		    predicted
		1		0
actual	 -------------------------------
  1	|    TP: 41 	|    FN: 0 	|
	|---------------|---------------|
  0	|    FP: 0 	|    TN: 79 	|
	 -------------------------------


positive (1) class:  virginica
negative (0) classes:  setosa versicolor 
		    predicted
		1		0
actual	 -------------------------------
  1	|    TP: 39 	|    FN: 0 	|
	|---------------|---------------|
  0	|    FP: 0 	|    TN: 81 	|
	 -------------------------------


****************************************************************************

Classifier Model With Best Hyperparameters Using Training Data

positive

### Task 4: Regression with DTs

In [None]:
'''
Load a dataset suitable for regression (e.g., the Boston housing dataset from
scikit-learn).
'''
# Boston housing was retired over ethical concerns; the California housing
# data is the comparable replacement suggested in the sklearn literature.
housing = fetch_california_housing()

'''
Split the dataset into training and testing sets.
'''
housing_X, housing_y = housing.data, housing.target
# 80/20 train/test split with a fixed seed so the result is repeatable.
housing_X_train, housing_X_test, housing_y_train, housing_y_test = train_test_split(
  housing_X,
  housing_y,
  test_size = 0.2,
  random_state = 42,
)

'''
Implement a DT regression model.
'''
regressor = DecisionTreeRegressor(random_state = 42)

'''
Train the model on the training data.
'''
regressor.fit(housing_X_train, housing_y_train)

'''
Calculate and print the mean squared error (MSE) on the testing data to assess
the model’s performance.
'''
housing_test_mse = mean_squared_error(housing_y_test, regressor.predict(housing_X_test))
print("Mean Squared Error: ", housing_test_mse)

Mean Squared Error:  0.495235205629094


### Task 5: Metrics Comparison

In [None]:
'''
Compare the performance of the DT classifier from Task 1 and the DT regression
model from Task 4.
'''

'\nCompare the performance of the DT classifier from Task 1 and the DT regression\nmodel from Task 4.\n'

It is difficult to compare the classifier and the regressor directly because they were applied to two completely different datasets. Furthermore, their performance was measured with two different metrics: accuracy for the classifier and mean squared error for the regressor. I can point out that, on the testing data, the classifier (both the base classifier and the classifier using the best hyperparameters*) was 100% accurate, whereas the regressor's predictions were not error-free. In this sense, perhaps the classifier performed "better," but in reality, the classifier and the regressor are simply meant for different situations.

*Note that the classifier using the best hyperparameters actually performed worse on the training data than the base classifier did. This may be indicative of overfitting in the base classifier.

In [None]:
'''
Calculate and print relevant evaluation metrics for the classifier (e.g.,
accuracy, precision, recall, F1-score) and the regression model (e.g., MSE).
'''
def clf_evaluation_metrics(values, key):
  """Print precision, recall and F1-score for one class treated as positive.

  A metric whose denominator is zero (for example, precision when the class
  was never predicted) is reported as 0.0 instead of raising.

  Args:
    values: [positive_class, TP, TN, FP, FN]; positive_class is an index
      into key.
    key: Sequence or array mapping a class index to its display name.

  Returns:
    None. The class name and one line of metrics are written to stdout.
  """
  positive_class = key[int(values[0])]
  TP, FP, FN = values[1], values[3], values[4]

  predicted_positive = TP + FP
  actual_positive = TP + FN
  # Guard each division: an empty denominator means the metric is undefined.
  precision = TP / predicted_positive if predicted_positive else 0.0
  recall = TP / actual_positive if actual_positive else 0.0
  f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

  print(positive_class)
  print("Precision: %.2f; Recall: %.2f; F1-Score: %.2f" % (precision, recall, f1))

print("Classifier Metrics")
print()

# Predict once per model and reuse the result for accuracy and for the
# per-class precision/recall/F1 lines.
base_test_predictions = clf.predict(iris_X_test)
print("Base classifier")
print("Accuracy: ", accuracy_score(iris_y_test, base_test_predictions))
for class_index, _ in enumerate(iris.target_names):
  clf_evaluation_metrics(confusion_matrix_values(iris_y_test, base_test_predictions, class_index), iris.target_names)
print()
print()

tuned_test_predictions = optimal_clf.predict(iris_X_test)
print("Classifier using best hyperparameters")
print("Accuracy: ", accuracy_score(iris_y_test, tuned_test_predictions))
for class_index, _ in enumerate(iris.target_names):
  clf_evaluation_metrics(confusion_matrix_values(iris_y_test, tuned_test_predictions, class_index), iris.target_names)
print()
print("****************************************************************************")
print()

# Regression metrics, all computed on the held-out housing test split.
housing_test_predictions = regressor.predict(housing_X_test)
print("Regressor Metrics")
print()
print("MSE: ", mean_squared_error(housing_y_test, housing_test_predictions))
print("Max Error: ", max_error(housing_y_test, housing_test_predictions))
print("R-squared: ", r2_score(housing_y_test, housing_test_predictions))

Classifier Metrics

Base classifier
Accuracy:  1.0
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
virginica
Precision: 1.00; Recall: 1.00; F1-Score: 1.00


Classifier using best hyperparameters
Accuracy:  1.0
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
virginica
Precision: 1.00; Recall: 1.00; F1-Score: 1.00

****************************************************************************

Regressor Metrics

MSE:  0.495235205629094
Max Error:  4.100009999999999
R-squared:  0.622075845135081


In [None]:
'''
Discuss the results, including which model performed better and why.
'''

'\nDiscuss the results, including which model performed better and why.\n'

Again, it is difficult to directly compare the models because they are used for different purposes, on different datasets, and are evaluated using different metrics. The classifiers have 100% accuracy when predicting the testing data, so they could be said to be better than the regressor. However, the data that is being predicted is simply different. The regressor is used to predict data that is continuous (the value of houses in California), which is more complicated and far harder to predict with absolutely no error. The classifier need only "choose" between 3 options (iris species).

I will say that the regressor's R-squared value of about 0.62 is not very good: it suggests that only about 62% of the variation in the housing values is explained by the regressor model, while the rest of the variation is left unexplained (error). There is still plenty of room to improve both the regressor model and the classifier model.

# Part 2: Support Vector Machine (SVM) - Linear and w/ RBF Kernel

In [None]:
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Task 1: Linear Support Vector Machine

In [None]:
'''
Load the Iris dataset.
'''
iris = load_iris()

'''
Split the data into training and testing sets.
'''
iris_X, iris_y = iris.data, iris.target
# 80/20 train/test split with a fixed seed so the result is repeatable.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
  iris_X,
  iris_y,
  test_size = 0.2,
  random_state = 42,
)

'''
Implement a Linear Support Vector Machine (SVM) classifier using scikit-learn.
'''
linear_clf = SVC(kernel = "linear", random_state = 42)

'''
Train the LSVM model on the training data.
'''
linear_clf.fit(iris_X_train, iris_y_train)

'''
Evaluate the LSVM model’s performance on the test data and report accuracy.
'''
# Accuracy is scaled to a percentage before it is formatted.
linear_test_accuracy = accuracy_score(iris_y_test, linear_clf.predict(iris_X_test)) * 100
print("Accuracy of LSVM: %.2f" % linear_test_accuracy, "%")

Accuracy of LSVM: 100.00 %


### Task 2: Support Vector Machine (SVM) with RBF Kernel

In [None]:
'''
Load the Iris dataset.
'''
iris = load_iris()

'''
Split the data into training and testing sets.
'''
iris_X, iris_y = iris.data, iris.target
# 80/20 train/test split with a fixed seed so the result is repeatable.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
  iris_X,
  iris_y,
  test_size = 0.2,
  random_state = 42,
)

'''
Implement a Support Vector Machine (SVM) classifier with an RBF kernel using
scikit-learn.
'''
rbf_clf = SVC(kernel = "rbf", random_state = 42)

'''
Train the SVM model with the RBF kernel on the training data.
'''
rbf_clf.fit(iris_X_train, iris_y_train)
'''
Evaluate the SVM model’s performance on the test data and report accuracy.
'''
# Accuracy is scaled to a percentage before it is formatted.
rbf_test_accuracy = accuracy_score(iris_y_test, rbf_clf.predict(iris_X_test)) * 100
print("Accuracy of RBF SVM: %.2f" % rbf_test_accuracy, "%")


Accuracy of RBF SVM: 100.00 %


### Task 3: Hyperparameter Tuning for SVM with RBF Kernel

In [None]:
'''
Perform hyperparameter tuning for the SVM with an RBF kernel. Search for optimal
values of hyperparameters such as C and γ using Random Search.
'''
# Thirteen log-spaced candidates each: C from 1e-2 to 1e10, gamma from 1e-9 to 1e3.
C = np.logspace(-2, 10, 13)
gamma = np.logspace(-9, 3, 13)
random_grid = {"C" : C, "gamma" : gamma}

# Sample 50 (C, gamma) pairs and score each with 5-fold cross-validation on the
# training split.
optimal_rbf_clf = RandomizedSearchCV(
  estimator = rbf_clf,
  param_distributions = random_grid,
  n_iter = 50,
  cv = 5,
  random_state = 42,
)
optimal_rbf_clf.fit(iris_X_train, iris_y_train)

'''
Report the best hyperparameters for the SVM with the RBF kernel.
'''
print("Best hyperparameters: ", optimal_rbf_clf.best_params_)


'''
Train a new SVM model with the best hyperparameters and evaluate its performance
on the test data.
'''
tuned_rbf_test_accuracy = accuracy_score(iris_y_test, optimal_rbf_clf.predict(iris_X_test)) * 100
print("Accuracy of RBF SVM using best hyperparameters: %.2f" % tuned_rbf_test_accuracy, "%")

Best hyperparameters:  {'gamma': 1e-08, 'C': 10000000000.0}
Accuracy of RBF SVM using best hyperparameters: 100.00 %


### Task 4: Metrics Comparison

In [None]:
'''
Calculate and compare relevant evaluation metrics (e.g., accuracy, precision,
recall, F1-score) for the LSVM from Task 1 and the SVM with an RBF kernel from
Task 2.
'''
# confusion_matrix_values and clf_evaluation_metrics are the helpers defined in
# Part 1 of this notebook.
svm_models = (("Linear SVM", linear_clf), ("RBF SVM", rbf_clf))
data_splits = (
  ("Training Data", iris_X_train, iris_y_train),
  ("Testing Data", iris_X_test, iris_y_test),
)

for split_number, (split_title, X, y_true) in enumerate(data_splits):
  # A starred rule, padded by blank lines, separates the two splits.
  if split_number > 0:
    print()
    print("*******************************************************************")
    print()
  print(split_title)
  print()

  for model_number, (model_title, model) in enumerate(svm_models):
    # Two blank lines separate the models within a split.
    if model_number > 0:
      print()
      print()
    # Predict once and reuse the result for accuracy and per-class metrics.
    y_pred = model.predict(X)
    print(model_title)
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    for class_index in range(len(iris.target_names)):
      clf_evaluation_metrics(confusion_matrix_values(y_true, y_pred, class_index), iris.target_names)

Training Data

Linear SVM
Accuracy:  0.975
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 0.97; Recall: 0.95; F1-Score: 0.96
virginica
Precision: 0.95; Recall: 0.97; F1-Score: 0.96


RBF SVM
Accuracy:  0.975
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 0.97; Recall: 0.95; F1-Score: 0.96
virginica
Precision: 0.95; Recall: 0.97; F1-Score: 0.96

*******************************************************************

Testing Data

Linear SVM
Accuracy:  1.0
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
virginica
Precision: 1.00; Recall: 1.00; F1-Score: 1.00


RBF SVM
Accuracy:  1.0
setosa
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
versicolor
Precision: 1.00; Recall: 1.00; F1-Score: 1.00
virginica
Precision: 1.00; Recall: 1.00; F1-Score: 1.00


In [None]:
'''
Not graded: Discuss the differences in performance and characteristics between
these models.
'''

'\nNot graded: Discuss the differences in performance and characteristics between\nthese models.\n'

Based on their evaluation metrics, neither classifier performed better than the other on the training data or the testing data. The iris dataset is very simple, so nothing more complex than a linear SVM is required. However, the more complex RBF SVM can also classify the data.

The small value of gamma (determined to be one of the best hyperparameters) suggests that each of the training samples has a large, far-reaching influence on the model. The large value of C (again, determined to be one of the best hyperparameters) suggests that the SVM classifier's margin is small. The smaller margin means that each training point is more accurately classified. Even though the large C makes the model more complex, because the dataset is so small, classification of the test data is not too cumbersome.