<a href="https://colab.research.google.com/github/Matt-Muscedere/Projects/blob/main/COMP_3710_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Global Variables

In [None]:
import sklearn
import pandas as pd
import requests
import io
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets

# Names of the 22 mushroom attributes, in the order listed here.
attribute_names = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# @article{scikit-learn,
#  title={Scikit-learn: Machine Learning in {P}ython},
#  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
#          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
#          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
#          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
#  journal={Journal of Machine Learning Research},
#  volume={12},
#  pages={2825--2830},
#  year={2011}
# }


# Preparing the Data

## Importing the training data csv file into Python

In [None]:
# Show the attribute names as one comma-separated line.
print(', '.join(attribute_names))

# Download the header-less CSV and turn it into a plain list of rows.
data_url = "https://raw.githubusercontent.com/kkatanaga/UW-Coursework/master/COMP%203710/agaricus-lepiota.data"
data_list = pd.read_csv(data_url, header=None).values.tolist()

# Split every row: the first column becomes the target (y_data), the
# remaining columns become the features (X_data).
# NOTE(review): the feature columns presumably line up with attribute_names — confirm.
y_data = [record[0] for record in data_list]
X_data = [record[1:] for record in data_list]

## Converting training data into sklearn-readable data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# 80:20 train/test split; the fixed random_state keeps the split reproducible.
raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=22)

# Encode X: one-hot encode the categorical features. The encoder is fitted on
# the training split only. handle_unknown='ignore' makes a category that shows
# up only in the test split encode as all zeros for that feature instead of
# raising a ValueError in transform().
one_hot_enc = OneHotEncoder(handle_unknown='ignore')
X_train = one_hot_enc.fit_transform(raw_X_train).toarray()
X_test = one_hot_enc.transform(raw_X_test).toarray()

# Encode y: map the class labels to integers, again fitted on the training split.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(raw_y_train)
y_test = label_enc.transform(raw_y_test)

# Preview the raw rows next to their encoded form. Only the first few raw rows
# are printed: raw_X_train is a plain Python list, so printing it whole would
# dump every training row into the cell output.
print(raw_X_train[:5])
print(X_train)

## Calculate the ratio of rank 1 instances among the grid search results

In [None]:
def rank_one_ratio(ranking):
    """Print how many entries of ``ranking`` are equal to 1, and their share.

    Parameters
    ----------
    ranking : sequence
        Rank values, e.g. ``cv_results_['rank_test_score']`` from a grid
        search. Must support ``len()`` and iteration.

    Returns
    -------
    None
        The three summary lines are printed, nothing is returned.

    Notes
    -----
    An empty ``ranking`` reports a ratio of ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    total_count = len(ranking)
    rank_one_count = sum(1 for rank in ranking if rank == 1)
    # Guard the division so an empty ranking does not crash the cell.
    ratio = rank_one_count / total_count if total_count else float('nan')
    print(f'Number of rank 1 instances: {rank_one_count}')
    print(f'Total instances: {total_count}')
    print(f'Rank 1 ratio: {ratio}')

# Functions & Training

## Randomized Trees

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Baseline: a small forest with otherwise default hyperparameters.
# random_state is fixed so repeated runs build the same forest.
random_forest_model = RandomForestClassifier(n_estimators=10, random_state=22)
print(random_forest_model.get_params())
random_forest_model.fit(X_train, y_train)

# Evaluate the baseline on the held-out test split (micro-averaged F1).
y_predict = random_forest_model.predict(X_test)
print("Accuracy:", f1_score(y_test, y_predict, average='micro'))
print(y_test)
print(y_predict)

# Hyperparameter grid explored by the search below.
params = {
    "n_estimators": [1, 3, 10, 100],
    "max_features": [1, 3, 10, None],
    "max_depth": [1, 3, 10, None],
    "max_leaf_nodes": [2, 5, 10, None],
    "bootstrap": [True, False]
}

# Seed the searched forest as well, so the ranking of candidates is repeatable.
# cv=5 is spelled out to match the other grid searches in this notebook.
random_forest_model2 = RandomForestClassifier(random_state=22)
search = GridSearchCV(random_forest_model2, params, cv=5, scoring='accuracy')
search.fit(X_train, y_train)

print(search.best_estimator_)
print(search.best_score_)
print(search.best_params_)

# GridSearchCV (refit=True by default) has already refitted the best candidate
# on the whole training split, so best_estimator_ can be used directly without
# another fit() call.
best_random_forest = search.best_estimator_

# Predict on the test set using the best classifier
predictions = best_random_forest.predict(X_test)

# Score the tuned model on the test split (micro-averaged F1)
accuracy = f1_score(y_test, predictions, average='micro')
print("Accuracy: ", accuracy)
rank_one_ratio(search.cv_results_['rank_test_score'])







## Decision Trees

In [None]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import export_text

# Baseline: a shallow, entropy-based decision tree.
# random_state is fixed so repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=22)

clf = clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

print(y_predict)
print(y_test)
print("Accuracy:", f1_score(y_test, y_predict, average='micro'))

# Define the hyperparameters to tune
params = {
    "criterion": ['gini', 'entropy'],
    "max_depth": [1, 2, 3, 4, 5, 10, 20, 25, 30],
    "min_samples_split": [2, 3, 4, 5, 10, 20, 30],
    "min_samples_leaf": [1, 2, 3, 4, 5, 10, 20]
}

# Use GridSearchCV to find the best hyperparameters. Each candidate is a clone
# of clf, so it inherits the fixed random_state.
grid_search = GridSearchCV(clf, params, cv=5, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameter using gridsearch are ", grid_search.best_params_)
print(grid_search.best_estimator_)
print("Best score: ", grid_search.best_score_)

# GridSearchCV (refit=True by default) has already refitted the best candidate
# on the whole training split, so it is reused here instead of rebuilding and
# refitting a classifier from best_params_.
best_clf = grid_search.best_estimator_

# Predict on the test set using the best classifier
y_pred = best_clf.predict(X_test)

# Score the tuned model on the test split (micro-averaged F1)
accuracy = f1_score(y_test, y_pred, average='micro')
print("Accuracy: ", accuracy)

# Print the learned decision rules as text
r = export_text(best_clf)
print("textual format:")
print(r)

rank_one_ratio(grid_search.cv_results_['rank_test_score'])


## Nearest Neighbors

In [None]:
# The KNeighborsClassifier class specifically implements the k-nearest neighbors algorithm for classification

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np


# Estimator whose hyperparameters are tuned below
knn = KNeighborsClassifier()

# Candidate values tried for each hyperparameter
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

# Report the winning configuration and its cross-validation score
best_knn = grid.best_estimator_
print("Best hyperparameters using grid search are ", grid.best_params_)
print("Best model found: ", best_knn)
print("Best score: ", grid.best_score_)

# Predict the test split with the best model
y_pred = best_knn.predict(X_test)

# Show predictions next to the ground truth
print("Predicted labels: ")
print(np.array2string(y_pred, separator=', '))
print("True labels: ")
print(np.array2string(y_test, separator=', '))

# Test-split score of the best model, then the share of rank-1 candidates
accuracy = best_knn.score(X_test, y_test)
print("Accuracy: ", accuracy)
rank_one_ratio(grid.cv_results_['rank_test_score'])

#####
# NearestNeighbors returns indices of the nearest neighbors for each sample in the input data

# from sklearn.neighbors import NearestNeighbors
# import numpy as np
# nn_model = NearestNeighbors(n_neighbors=3)
# nn_model.fit(X_train)
# neighbors = nn_model.kneighbors(X_test, return_distance=False)
# y_predict = np.array([np.bincount(y_train[neighbor]).argmax() for neighbor in neighbors])
# print(y_predict)

#####

# # Define the classifier with desired hyperparameters
# knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')

# # Train the classifier using the training data
# knn.fit(X_train, y_train)

# # Use the trained classifier to predict labels for the test data
# y_pred = knn.predict(X_test)

# # Compute the F1 score to evaluate the performance of the classifier
# f1 = f1_score(y_test, y_pred, average='weighted')
# print('F1 score:', f1)


## SVM

In [None]:
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt

# Base estimator for the search. It gets its own name so the imported `svm`
# module is not rebound to an estimator instance.
svm_model = svm.SVC()

# Hyperparameter grid explored by the search below.
# NOTE: per the SVC documentation, `degree` only affects the 'poly' kernel and
# `gamma` is not used by the 'linear' kernel, so some grid points are
# equivalent models.
svm_parameters = {'C': [0.5, 1.0, 2.0, 10.0, 100.0], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],'degree': [2, 3, 4],'gamma': ['scale', 'auto'], 'cache_size': [1000]}

# 5-fold cross-validated grid search, scored by accuracy
svm_grid = GridSearchCV(svm_model, svm_parameters, cv=5, scoring='accuracy')
svm_grid.fit(X_train, y_train)

print(f"Best model: {svm_grid.best_estimator_}" )
print(f"Mean score of best model: {svm_grid.best_score_}")

# Evaluate the refitted best model on the held-out test split
svm_y_predict = svm_grid.best_estimator_.predict(X_test)

# score() of the best estimator on the test split
svm_score = svm_grid.best_estimator_.score(X_test, y_test)
print(f"average score: {svm_score}")

# F1 with f1_score's default averaging
svm_f1_score = f1_score(y_test, svm_y_predict)
print(f"f1 score: {svm_f1_score}")

# Micro-averaged F1
svm_f1_micro = f1_score(y_test, svm_y_predict, average='micro')
print(f"f1 micro score: {svm_f1_micro}")

##################################

# svm_parameters = {'C': [0.5, 1.0, 2.0, 10.0, 100.0], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],'degree': [2, 3, 4],'gamma': ['scale', 'auto'], 'cache_size': [1000]}
# Best model: SVC(C=0.5, cache_size=1000, degree=2, kernel='linear')
# Mean score of best model: 1.0
# average score: 1.0
# f1 score: 1.0
# f1 micro score: 1.0

# svm_parameters = {'C': [0.5, 1.0, 2.0, 10.0, 100.0], 'kernel': ['poly', 'rbf', 'sigmoid'],'degree': [2, 3, 4],'gamma': ['scale', 'auto'], 'cache_size': [1000]}
# Best model: SVC(C=0.5, cache_size=1000, kernel='poly')
# Mean score of best model: 1.0
# average score: 1.0
# f1 score: 1.0
# f1 micro score: 1.0

# svm_parameters = {'C': [0.5, 1.0, 2.0, 10.0, 100.0], 'kernel': ['rbf'],'gamma': ['scale', 'auto'], 'cache_size': [1000]}
# Best model: SVC(cache_size=1000)
# Mean score of best model: 1.0
# average score: 1.0
# f1 score: 1.0
# f1 micro score: 1.0

# svm_parameters = {'C': [0.5, 1.0, 2.0, 10.0, 100.0], 'kernel': ['sigmoid'],'gamma': ['scale', 'auto'], 'cache_size': [1000]}
# Best model: SVC(C=100.0, cache_size=1000, gamma='auto', kernel='sigmoid')
# Mean score of best model: 1.0
# average score: 1.0
# f1 score: 1.0
# f1 micro score: 1.0

# svm_parameters = {'C': [0.1, 0.25, 0.5, 0.75, 1.0], 'kernel': ['linear'], 'cache_size': [1000]}
# Best model: SVC(C=0.25, cache_size=1000, kernel='linear')
# Mean score of best model: 1.0
# average score: 1.0
# f1 score: 1.0
# f1 micro score: 1.0

# svm_parameters = {'C': [0.1], 'kernel': ['linear'], 'cache_size': [1000]}
# Best model: SVC(C=0.1, cache_size=1000, kernel='linear')
# Mean score of best model: 0.9992307692307693
# average score: 0.9987692307692307
# f1 score: 0.998689384010485
# f1 micro score: 0.9987692307692307

#
