## ${\color{hotpink} \text{Imports}}$

In [1]:
# for data manipulation
import glob
from pathlib import Path
import numpy as np
from PIL import Image
from sklearn.preprocessing import LabelEncoder # needed for y values for knn to prevent ValueError when fitting

# for preprocessing of data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# all of the classifier methods from the labs (only required 3 of 5)
# from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# from sklearn.model_selection import KFold
# from sklearn.naive_bayes import GaussianNB

# for finding the best hyperparameters
from sklearn.model_selection import GridSearchCV


# for evaluation of the models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## ${\color{hotpink} \text{Load the dataset into X}}$
${\color{lightgreen} \text{As the dataset is structured similarly to the digits dataset from lab 5, the same method of preparing the data will be used.}}$  
Referenced lab for data extraction  
https://github.com/Kevin-2002/Gesture_Based_UI_Development/blob/main/week5/digits.ipynb

In [2]:
# Collect the path of every image; each class has its own sub-folder under ./fashion
image_filepath = glob.glob('./fashion/*/*.png')

# glob returns matches in arbitrary order, so sort to get a deterministic,
# reproducible ordering (files end up grouped by class folder)
image_filepath.sort()

# Fail early with a clear message rather than with a confusing reshape error below
if not image_filepath:
    raise FileNotFoundError("No images matched the pattern './fashion/*/*.png'")

# Read the pixels of one image at a time. Image.open is lazy and keeps the
# file handle open, so the context manager closes each file as soon as its
# pixels have been copied into an array, instead of holding every file open at once.
pixel_arrays = []
for path in image_filepath:
    with Image.open(path) as img:
        pixel_arrays.append(np.array(img))

# Stack into one 3-dimensional array: (n_images, height, width)
# note: the original dataset gives shape=(70000, 28, 28), dtype=uint8
X = np.stack(pixel_arrays)

# Flatten every image into a single feature row: (n_images, height * width).
# The sizes are derived from the data rather than hard-coded.
X = X.reshape(len(X), -1)

## ${\color{hotpink} \text{Get the y values for each X}}$
${\color{lightgreen} \text{Which is the folder/label it is under.}}$

In [3]:
# The label of each image is the name of the folder that contains it
# (./fashion/<label>/<file>.png). Taking the parent folder name via pathlib
# avoids depending on the "/" separator or on a fixed directory depth.
y = np.array([Path(image).parent.name for image in image_filepath])
y

array(['ankleboot', 'ankleboot', 'ankleboot', ..., 'tshirt-top',
       'tshirt-top', 'tshirt-top'], shape=(70000,), dtype='<U10')

## ${\color{hotpink} \text{Preprocess the data}}$
>${\color{lightgreen} \text{Usually preprocessing images includes grayscaling and}}$  
>${\color{lightgreen} \text{resizing images to prepare data.}}$  
>${\color{lightgreen} \text{Those steps were already completed in the initial DataSet.}}$  

${\color{lightgreen} \text{Some models require normalization, also known as data scaling. SVM is an example.}}$  

In [4]:
# Normalize the pixel values to the range [0, 1].
# According to the documentation 255 is the max value for a pixel (white), so we just divide by 255
# https://pillow.readthedocs.io/en/stable/reference/Image.html
#
# This cell rebinds X, so the guard keeps it idempotent: the raw pixel data has
# an integer dtype, while the scaled data is floating point. Re-running the
# cell therefore does not divide by 255 a second time.
if np.issubdtype(X.dtype, np.integer):
    X = np.divide(X, 255)



## ${\color{hotpink} \text{Split data into train and test sets}}$
${\color{lightgreen} \text{No separate validation split is needed, as cross-validation will be used instead.}}$  

In [5]:
# Hold out 20% of the samples for testing; the fixed seed (from G00 401808)
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=401808,
)

## ${\color{hotpink} \text{Find optimal hyperparameters via GridSearchCV}}$


In [6]:
# K-Nearest Neighbors parameter search
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# Only 'distance' weighting and the 'manhattan' metric are searched, which
# keeps the grid at three candidates.
param_grid_knn = {
    'n_neighbors': [3, 5, 7],  # number of neighbors
    'weights': ['distance'],  # weighting scheme ('uniform' is not searched)
    'metric': ['manhattan']  # distance metric ('euclidean' is not searched)
}

# Support Vector Machine parameter search
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
param_grid_svc = {
    'C': [0.1, 1, 10, 100],  # regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001],  # kernel coefficient
    'kernel': ['rbf', 'poly', 'sigmoid']  # kernel type
}

# Logistic Regression parameter search
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# 'liblinear' is deliberately not searched: per the scikit-learn documentation
# it only handles binary classification by default, while the labels in this
# notebook are multiclass (one label per clothing folder).
param_grid_logistic = {
    'C': [0.1, 1, 10, 100],  # regularization parameter
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],  # optimization algorithm
    'max_iter': [100, 1000, 10000]  # maximum number of iterations
}

In [7]:
# Encode the string class names as integer codes for fitting.
# The encoder is fitted on the training labels only; use
# label_encoder.transform(...) on any other labels that are compared against
# predictions, and label_encoder.inverse_transform(...) to get names back.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# instantiate the classifiers with default settings; GridSearchCV overrides
# the hyperparameters that are being searched
knn_model = KNeighborsClassifier()
svc_model = SVC()
# The 'sag' and 'saga' solvers shuffle the data, so a fixed seed (the same one
# used for the train/test split) keeps the logistic-regression results reproducible.
logistic_model = LogisticRegression(random_state=401808)

${\color{lightgreen} \text{Run the GridSearchCV method now that it is set up.}}$  

In [8]:
# 5-fold cross-validated grid search over the KNN candidates;
# n_jobs=-1 runs the fits in parallel on all available processors.
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    cv=5,
    n_jobs=-1,
)


In [9]:

# Fit every KNN candidate under cross-validation on the training split.
# This is slow on the full training set (the recorded run was interrupted).
knn_grid.fit(X=X_train, y=y_train_encoded)

KeyboardInterrupt: 