## ${\color{hotpink} \text{Imports}}$

In [17]:
# for data manipulation
import glob
import numpy as np
from PIL import Image
from sklearn.preprocessing import LabelEncoder # needed for y values for knn to prevent ValueError when fitting

# for preprocessing of data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# all of the classifier methods from the labs (only required 3 of 5)
# from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# from sklearn.model_selection import KFold
# from sklearn.naive_bayes import GaussianNB

# for finding the best hyperparameters
from sklearn.model_selection import GridSearchCV


# for evaluation of the models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## ${\color{hotpink} \text{Load the dataset into X}}$
${\color{lightgreen} \text{As the dataset is structured similar to the digits dataset from lab 5, the same method of preparing the data will be used}}$  
Referenced lab for data extraction  
https://github.com/Kevin-2002/Gesture_Based_UI_Development/blob/main/week5/digits.ipynb

In [18]:
# get all of the filepaths into a list
# image_filepath = glob.glob('./fashion/*/*.png')

image_filepath = glob.glob('./fashion/bag/*.png')
image_filepath.extend(glob.glob('./fashion/coat/*.png'))

# read them in the proper order
image_filepath.sort()

# output so large that it does not print but if you copy the blank 
# output and paste it into a text editor you will see the list of 
# filepaths have been stored correctly in order
# print(image_filepath)

# read in the images into a 3-dimensional array
X = np.array([Image.open(image) for image in image_filepath])

# note: shape=(70000, 28, 28), dtype=uint8)
# turn the array into a 2-dimensional array
X = X.reshape(14000, 28*28)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(14000, 784), dtype=uint8)

## ${\color{hotpink} \text{Get the y values for each X}}$
${\color{lightgreen} \text{Which is the folder/label it is under.}}$

In [19]:
# . is at split index 0, fashion is at 1 and the category is at 2
y = np.array([image.split("/")[2] for image in image_filepath])
y

array(['bag', 'bag', 'bag', ..., 'coat', 'coat', 'coat'],
      shape=(14000,), dtype='<U4')

## ${\color{hotpink} \text{Preprocess the data}}$
>${\color{lightgreen} \text{Usually preprocessing images includes grayscaling and}}$  
>${\color{lightgreen} \text{resizing images to prepare data.}}$  
>${\color{lightgreen} \text{Those steps were already completed in the initial DataSet.}}$  

${\color{lightgreen} \text{Some models require normalization, also known as data scaling. SVM is an example.}}$  

In [20]:
# normalize the data
# according to the documentation 255 is the max value for the pixel (white), so we just divide by 255
# https://pillow.readthedocs.io/en/stable/reference/Image.html  
X = np.divide(X, 255)

# X[1]

# encode string labels to avoid the ValueError
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

## ${\color{hotpink} \text{Split data into train and test sets}}$
${\color{lightgreen} \text{No need to split another time for validation, as Cross-Validation method will be used instead.}}$  

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=401808)#G00 401808

## ${\color{hotpink} \text{Find optimal HyperParamaters, via GridSearchCV}}$
${\color{lightgreen} \text{The reason the search grid is so little is because of expensive runtime costs}}$  

------

In [22]:
# K-NearestClassifier paramaters search
# find the best amount of neighbors
param_grid_knn = {
    'n_neighbors': [3, 5, 7], # number of neighbors
}

In [23]:
knn_model = KNeighborsClassifier()
knn_grid = GridSearchCV(knn_model, param_grid_knn)

In [24]:
knn_grid.fit(X_train, y_train)

In [25]:
print(knn_grid.best_params_)

{'n_neighbors': 3}


------

In [26]:
# Support Vector Machine paramaters search
# compare 2 at a time because 30 minutes elapsed without output meaning the 
# computational cost of running all 5 types is too much
param_grid_svc = {
    'kernel': ['rbf', 'poly']  # kernel type
}

In [27]:
svc_model = SVC(random_state=401808)
svc_grid = GridSearchCV(svc_model, param_grid_svc)

In [28]:
svc_grid.fit(X_train, y_train)

In [29]:
print(svc_grid.best_params_)

{'kernel': 'rbf'}


------

In [30]:
# Logistic Regression paramaters search
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
param_grid_logistic = {
    
}

------

In [31]:
# change the scale for scv and logistic regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [32]:
# instantiate the classifiers
knn_model = KNeighborsClassifier()
svc_model = SVC(random_state=401808)
logistic_model = LogisticRegression(random_state=401808)

In [33]:
knn_model = knn_model.fit(X_train, y_train)

In [34]:
svc_model = svc_model.fit(X_train, y_train)

In [35]:
logistic_model = logistic_model.fit(X_train, y_train)

In [36]:
print(knn_model.score(X_train, y_train))
print(cross_val_score(knn_model, X_train, y_train).mean())

0.9894642857142857
0.9849107142857143


In [37]:
print(svc_model.score(X_train, y_train))
print(cross_val_score(svc_model, X_train, y_train).mean())

0.994375
0.9900892857142857


In [38]:
print(logistic_model.score(X_train, y_train))
print(cross_val_score(logistic_model, X_train, y_train).mean())

0.9958035714285715
0.9863392857142858
