## ${\color{hotpink} \text{Imports}}$

In [13]:
# for data manipulation
import glob
from pathlib import Path
import numpy as np
from PIL import Image
from sklearn.preprocessing import LabelEncoder # needed for y values for knn to prevent ValueError when fitting

# for preprocessing of data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# all of the classifier methods from the labs (only required 3 of 5)
# from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import KFold
# from sklearn.naive_bayes import GaussianNB

# for finding the best hyperparameters
from sklearn.model_selection import GridSearchCV


# for evaluation of the models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## ${\color{hotpink} \text{Load the dataset into X}}$
${\color{lightgreen} \text{As the dataset is structured similarly to the digits dataset from lab 5, the same method of preparing the data is used.}}$  
Referenced lab for data extraction  
https://github.com/Kevin-2002/Gesture_Based_UI_Development/blob/main/week5/digits.ipynb

In [14]:
# Load the image files of the selected categories into one uint8 array and
# flatten every image into one feature row of X.

# Sub-folders of ./fashion to load. Only a portion of the data is used to keep
# the computational time down; the whole dataset would be
# glob.glob('./fashion/*/*.png').
categories = ['bag', 'coat']

# Every image is expected to be a single-channel 28x28 picture.
image_shape = (28, 28)

# Get all of the filepaths into a list, sorted so the order is deterministic
image_filepath = []
for category in categories:
    image_filepath.extend(glob.glob(f'./fashion/{category}/*.png'))
image_filepath.sort()

total_images = len(image_filepath)
if total_images == 0:
    # Fail here with a clear message instead of later in the notebook, where
    # an empty dataset would produce a much less obvious error.
    raise FileNotFoundError(
        "No images found under ./fashion/<category>/*.png - "
        "check the working directory and the dataset location."
    )

# Pre-allocate the result once. Images are copied in one at a time, so only a
# single decoded image is held in memory in addition to this array.
all_images = np.zeros((total_images, *image_shape), dtype=np.uint8)

for index, img_path in enumerate(image_filepath):
    with Image.open(img_path) as img:
        # Copy the pixels into an array before the file is closed
        pixels = np.array(img)
    if pixels.shape != image_shape:
        # Name the offending file rather than failing on the assignment below
        raise ValueError(
            f"{img_path}: expected an image of shape {image_shape}, "
            f"got {pixels.shape}"
        )
    all_images[index] = pixels

# Flatten each 28x28 image into a row of 784 pixel values
X = all_images.reshape(total_images, image_shape[0] * image_shape[1])


## ${\color{hotpink} \text{Get the y values for each X}}$
${\color{lightgreen} \text{The label of each image is the name of the folder it is stored in.}}$

In [15]:
# The label of each image is the name of the folder that contains it
# (e.g. './fashion/bag/123.png' -> 'bag'). Taking the parent folder name,
# instead of splitting on "/" at a fixed index, also works when glob returns a
# backslash before the file name (Windows) or the dataset root sits at a
# different depth.
y = np.array([Path(image).parent.name for image in image_filepath])
y

array(['bag', 'bag', 'bag', ..., 'coat', 'coat', 'coat'], dtype='<U4')

## ${\color{hotpink} \text{Preprocess the data}}$
>${\color{lightgreen} \text{Usually preprocessing images includes grayscaling and}}$  
>${\color{lightgreen} \text{resizing images to prepare data.}}$  
>${\color{lightgreen} \text{Those steps were already completed in the initial DataSet.}}$  

${\color{lightgreen} \text{Some models require normalization, also known as data scaling. SVM is an example.}}$  

In [16]:
# normalize the data
# according to the documentation 255 is the max value for the pixel (white), so we just divide by 255
# https://pillow.readthedocs.io/en/stable/reference/Image.html
# X is rebuilt from the raw uint8 images instead of being rescaled in place, so
# running this cell more than once cannot divide the pixels by 255 twice.
X = all_images.reshape(len(all_images), 28 * 28) / 255

# encode string labels to avoid the ValueError
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Only encode while y still holds the string labels: on a re-run y is already
# numeric, and fitting again would replace the class names stored in
# label_encoder.classes_ with plain integers.
if y.dtype.kind in ('U', 'S', 'O'):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

## ${\color{hotpink} \text{Split data into train and test sets}}$
${\color{lightgreen} \text{There is no need for a separate validation split, as cross-validation will be used instead.}}$  

In [17]:
# Hold out 20% of the samples as the test set. The seed is fixed (G00 401808)
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=401808,
)

## ${\color{hotpink} \text{Find optimal hyperparameters via GridSearchCV}}$
${\color{lightgreen} \text{The search grids are kept small because of the expensive runtime cost.}}$  

------

## ${\color{hotpink} \text{KNN Gridsearch}}$

In [18]:
# Search space for the K-nearest-neighbours classifier:
# the only tuned hyperparameter is the number of neighbours.
param_grid_knn = dict(n_neighbors=[3, 5, 7])

In [19]:
# Base KNN estimator and the grid search that tries every value in param_grid_knn
knn_model = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator=knn_model, param_grid=param_grid_knn)

In [20]:
# Run the KNN grid search on the training split only
knn_grid.fit(X=X_train, y=y_train)

In [21]:
# Show the hyperparameter combination that scored best in the KNN search
best_knn_params = knn_grid.best_params_
print(best_knn_params)

{'n_neighbors': 3}


------

## ${\color{hotpink} \text{RandomForest Gridsearch}}$

In [22]:
# Search space for the random forest classifier.
# Only a few values per hyperparameter are compared: an earlier, larger search
# ran for 30 minutes without producing output, so its computational cost was
# too high.
param_grid_RandomForestClassifier = dict(
    n_estimators=[50, 100],   # number of trees in the forest
    max_depth=[10, 20, 30],   # maximum depth of each tree
)

In [23]:
# Base random forest (seeded for reproducibility) and the grid search that
# tries every combination in param_grid_RandomForestClassifier
RandomForestClassifier_model = RandomForestClassifier(random_state=401808)
RandomForestClassifier_grid = GridSearchCV(
    estimator=RandomForestClassifier_model,
    param_grid=param_grid_RandomForestClassifier,
)

In [24]:
# Run the random forest grid search on the training split only
RandomForestClassifier_grid.fit(X=X_train, y=y_train)

In [25]:
# Show the hyperparameter combination that scored best in the random forest search
best_random_forest_params = RandomForestClassifier_grid.best_params_
print(best_random_forest_params)

{'max_depth': 30, 'n_estimators': 50}


------

## ${\color{hotpink} \text{Logistic regression Gridsearch}}$

In [31]:
# Search space for logistic regression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
param_grid_logistic = dict(
    C=[0.1, 1, 10],                  # inverse of regularization strength
    solver=['saga', 'liblinear'],    # algorithm used for the optimization problem
)

In [32]:
# Base logistic regression (seeded, with a high iteration cap) and the grid
# search that tries every combination in param_grid_logistic
logistic_model = LogisticRegression(
    random_state=401808,
    max_iter=10000,
)
logistic_model_grid = GridSearchCV(
    estimator=logistic_model,
    param_grid=param_grid_logistic,
)

In [33]:
# Run the logistic regression grid search on the training split only
logistic_model_grid.fit(X=X_train, y=y_train)

In [34]:
# Show the hyperparameter combination that scored best in the logistic regression search
best_logistic_params = logistic_model_grid.best_params_
print(best_logistic_params)

{'C': 0.1, 'solver': 'saga'}


------

In [None]:
# instantiate the classifiers
# Each model is built with the best hyperparameters found by its grid search,
# so the tuning work is actually used instead of falling back to the library
# defaults. The seeds match the ones used during the searches.
knn_model = KNeighborsClassifier(**knn_grid.best_params_)
RandomForestClassifier_model = RandomForestClassifier(
    random_state=401808,
    **RandomForestClassifier_grid.best_params_,
)
# max_iter is kept at the same value as in the grid search, so the final model
# gets the same iteration budget as the one that was tuned.
logistic_model = LogisticRegression(
    random_state=401808,
    max_iter=10000,
    **logistic_model_grid.best_params_,
)