In [1]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import cv2 as cv
from skimage.feature import hog
from skimage.transform import resize
from skimage.io import imread, imshow, imsave

np.random.seed(42)

In [3]:
import zipfile
import glob

file = glob.glob('./clothing-dataset-small-master.zip')

with zipfile.ZipFile(file[0], 'r') as zip_ref:
    zip_ref.extractall('data/')

In [None]:
for data_type in ["train", "validation", "test"]:
    folder_path = "./data/clothing-dataset-small-master/" + data_type
    
    # combine validation and train
    new_folder_path = "./data/clothing-dataset/" + ("test" if data_type == "test" else "train")

    for folder in os.listdir(folder_path):
        new_path = os.path.join(new_folder_path, folder)
        if not os.path.exists(new_path):
            os.makedirs(new_path)

        for filename in os.listdir(os.path.join(folder_path, folder)):
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                img = Image.open(image_path)
                new_image_path = os.path.join(new_folder_path, folder, filename)
                img.save(new_image_path)
                                              
                # flip all images besides t-shirts to balance the data
                if folder != "t-shirt":
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                    new_image_path = os.path.join(new_folder_path, folder, "flipped_" + filename)
                    img.save(new_image_path)

In [None]:
# Generate HOG data from above data
data_types = ["train", "test"]

for data_type in data_types:
    folder_path = "./data/clothing-dataset/" + data_type
    new_folder_path = "./data/clothing-dataset-hog/" + data_type

    for folder in os.listdir(folder_path):
        new_path = os.path.join(new_folder_path, folder)
        if not os.path.exists(new_path):
            os.makedirs(new_path)

        for filename in os.listdir(os.path.join(folder_path, folder)):
            # Open the image
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                img = cv.imread(image_path,cv.IMREAD_GRAYSCALE)
                resized_img  = cv.resize(img, (64, 128),interpolation =cv.INTER_LINEAR)
                fd, hog_image = hog(resized_img , orientations=9, pixels_per_cell=(8, 8), 
                    cells_per_block=(2, 2), visualize=True)
                new_image_path = os.path.join(new_folder_path, folder, filename)
                cv.imwrite(new_image_path, hog_image)

In [2]:
# Feature: HOG
X_train = []
y_train = []
X_test = []
y_test = []

data_types = ["train", "test"]

image = None
count = 0
for data_type in data_types:
    folder_path = "./data/clothing-dataset-hog/" + data_type

    for folder in os.listdir(folder_path):
        for filename in os.listdir(os.path.join(folder_path, folder)):
            # Open the image
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                image = imread(image_path)
                result = np.array(image).ravel()
                if data_type == "test":
                    X_test.append(result)
                    y_test.append(folder)
                else:
                    X_train.append(result)
                    y_train.append(folder)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform

# define the number of folds for cross validation
NUM_FOLDS = 5

# define param search space for knn, dt, and rf
knn_param_grid = {'n_neighbors': list(range(2, 100))}
dt_param_grid = {'max_depth': list(range(2, 100))}
rf_param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': list(range(2, 100)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}
svm_param_grid = {'svm__alpha': uniform(loc=0, scale=0.01), 
                  'svm__loss': ['hinge', 'squared_hinge'], 
                 'svm__penalty': ['l2', 'l1', 'elasticnet']}

# define models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(max_features=512)
rf = RandomForestClassifier()

# define scoring metric
f1_scorer = make_scorer(f1_score, average='micro')

In [8]:
# KNN
knn_model = RandomizedSearchCV(knn, knn_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, random_state=42)
knn_model.fit(X_train, y_train)

print("KNN:")
print("Best k: ", knn_model.best_params_)
print("Best F1 score: ", knn_model.best_score_)

knn_final = KNeighborsClassifier(n_neighbors=int(knn_model.best_params_['n_neighbors']))
knn_final.fit(X_train, y_train)
y_pred = knn_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))

KNN:
Best k:  {'n_neighbors': 2}
Best F1 score:  0.5890294874735581
Test F1 Score:  0.44653179190751446


In [9]:
# DT
dt_model = RandomizedSearchCV(dt, dt_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, random_state=42)
dt_model.fit(X_train, y_train)

print("DT:")
print("Best depth: ", dt_model.best_params_)
print("Best F1 score: ", dt_model.best_score_)

dt_final = DecisionTreeClassifier(max_depth=int(dt_model.best_params_['max_depth']), max_features=512)
dt_final.fit(X_train, y_train)
y_pred = dt_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))

DT:
Best depth:  {'max_depth': 12}
Best F1 score:  0.33238057615685873
Test F1 Score:  0.315028901734104


In [4]:
# Random Forest (RF)
rf_model = RandomizedSearchCV(rf, rf_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, random_state=42)
rf_model.fit(X_train, y_train)

print("Random Forest:")
print("Best parameters: ", rf_model.best_params_)
print("Best F1 score: ", rf_model.best_score_)

rf_final = RandomForestClassifier(**rf_model.best_params_)
rf_final.fit(X_train, y_train)
y_pred_rf = rf_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred_rf, average='micro'))

Random Forest:
Best parameters:  {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 74}
Best F1 score:  0.579101075232278
Test F1 Score:  0.4884393063583815


In [11]:
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# SVM
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SGDClassifier())
])

svm_model = RandomizedSearchCV(pipeline, svm_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, random_state=42)
svm_model.fit(X_train, y_train)

print("SVM")
print("Best params: ", svm_model.best_params_)
print("Best F1 score: ", svm_model.best_score_)

svm_final = make_pipeline(StandardScaler(), SGDClassifier(alpha=svm_model.best_params_['svm__alpha'], loss=svm_model.best_params_['svm__loss'], penalty=svm_model.best_params_['svm__penalty']))
svm_final.fit(X_train, y_train)
y_pred = svm_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))




SVM
Best params:  {'svm__alpha': 0.003745401188473625, 'svm__loss': 'hinge', 'svm__penalty': 'elasticnet'}
Best F1 score:  0.6186484978747384
Test F1 Score:  0.569364161849711
