In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D
import pickle

2024-05-21 22:40:35.683375: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 22:40:35.683595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 22:40:35.841461: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Read the two CSV tables of the dataset. Rows of styles.csv that cannot be
# parsed are skipped (on_bad_lines='skip') instead of aborting the read.
images_csv = "../input/fashion-product-images-dataset/fashion-dataset/images.csv"
styles_csv = "../input/fashion-product-images-dataset/fashion-dataset/styles.csv"

images_df = pd.read_csv(images_csv)
styles_df = pd.read_csv(styles_csv, on_bad_lines='skip')


In [4]:
# Give images_df an integer `id` column by stripping the ".jpg" suffix from
# each file name; this is the key used to join it with styles_df.
images_df['id'] = images_df['filename'].str.replace(".jpg", "", regex=False).astype(int)


In [5]:
# Left-join the image table onto the style table by `id`, then turn each bare
# file name into a path inside the dataset's images directory.
images_root = "../input/fashion-product-images-dataset/fashion-dataset/images/"

merged = styles_df.merge(images_df, on='id', how='left')
data = merged.reset_index(drop=True)
data['filename'] = data['filename'].map(lambda name: os.path.join(images_root, name))


In [6]:
# Take one listing of the images directory; the names are compared against the
# product ids when rows without an image file are filtered out.
images_dir = "../input/fashion-product-images-dataset/fashion-dataset/images"
image_files = os.listdir(images_dir)


In [7]:
# Removing Products for which images are not present.
# `image_files` is a list, so `name in image_files` is a linear scan for every
# product row. Converting it to a set once makes each lookup constant-time
# while producing exactly the same boolean column.
available_images = set(image_files)
data['file_found'] = data['id'].apply(lambda x: f"{x}.jpg" in available_images)

# Keep only the rows whose image exists and renumber the index from 0.
data = data[data['file_found']].reset_index(drop=True)


In [8]:

# Discard the display name, the link and the helper flag column; the result is
# rebound to `data` rather than mutating the frame in place.
data = data.drop(columns=['productDisplayName', 'link', 'file_found'])

In [9]:
# Train-Val Split (80% / 20%).
# The shuffle is seeded so that the split, and therefore the extracted
# features and the fitted models derived from it, is the same on every
# Restart & Run All. Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
data = data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

n = len(data)
split_at = int(n * 0.8)  # first row index that belongs to the validation set

train = data.iloc[:split_at, :]
# The validation index is renumbered from 0; train already starts at 0.
val = data.iloc[split_at:, :].reset_index(drop=True)


In [10]:
# Data Generator
# Pixel values are rescaled by 1/255. Both generators read paths from the
# `filename` column, yield images only (class_mode=None) in batches of 32, and
# do not shuffle, so batches follow the row order of the source dataframe.
datagen = ImageDataGenerator(rescale=1 / 255.)

flow_options = dict(
    target_size=(256, 256),
    x_col='filename',
    class_mode=None,
    batch_size=32,
    shuffle=False,
    classes=['images'],
)

train_generator = datagen.flow_from_dataframe(dataframe=train, **flow_options)
val_generator = datagen.flow_from_dataframe(dataframe=val, **flow_options)




Found 35535 validated image filenames.
Found 8884 validated image filenames.


In [11]:
# Feature Extraction: Pre-trained VGG16
# The convolutional part of VGG16 (no classifier head) is copied layer by
# layer into a Sequential model and followed by global average pooling, so
# each 256x256x3 image is mapped to a single feature vector.
base_model = VGG16(include_top=False, input_shape=(256, 256, 3))

model = Sequential()
feature_layers = list(base_model.layers) + [GlobalAveragePooling2D()]
for feature_layer in feature_layers:
    model.add(feature_layer)
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [None]:
# Extracting Features of Training and Validation Set
# The model is run over the training generator first and the validation
# generator second; each call shows its own progress bar (verbose=1).
extracted = [model.predict(gen, verbose=1) for gen in (train_generator, val_generator)]
train_features, val_features = extracted


  self._warn_if_super_not_called()


[1m 735/1111[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m1:06:30[0m 11s/step

In [None]:
# Reduce the extracted features with PCA.
# `train_pca`, `val_pca` and `pca` are used below and in the save cell but are
# not defined anywhere else in the notebook, so they are computed here. PCA is
# fitted on the training features only and then applied to the validation
# features.
# NOTE(review): 313 is taken from the hard-coded `iloc[:, -313:]` feature
# slice this cell used; confirm it is the intended number of components.
N_PCA_COMPONENTS = 313
pca = PCA(n_components=N_PCA_COMPONENTS)
train_pca = pca.fit_transform(train_features)
val_pca = pca.transform(val_features)

# Combine PCA features with original data.
# The merge is on the positional index: row i of the PCA output corresponds to
# row i of train / val because both frames have a 0..n-1 index.
train_pca_df = pd.DataFrame(train_pca)
train_combined = train.iloc[:, 0:10].merge(train_pca_df, left_index=True, right_index=True)

val_pca_df = pd.DataFrame(val_pca)
val_combined = val.iloc[:, 0:10].merge(val_pca_df, left_index=True, right_index=True)

# The PCA columns are the trailing columns of the combined frames; slice by
# the actual number of components instead of repeating a magic number.
n_feature_cols = train_pca.shape[1]

X_train = train_combined.iloc[:, -n_feature_cols:]
y_train = train_combined['id']

X_val = val_combined.iloc[:, -n_feature_cols:]
y_val = val_combined['id']

In [None]:
# K-Nearest Neighbours
# Fit a 6-neighbour model on the PCA features. The target is the product `id`
# column, so every training row carries its own id as its label.
neigh = KNeighborsClassifier(n_neighbors=6)
neigh.fit(X_train, y_train)  # closing parenthesis was missing (SyntaxError)

In [None]:
# Save the PCA components and KNN model
def _pickle_to(path, obj):
    """Serialize `obj` with pickle into the file at `path`, overwriting it."""
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


# Both files are written relative to the current working directory.
_pickle_to('final_pca_components.pkl', pca)
_pickle_to('final_knn_model.pkl', neigh)


In [None]:
import os
import pickle
from urllib.request import urlretrieve

# Define the file names
pca_filename = 'final_pca_components.pkl'
knn_filename = 'final_knn_model.pkl'

# Define the file paths.
# The save cell opens these names relative to the current working directory,
# so that is where they are looked up. Looking under a `working/`
# sub-directory reported "does not exist" immediately after a successful save,
# and copying a file onto itself through a file:// URL served no purpose.
pca_filepath = os.path.join(os.getcwd(), pca_filename)
knn_filepath = os.path.join(os.getcwd(), knn_filename)

# Load and verify the files.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that were produced by this notebook.
loaded_artifacts = {}
for artifact_name, artifact_path in ((pca_filename, pca_filepath),
                                     (knn_filename, knn_filepath)):
    try:
        with open(artifact_path, 'rb') as artifact_file:
            loaded_artifacts[artifact_name] = pickle.load(artifact_file)
    except FileNotFoundError:
        # Best effort: report the problem and continue with the next file.
        print(f'{artifact_name} does not exist.')
        print(f'Failed to load {artifact_name}.')
    else:
        print(f'{artifact_name} loaded successfully.')

# Rebind only what was actually read, so a missing file never replaces an
# object that is already in memory.
if pca_filename in loaded_artifacts:
    pca = loaded_artifacts[pca_filename]
if knn_filename in loaded_artifacts:
    knn = loaded_artifacts[knn_filename]
