In [33]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder


In [2]:
# Path of the metadata CSV that is read into `df`.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
CSV_PATH = "/home/hammad/fyp/dataset_fashion/cleaned_styles_colored.csv"

In [3]:
# Load the metadata table from CSV_PATH.
df = pd.read_csv(CSV_PATH)

In [4]:
# Keep only the first row for each `id` so ids are unique before joining with the embeddings.
df = df.drop_duplicates(subset='id', keep='first')


In [5]:
# Store ids as strings; the embedding ids are compared/merged as strings as well.
df['id'] = df['id'].astype(str)

In [6]:
# Load the precomputed embeddings; the object is used later as a mapping
# (its keys are matched against `df['id']` and it is expanded with DataFrame.from_dict).
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
with open('/home/hammad/fyp/image_embeddings_resnet50_tf.pkl','rb') as f:
    embeddings = pickle.load(f)

# Size comparison of the two sources; the counts are not required to match here.
print("Embeddings loaded: ", len(embeddings))
print("Metadata loaded: ", df.shape)


Embeddings loaded:  22946
Metadata loaded:  (22951, 10)


In [7]:
# Keep only the metadata rows whose id is a key of `embeddings`; other rows are dropped.
df_filtered = df[df['id'].astype(str).isin(embeddings.keys())]

In [8]:
# Sanity check: compare the row count with the number of loaded embeddings.
df_filtered.shape

(22946, 10)

In [9]:
## converting embeddings dictionary into DF
embeddings_df = pd.DataFrame.from_dict(embeddings,orient='index')
embeddings_df.index.name = 'id'
embeddings_df.reset_index(inplace=True)

In [10]:
# Cast ids to str so the merge key has the same type as `df['id']`.
embeddings_df['id'] = embeddings_df['id'].astype(str)

In [11]:
embeddings_df.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,15970,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,0.38849,0.008825,...,0.229524,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325
1,39386,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,0.004201,0.344588,...,0.373247,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522
2,21379,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,0.664183,0.075709,...,0.610984,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849
3,53759,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,0.307086,0.0,...,0.505315,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945
4,1855,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,0.419605,0.10325,...,0.983012,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377


In [12]:
# Inner-join metadata and embeddings on 'id'.
# validate='one_to_one' makes pandas raise a MergeError if either side has a
# repeated id, instead of silently multiplying rows in the training data.
merged_df = pd.merge(df_filtered, embeddings_df, on='id', validate='one_to_one')

In [13]:
merged_df.head()

Unnamed: 0,id,baseColour,gender_encoded,category_bottom,category_footwear,category_top,style_casual,style_formal,style_sportswear,baseColour_encoded,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,15970,Navy Blue,1,0,0,1,1,0,0,25,...,0.229524,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325
1,39386,Blue,1,1,0,0,1,0,0,2,...,0.373247,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522
2,21379,Black,1,1,0,0,1,0,0,1,...,0.610984,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849
3,53759,Grey,1,0,0,1,1,0,0,13,...,0.505315,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945
4,1855,Grey,1,0,0,1,1,0,0,13,...,0.983012,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377


In [14]:
# Sanity check: expect the metadata columns plus one column per embedding component.
merged_df.shape

(22946, 2058)

In [15]:
def get_category_label(row):
    """Collapse the one-hot category flags of ``row`` into one class index.

    The flags are inspected in the order bottom, footwear, top and the first
    one equal to 1 decides the result: bottom -> 0, footwear -> 1, top -> 2.
    Returns ``None`` when none of the three flags equals 1.
    """
    flag_to_label = (
        ('category_bottom', 0),
        ('category_footwear', 1),
        ('category_top', 2),
    )
    for flag, label in flag_to_label:
        if row[flag] == 1:
            return label
    return None


In [16]:
def get_style_label(row):
    """Collapse the one-hot style flags of ``row`` into one class index.

    The flags are inspected in the order casual, formal, sportswear and the
    first one equal to 1 decides the result: casual -> 0, formal -> 1,
    sportswear -> 2.  Returns ``None`` when none of the three flags equals 1.
    """
    flag_to_label = (
        ('style_casual', 0),
        ('style_formal', 1),
        ('style_sportswear', 2),
    )
    for flag, label in flag_to_label:
        if row[flag] == 1:
            return label
    return None

In [17]:
# Derive one integer label per row from the one-hot flag columns.
# The labelling functions only read their three flag columns, so apply them to
# that narrow slice: a row-wise apply over the full frame would build a
# ~2060-element object Series for every row just to look at three values.
style_flag_columns = ['style_casual', 'style_formal', 'style_sportswear']
category_flag_columns = ['category_bottom', 'category_footwear', 'category_top']

merged_df.loc[:, 'style_label'] = merged_df[style_flag_columns].apply(get_style_label, axis=1)
merged_df.loc[:, 'category_label'] = merged_df[category_flag_columns].apply(get_category_label, axis=1)


In [18]:
merged_df.head()

Unnamed: 0,id,baseColour,gender_encoded,category_bottom,category_footwear,category_top,style_casual,style_formal,style_sportswear,baseColour_encoded,...,2040,2041,2042,2043,2044,2045,2046,2047,style_label,category_label
0,15970,Navy Blue,1,0,0,1,1,0,0,25,...,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325,0,2
1,39386,Blue,1,1,0,0,1,0,0,2,...,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522,0,0
2,21379,Black,1,1,0,0,1,0,0,1,...,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849,0,0
3,53759,Grey,1,0,0,1,1,0,0,13,...,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945,0,2
4,1855,Grey,1,0,0,1,1,0,0,13,...,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377,0,2


In [19]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22946 entries, 0 to 22945
Columns: 2060 entries, id to category_label
dtypes: float32(2048), int64(10), object(2)
memory usage: 181.4+ MB


In [20]:
# CATEGORY prediction dataset: remove everything that is not an input feature
# or the category target.
category_df = merged_df.drop(
    [
        # one-hot category flags
        'category_bottom', 'category_footwear', 'category_top',
        # style flags and the derived style target
        'style_casual', 'style_formal', 'style_sportswear', 'style_label',
        # colour columns
        'baseColour', 'baseColour_encoded',
    ],
    axis='columns',
)

In [21]:
# STYLE prediction dataset: remove everything that is not an input feature
# or the style target.
style_df = merged_df.drop(
    [
        # one-hot style flags
        'style_casual', 'style_formal', 'style_sportswear',
        # category flags and the derived category target
        'category_bottom', 'category_footwear', 'category_top', 'category_label',
        # colour columns
        'baseColour', 'baseColour_encoded',
    ],
    axis='columns',
)

In [22]:
# COLOR prediction dataset: remove the category/style columns and the gender
# feature; both colour columns are still present after this step.
color_df = merged_df.drop(
    [
        # category flags and the derived category target
        'category_bottom', 'category_footwear', 'category_top', 'category_label',
        # style flags and the derived style target
        'style_casual', 'style_formal', 'style_sportswear', 'style_label',
        # gender feature
        'gender_encoded',
    ],
    axis='columns',
)

In [23]:
category_df.head()

Unnamed: 0,id,gender_encoded,0,1,2,3,4,5,6,7,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,category_label
0,15970,1,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,0.38849,...,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325,2
1,39386,1,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,0.004201,...,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522,0
2,21379,1,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,0.664183,...,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849,0
3,53759,1,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,0.307086,...,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945,2
4,1855,1,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,0.419605,...,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377,2


In [24]:
style_df.head()

Unnamed: 0,id,gender_encoded,0,1,2,3,4,5,6,7,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,style_label
0,15970,1,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,0.38849,...,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325,0
1,39386,1,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,0.004201,...,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522,0
2,21379,1,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,0.664183,...,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849,0
3,53759,1,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,0.307086,...,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945,0
4,1855,1,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,0.419605,...,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377,0


In [25]:
color_df.head()

Unnamed: 0,id,baseColour,baseColour_encoded,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,15970,Navy Blue,25,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,...,0.229524,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325
1,39386,Blue,2,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,...,0.373247,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522
2,21379,Black,1,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,...,0.610984,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849
3,53759,Grey,13,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,...,0.505315,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945
4,1855,Grey,13,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,...,0.983012,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377


In [27]:
# Discard the colour codes that came with the CSV.
color_df = color_df.drop(columns=['baseColour_encoded'])

In [28]:
# Keep the raw colour name under 'original_baseColour'.
color_df = color_df.rename(columns={'baseColour': 'original_baseColour'})


In [29]:
# Raw colour names listed under the coarse colour group they collapse into.
_shades_by_group = {
    'Black': ['Black'],
    'Blue': ['Blue', 'Navy Blue', 'Teal', 'Turquoise Blue'],
    'Brown': ['Beige', 'Bronze', 'Brown', 'Coffee Brown', 'Copper',
              'Mushroom Brown', 'Nude', 'Tan', 'Taupe'],
    'Green': ['Fluorescent Green', 'Green', 'Khaki', 'Lime Green', 'Olive', 'Sea Green'],
    'Grey': ['Charcoal', 'Grey', 'Grey Melange', 'Metallic', 'Silver'],
    'Multi': ['Multi'],
    'Orange': ['Orange'],
    'Pink': ['Magenta', 'Peach', 'Pink'],
    'Purple': ['Lavender', 'Mauve', 'Purple'],
    'Red': ['Burgundy', 'Maroon', 'Red', 'Rust'],
    'White': ['Cream', 'Off White', 'White'],
    'Yellow': ['Gold', 'Mustard', 'Yellow'],
}

# Flatten to {raw colour: group}; sorting keeps the keys in alphabetical order.
reverse_color_mapping = dict(sorted(
    (shade, group)
    for group, shades in _shades_by_group.items()
    for shade in shades
))

In [30]:
# Collapse each raw colour name into its coarse group; names missing from
# reverse_color_mapping become NaN.
color_df['baseColour_grouped'] = color_df['original_baseColour'].map(reverse_color_mapping)


In [31]:
# Fail fast when a raw colour has no entry in reverse_color_mapping: such rows
# carry NaN in 'baseColour_grouped' and would otherwise continue into label
# encoding and the exported dataset with a missing/bogus class.
unmapped_colors = color_df.loc[
    color_df['baseColour_grouped'].isna(), 'original_baseColour'
].unique()
if len(unmapped_colors) > 0:
    raise ValueError(f"Unmapped colors found: {unmapped_colors}")


In [34]:
# Encode the grouped colour names as integer class ids; `le` is kept so the
# name <-> id mapping can be read back from `le.classes_` afterwards.
le = LabelEncoder()
color_df['baseColour_encoded'] = le.fit_transform(color_df['baseColour_grouped'])


In [35]:
color_df.head()

Unnamed: 0,id,original_baseColour,0,1,2,3,4,5,6,7,...,2040,2041,2042,2043,2044,2045,2046,2047,baseColour_grouped,baseColour_encoded
0,15970,Navy Blue,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,0.38849,...,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325,Blue,1
1,39386,Blue,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,0.004201,...,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522,Blue,1
2,21379,Black,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,0.664183,...,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849,Black,0
3,53759,Grey,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,0.307086,...,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945,Grey,4
4,1855,Grey,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,0.419605,...,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377,Grey,4


In [37]:
# Create mappings
color_to_index = dict(zip(le.classes_, le.transform(le.classes_)))
index_to_color = dict(zip(le.transform(le.classes_), le.classes_))

# Display mappings
print("Color to Index Mapping:")
for k, v in color_to_index.items():
    print(f"{v}: {k}")


Color to Index Mapping:
0: Black
1: Blue
2: Brown
3: Green
4: Grey
5: Multi
6: Orange
7: Pink
8: Purple
9: Red
10: White
11: Yellow


In [38]:
# Inspect how many rows fall into each encoded colour class (class balance).
color_df['baseColour_encoded'].value_counts()

baseColour_encoded
0     4742
1     4501
10    3759
4     2173
2     1884
9     1572
3     1457
7      930
8      884
11     705
6      238
5      101
Name: count, dtype: int64

In [39]:
# Drop the textual colour columns; only the integer 'baseColour_encoded' target remains.
color_df = color_df.drop(columns=['baseColour_grouped', 'original_baseColour'])

In [40]:
color_df.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,baseColour_encoded
0,15970,0.325631,0.961932,0.0,0.549981,0.0,0.056022,0.508034,0.38849,0.008825,...,0.223776,0.124558,2.046997,0.0,0.0,0.43307,0.234684,0.06411,1.254325,1
1,39386,0.670931,1.461553,0.154584,0.383389,0.46085,0.0089,0.81233,0.004201,0.344588,...,0.065233,1.082466,0.231429,0.107079,0.002723,0.579104,0.0,0.0,1.954522,1
2,21379,0.195744,0.803393,0.004324,2.545139,0.075638,0.10998,2.759297,0.664183,0.075709,...,0.0,5.3e-05,0.1674,0.071858,0.0,0.659216,0.0,0.110951,3.961849,0
3,53759,0.096383,1.703673,0.056809,1.617223,0.011546,0.0,1.127666,0.307086,0.0,...,0.205928,0.140345,1.801834,0.0,0.0,0.082535,0.078035,0.242948,1.523945,4
4,1855,0.035023,1.326588,0.0,0.991414,0.0,0.0,0.770858,0.419605,0.10325,...,0.004316,0.195267,1.943198,0.046978,0.0,0.003337,0.024109,0.403093,0.126377,4


In [41]:
# Write each task-specific dataset to its own CSV (no index column) in the
# current working directory.
for frame, filename in (
    (category_df, 'category_prediction_dataset.csv'),
    (style_df, 'style_prediction_dataset.csv'),
    (color_df, 'color_prediction_dataset.csv'),
):
    frame.to_csv(filename, index=False)