# Importing required libraries and information about the dataset

In [None]:
import pandas as pd
import os
import shutil
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import BatchNormalization,Dense, Flatten, Dropout
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall
import os

### Reading train data

In [None]:
# Load the per-category attribute metadata and print every entry of its
# 'Attribute_list' column.
df = pd.read_parquet('category_attributes.parquet')
attribute_list = df['Attribute_list']
# Iterate over the values themselves: attribute_list[i] is a label lookup and
# only works when the index happens to be 0..n-1.
for attributes in attribute_list:
    print(attributes)

In [None]:
# Load the training table.
train_data=pd.read_csv("train.csv")

# Convert 'id' column to string and pad with leading zeros to make each entry 6 digits
# (image files are looked up later as "<id>.jpg").
train_data['id'] = train_data['id'].astype(str).str.zfill(6)
print(train_data.head(20))
#print(len(train_data))  70213

### Creating an individual DataFrame for each category

In [None]:
# Split the training table into one DataFrame per distinct 'Category' value.
category_dfs = {category: train_data[train_data['Category'] == category] for category in train_data['Category'].unique()}
#print(category_dfs)

#making dataframes according to categories
men_tshirt_df = category_dfs['Men Tshirts']
saree_df=category_dfs['Sarees']
kurtis_df=category_dfs['Kurtis']
women_tshirts_df=category_dfs['Women Tshirts']
# NOTE: despite its name, women_dresses_df holds the 'Women Tops & Tunics' rows.
women_dresses_df=category_dfs['Women Tops & Tunics']


#Filtering out empty attributes
def filtering_column(df):
    """Return a copy of ``df`` without all-null columns and without ``len``.

    Args:
        df: DataFrame for a single category; must contain a ``len`` column.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: if ``df`` has no ``len`` column.
    """
    # Boolean Series indexed by column name: True where every value is null.
    all_null = df.isnull().all()
    trimmed = df.drop(columns=all_null[all_null].index)
    # "len" is the number of attributes the row has; it is not an attribute itself.
    return trimmed.drop(columns=["len"])

# Collect the per-category frames under their variable names so the same
# filtering can be applied to each of them in one loop.
df_dict = {
    'men_tshirt_df': men_tshirt_df,
    'saree_df': saree_df,
    'kurtis_df': kurtis_df,
    'women_tshirts_df': women_tshirts_df,
    'women_dresses_df': women_dresses_df
}

# NOTE: this loop rebinds the module-level name `df` on every iteration.
for df_name, df in df_dict.items():
    df_dict[df_name] = filtering_column(df)  # Update the dictionary with the filtered DataFrame
    #print(df_dict[df_name])  # Print the updated DataFrame

# Access the updated DataFrames using their original names
# (trailing comments list the attributes each category uses)
men_tshirt_df = df_dict['men_tshirt_df']  #['color' 'neck' 'pattern' 'print_or_pattern_type' 'sleeve_length']
saree_df = df_dict['saree_df']      #['blouse_pattern' 'border' 'border_width' 'color' 'occasion' 'ornamentation' 'pallu_details' 'pattern' 'print_or_pattern_type' 'transparency']
kurtis_df=df_dict['kurtis_df']      #['color' 'fit_shape' 'length' 'occasion' 'ornamentation' 'pattern' 'print_or_pattern_type' 'sleeve_length' 'sleeve_styling']
women_tshirts_df=df_dict['women_tshirts_df']    #['color' 'fit_shape' 'length' 'pattern' 'print_or_pattern_type' 'sleeve_length' 'sleeve_styling' 'surface_styling']
women_dresses_df=df_dict['women_dresses_df']    #['color' 'fit_shape' 'length' 'neck_collar' 'ocassion' 'pattern' 'print_or_pattern_type' 'sleeve_length' 'sleeve_styling' 'surface_styling']

### Removing rows with more than 3 NaN values

In [None]:
# Category being processed in this run.
df=women_dresses_df

# Step 1: Define the attribute columns to check
# (assumes the first two columns are the id and the category - TODO confirm)
attribute_columns = df.columns[2:]

# Step 2: Identify rows where more than 3 attributes are NaN
rows_to_remove = df[attribute_columns].isna().sum(axis=1) > 3
rows_to_remove_indices = df[rows_to_remove].index
print("Indices of rows with more than 3 'NaN' values:", rows_to_remove_indices.tolist())

# Step 3: Remove those rows from the DataFrame
# (rows with 1 to 3 NaN attributes are kept)
df_cleaned = df[~rows_to_remove]

print("\nDataFrame after removing rows with more than 3 'NaN' values:")
print(df_cleaned)

## Augmentation

In [None]:
# Work on the NaN-filtered frame of the selected category.
df = df_cleaned
# Set up paths and output directory
input_folder = "train_images"
augmented_folder = "augmented_folder"
os.makedirs(augmented_folder, exist_ok=True)

# Initialize the data generator with augmentation options
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest"
)

# Attribute columns of the frame selected above

attribute_columns = df.columns[2:]  # Adjust if 'id' and 'category' are the first two columns

# Create a list to store augmented data entries
augmented_data = []

# NOTE(review): augmentation runs before train_test_split, so augmented
# variants of one source image can end up in both the train and test split.

# Loop through each image in the DataFrame
for idx, row in df.iterrows():
    img_id = row['id']
    img_path = os.path.join(input_folder, f"{img_id}.jpg")

    # Load image and resize for consistency
    img = cv2.imread(img_path)
    if img is None:
        # Unreadable images get neither a copy nor a label row.
        print(f"Image {img_id} not found, skipping.")
        continue
    img = cv2.resize(img, (224, 224))  # Resize if needed for model input

    # Copy the original image to the augmented folder
    # (the file on disk is copied as-is, not the resized array)
    original_img_path = os.path.join(augmented_folder, f"{img_id}.jpg")
    shutil.copy(img_path, original_img_path)

    # Append original image info to augmented data list
    original_row = row.copy()
    augmented_data.append(original_row)

    # Reshape to add batch dimension (needed for ImageDataGenerator)
    img = np.expand_dims(img, axis=0)

    # Generate 2 augmented images per original image
    # (3 images in total once the original is counted)
    aug_iter = datagen.flow(img, batch_size=1)
    for i in range(2):
        # Generate augmented image and remove batch dimension;
        # cast to uint8 so cv2.imwrite stores an ordinary 8-bit image
        aug_img = next(aug_iter)[0].astype(np.uint8)

        # Create a unique filename for the augmented image
        aug_img_id = f"{img_id}_aug_{i+1}"
        aug_img_path = os.path.join(augmented_folder, f"{aug_img_id}.jpg")

        # Save the augmented image
        cv2.imwrite(aug_img_path, aug_img)

        # Append the new image info to the augmented data list
        # (same labels as the source image, new id)
        augmented_row = row.copy()
        augmented_row['id'] = aug_img_id
        augmented_data.append(augmented_row)

# Convert the augmented data list (originals + augmented copies) to a DataFrame
df_augmented = pd.DataFrame(augmented_data, columns=df.columns)
#df_augmented.to_csv("augmented_df.csv")

### Resizing the images

In [None]:
# Load every image listed in df_augmented as a normalized 224x224 RGB array.
images = []
loaded_mask = []  # one flag per df_augmented row: True when its image was read

for img_id in df_augmented['id']:
    img_path = f"augmented_folder/{img_id}.jpg"  # Ensure path and extension match your files
    img = cv2.imread(img_path)
    if img is None:
        print(f"Image with ID {img_id} not found at {img_path}")
        loaded_mask.append(False)
        continue  # Skip to the next image if this one is missing
    img = cv2.resize(img, (224, 224))  # Resize to 224x224 for most image models
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # cv2.imread returns BGR
    # float32 matches the inference preprocessing and halves memory vs float64.
    images.append(img.astype(np.float32) / 255.0)
    loaded_mask.append(True)

# Drop the label rows whose image could not be read; otherwise X_images and
# the labels taken from df_augmented would have different lengths and the
# remaining rows would be paired with the wrong images.
df_augmented = df_augmented[np.asarray(loaded_mask, dtype=bool)]
print(f"Loaded {len(images)} images")

X_images = np.array(images)

### Splitting the data

In [None]:
attribute_columns=df_cleaned.columns[2:]
# Labels are the raw (not yet one-hot encoded) attribute columns of the
# augmented frame; 'id' and the category column are left out.
# Row order must match X_images, which was built by iterating df_augmented['id'].
y_label = df_augmented[attribute_columns]  # One column per attribute, raw category values

# Perform the train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_images, y_label, test_size=0.2, random_state=42)

print(f"X Training set size: {X_train.shape[0]}")
print(f"X Testing set size: {X_test.shape[0]}")
print(f"Y Training set size: {y_train.shape[0]}")
print(f"Y Testing set size: {y_test.shape[0]}")

## Onehot encoding

In [None]:
# Number of distinct values per attribute, computed on the full label set
# (train + test). These counts size the softmax heads of the model.
# NOTE: Series.unique() includes NaN, so a missing value counts as one class.
df=y_label
num_classes_attr1 = len(df['attr_1'].unique())
num_classes_attr2 = len(df['attr_2'].unique())
num_classes_attr3 = len(df['attr_3'].unique())
num_classes_attr4 = len(df['attr_4'].unique())
num_classes_attr5 = len(df['attr_5'].unique())
num_classes_attr6 = len(df['attr_6'].unique())
num_classes_attr7 = len(df['attr_7'].unique())
num_classes_attr8 = len(df['attr_8'].unique())
num_classes_attr9 = len(df['attr_9'].unique())
num_classes_attr10 = len(df['attr_10'].unique())

In [None]:
# Show the class count of the first nine attributes, one per line.
for class_count in (
    num_classes_attr1,
    num_classes_attr2,
    num_classes_attr3,
    num_classes_attr4,
    num_classes_attr5,
    num_classes_attr6,
    num_classes_attr7,
    num_classes_attr8,
    num_classes_attr9,
):
    print(class_count)
#print(num_classes_attr10)


In [None]:
def _fit_and_encode(column):
    """One-hot encode one attribute column for the train and test split.

    The encoder is fitted on the full label set (``y_label``), not on the
    training split alone. The output layers are sized from the full label set
    as well, so the one-hot width always matches the layer width, and a class
    that only occurs in the test split no longer makes ``transform`` raise.

    Args:
        column: name of the attribute column, e.g. ``'attr_1'``.

    Returns:
        Tuple ``(encoder, train_encoded, test_encoded)``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoder.fit(y_label[column].values.reshape(-1, 1))
    train_encoded = encoder.transform(y_train[column].values.reshape(-1, 1))
    test_encoded = encoder.transform(y_test[column].values.reshape(-1, 1))
    return encoder, train_encoded, test_encoded


# One encoder per attribute, plus its encoded train and test targets.
encoder1, y_train_attr1, y_test_attr1 = _fit_and_encode('attr_1')
encoder2, y_train_attr2, y_test_attr2 = _fit_and_encode('attr_2')
encoder3, y_train_attr3, y_test_attr3 = _fit_and_encode('attr_3')
encoder4, y_train_attr4, y_test_attr4 = _fit_and_encode('attr_4')
encoder5, y_train_attr5, y_test_attr5 = _fit_and_encode('attr_5')
encoder6, y_train_attr6, y_test_attr6 = _fit_and_encode('attr_6')
encoder7, y_train_attr7, y_test_attr7 = _fit_and_encode('attr_7')
encoder8, y_train_attr8, y_test_attr8 = _fit_and_encode('attr_8')
encoder9, y_train_attr9, y_test_attr9 = _fit_and_encode('attr_9')
encoder10, y_train_attr10, y_test_attr10 = _fit_and_encode('attr_10')


In [None]:
# For the first eight attributes print the number of training samples and
# the one-hot width (number of classes).
for encoded_train in (
    y_train_attr1,
    y_train_attr2,
    y_train_attr3,
    y_train_attr4,
    y_train_attr5,
    y_train_attr6,
    y_train_attr7,
    y_train_attr8,
):
    print(len(encoded_train))
    print(len(encoded_train[0]))

"""print(len(y_train_attr9))
print(len(y_train_attr9[0]))"""

"""print(len(y_train_attr10))
print(len(y_train_attr10[0]))"""

In [None]:
# For the first eight attributes print the number of test samples and the
# one-hot width (number of classes).
for encoded_test in (
    y_test_attr1,
    y_test_attr2,
    y_test_attr3,
    y_test_attr4,
    y_test_attr5,
    y_test_attr6,
    y_test_attr7,
    y_test_attr8,
):
    print(len(encoded_test))
    print(len(encoded_test[0]))

"""print(len(y_test_attr9))
print(len(y_test_attr9[0]))"""

"""print(len(y_test_attr10))
print(len(y_test_attr10[0]))"""

## Training model

In [None]:
# Load base model: ResNet50 without its classification top, initialised from
# a local weights file and fed 224x224 RGB images.
weights_path = os.path.abspath('resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
base_model = ResNet50(weights=weights_path, include_top=False, input_shape=(224, 224, 3))

# Flatten the output of base model
x = Flatten()(base_model.output)

# Add a fully connected layer shared by every attribute head
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)


# Add separate output layers for each attribute; each head is a softmax over
# that attribute's class count and is named 'output_<n>'.
output_attr1 = Dense(num_classes_attr1, activation='softmax', name='output_1')(x)
output_attr2 = Dense(num_classes_attr2, activation='softmax', name='output_2')(x)
output_attr3 = Dense(num_classes_attr3, activation='softmax', name='output_3')(x)
output_attr4 = Dense(num_classes_attr4, activation='softmax', name='output_4')(x)
output_attr5 = Dense(num_classes_attr5, activation='softmax', name='output_5')(x)
output_attr6 = Dense(num_classes_attr6, activation='softmax', name='output_6')(x)
output_attr7 = Dense(num_classes_attr7, activation='softmax', name='output_7')(x)
output_attr8 = Dense(num_classes_attr8, activation='softmax', name='output_8')(x)
output_attr9 = Dense(num_classes_attr9, activation='softmax', name='output_9')(x)
output_attr10 = Dense(num_classes_attr10, activation='softmax', name='output_10')(x)

In [None]:
# Inspect the first nine output tensors.
for head in (
    output_attr1,
    output_attr2,
    output_attr3,
    output_attr4,
    output_attr5,
    output_attr6,
    output_attr7,
    output_attr8,
    output_attr9,
):
    print(head)
#print(output_attr10)


In [None]:
# Key every head by its layer name ('output_1' ... 'output_10') so that the
# losses, metrics and targets below can all use the same keys.
outputs = {
    f'output_{number}': head
    for number, head in enumerate(
        (
            output_attr1, output_attr2, output_attr3, output_attr4, output_attr5,
            output_attr6, output_attr7, output_attr8, output_attr9, output_attr10,
        ),
        start=1,
    )
}

model = Model(inputs=base_model.input, outputs=outputs)

# Freeze the backbone except for its last 30 layers, which stay trainable.
for layer in base_model.layers[:-30]:
    layer.trainable = False

# Targets per head, in the same key order as `outputs`.
train_targets = dict(zip(outputs, (
    y_train_attr1, y_train_attr2, y_train_attr3, y_train_attr4, y_train_attr5,
    y_train_attr6, y_train_attr7, y_train_attr8, y_train_attr9, y_train_attr10,
)))
val_targets = dict(zip(outputs, (
    y_test_attr1, y_test_attr2, y_test_attr3, y_test_attr4, y_test_attr5,
    y_test_attr6, y_test_attr7, y_test_attr8, y_test_attr9, y_test_attr10,
)))


def _compile(learning_rate):
    """(Re)compile the model with fresh metric objects for every head."""
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss={name: 'categorical_crossentropy' for name in outputs},
        metrics={
            name: [CategoricalAccuracy(), Precision(), Recall()]
            for name in outputs
        },
    )


def _fit(epochs):
    """Train for ``epochs`` epochs, validating on the test split."""
    return model.fit(
        X_train,
        train_targets,
        validation_data=(X_test, val_targets),
        epochs=epochs,
        batch_size=35,
    )


# Phase 1: 12 epochs at learning rate 1e-4.
_compile(0.0001)
history1 = _fit(12)

# Phase 2: 8 more epochs at learning rate 1e-5. The set of trainable layers
# is not changed between the phases; only the optimizer is reset.
_compile(0.00001)
history2 = _fit(8)


## Saving model

In [None]:
def model_save(model,name):
    """
    Write the trained model to ``model_<name>.keras`` in the working directory.

    Args:
        model (Model): Keras model to be saved.
        name (str): Suffix used in the output file name.

    Returns:
        None
    """
    target_file = f'model_{name}.keras'
    model.save(target_file)

In [None]:
# Persist the model trained above as 'model_women_dresses.keras'.
model_save(model,"women_dresses")

# Testing on test images

## i) for single image (other than men tshirt)

In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import OneHotEncoder
import re

# Assumes women_tshirts_df is already loaded (see `df = ...` below).
# Initialize separate encoders for each attribute; they are only used to map
# predicted class indices back to label names.
# NOTE(review): these encoders are refitted here on women_tshirts_df. The
# index -> label mapping is only valid if it yields the same categories, in
# the same order, as the encoders used when the model was trained - confirm.
encoder_attr1 = OneHotEncoder(sparse_output=False)
encoder_attr2 = OneHotEncoder(sparse_output=False)
encoder_attr3 = OneHotEncoder(sparse_output=False)
encoder_attr4 = OneHotEncoder(sparse_output=False)
encoder_attr5 = OneHotEncoder(sparse_output=False)
encoder_attr6 = OneHotEncoder(sparse_output=False)
encoder_attr7 = OneHotEncoder(sparse_output=False)
encoder_attr8 = OneHotEncoder(sparse_output=False)
"""encoder_attr9 = OneHotEncoder(sparse_output=False)
encoder_attr10 = OneHotEncoder(sparse_output=False)"""
df=women_tshirts_df
# Fit each encoder to its respective attribute column in the training data
encoder_attr1.fit(df['attr_1'].values.reshape(-1, 1))
encoder_attr2.fit(df['attr_2'].values.reshape(-1, 1))
encoder_attr3.fit(df['attr_3'].values.reshape(-1, 1))
encoder_attr4.fit(df['attr_4'].values.reshape(-1, 1))
encoder_attr5.fit(df['attr_5'].values.reshape(-1, 1))
encoder_attr6.fit(df['attr_6'].values.reshape(-1, 1))
encoder_attr7.fit(df['attr_7'].values.reshape(-1, 1))
encoder_attr8.fit(df['attr_8'].values.reshape(-1, 1))
"""encoder_attr9.fit(df['attr_9'].values.reshape(-1, 1))
encoder_attr10.fit(df['attr_10'].values.reshape(-1, 1))"""

In [None]:
# Function to preprocess the image for prediction
def preprocess_image(image_path):
    """Load one image and turn it into a single-item batch for the model.

    Args:
        image_path (str): Path of the image file.

    Returns:
        np.ndarray: float32 RGB array of shape (1, 224, 224, 3), scaled to [0, 1].

    Raises:
        ValueError: if the image cannot be read.
    """
    img = cv2.imread(image_path)  # Load image (BGR)
    if img is None:
        # cv2.imread signals failure by returning None; fail with the same
        # error the batch preprocessing raises instead of a cv2.resize error.
        raise ValueError(f"Could not load image: {image_path}")
    img = cv2.resize(img, (224, 224))  # Resize to match model input
    img = img.astype(np.float32) / 255.0  # Normalize to float32
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

# Load the model
model = load_model('model_women_tshirts.keras')  # Replace with your actual model path

# Load an image and preprocess it
image_path = "test_images/015324.jpg"  # Change to your image path
preprocessed_image = preprocess_image(image_path)

# Run the model once; the result holds one prediction per attribute output
# (the decoding cell that follows reads it as a dict via .items()).
predictions = model.predict(preprocessed_image)

In [None]:
# Attribute names using feature names from each encoder
attribute_names = {
    'attr1': encoder_attr1.get_feature_names_out(['attr1']),
    'attr2': encoder_attr2.get_feature_names_out(['attr2']),
    'attr3': encoder_attr3.get_feature_names_out(['attr3']),
    'attr4': encoder_attr4.get_feature_names_out(['attr4']),
    'attr5': encoder_attr5.get_feature_names_out(['attr5']),
    'attr6': encoder_attr6.get_feature_names_out(['attr6']),
    'attr7': encoder_attr7.get_feature_names_out(['attr7']),
    'attr8': encoder_attr8.get_feature_names_out(['attr8'])
}


def _keys_in_attribute_order(names):
    """Sort output names by numeric suffix so 'output_10' follows 'output_9'.

    Names without a numeric suffix keep their original position, so the
    result never depends on the iteration order of the prediction dict.
    """
    names = list(names)

    def attribute_number(name):
        suffix = name.rsplit('_', 1)[-1]
        return int(suffix) if suffix.isdigit() else names.index(name) + 1

    return sorted(names, key=attribute_number)


# Decode predictions for each attribute
predicted_labels = []
for i, output_name in enumerate(_keys_in_attribute_order(predictions)):
    attribute_prediction = predictions[output_name]
    feature_names = attribute_names[f'attr{i+1}']  # Feature names of this attribute
    predicted_class_index = np.argmax(attribute_prediction)  # Index of highest probability
    predicted_labels.append(feature_names[predicted_class_index])  # Map index to class name

# Output the predicted attributes
print("Predicted attributes:", predicted_labels)


## ii) for batch of images (except men tshirt)

In [None]:
def preprocess_images_batch(image_paths):
    """
    Load and normalise a list of images into one array for batch prediction.

    Each image is resized to 224x224, scaled to float32 in [0, 1] and
    converted from BGR to RGB.

    Args:
        image_paths (list): List of paths to images

    Returns:
        np.array: Batch of preprocessed images

    Raises:
        ValueError: if any image cannot be read.
    """
    batch = []
    for path in image_paths:
        bgr = cv2.imread(path)
        if bgr is None:
            raise ValueError(f"Could not load image: {path}")
        resized = cv2.resize(bgr, (224, 224))
        scaled = resized.astype(np.float32) / 255.0
        batch.append(cv2.cvtColor(scaled, cv2.COLOR_BGR2RGB))

    return np.array(batch)

def predict_batch(model, image_paths) :
    """
    Run the model on a batch of images and return its raw output.

    Args:
        model: Loaded Keras model
        image_paths (list): List of image paths

    Returns:
        Whatever ``model.predict`` returns for the preprocessed batch.
    """
    # Preprocess every image, then predict for the whole batch in one call.
    return model.predict(preprocess_images_batch(image_paths))



def remove_attr_prefix(predicted_labels):
    """
    Normalise encoder feature names back to plain attribute values.

    For every label, 'default_n' (n = one or two digits) becomes 'default',
    and then a leading 'attrn_' prefix (n = any number) is stripped.

    Args:
        predicted_labels (list): List of predicted labels.

    Returns:
        list: New list with the cleaned labels, in the same order.
    """
    return [
        # Inner sub collapses 'default_n'; outer sub removes the 'attrn_' prefix.
        re.sub(r'^attr\d+_', '', re.sub(r'default_\d{1,2}', 'default', raw_label))
        for raw_label in predicted_labels
    ]




In [None]:
# Read and prepare test data
test_data = pd.read_csv("Women Tshirts_test_data.csv")
test_data['id'] = test_data['id'].astype(str).str.zfill(6)

image_paths = [f"test_images/{img_id}.jpg" for img_id in test_data['id']]
model = load_model('model_women_tshirts.keras')
attribute_names = {
    'attr1': encoder_attr1.get_feature_names_out(['attr1']),
    'attr2': encoder_attr2.get_feature_names_out(['attr2']),
    'attr3': encoder_attr3.get_feature_names_out(['attr3']),
    'attr4': encoder_attr4.get_feature_names_out(['attr4']),
    'attr5': encoder_attr5.get_feature_names_out(['attr5']),
    'attr6': encoder_attr6.get_feature_names_out(['attr6']),
    'attr7': encoder_attr7.get_feature_names_out(['attr7']),
    'attr8': encoder_attr8.get_feature_names_out(['attr8'])
}


def _keys_in_attribute_order(names):
    """Sort output names by numeric suffix so 'output_10' follows 'output_9'.

    Names without a numeric suffix keep their original position.
    """
    names = list(names)

    def attribute_number(name):
        suffix = name.rsplit('_', 1)[-1]
        return int(suffix) if suffix.isdigit() else names.index(name) + 1

    return sorted(names, key=attribute_number)


# Get predictions for all images at once. predict_batch preprocesses the
# images itself, so they are not preprocessed separately here.
batch_predictions = predict_batch(model, image_paths)
key_list = _keys_in_attribute_order(batch_predictions.keys())

# Predicted class index per image, for every output
predictions = {k: np.argmax(batch_predictions[k], axis=1) for k in key_list}

attr_columns = [f"attr_{number}" for number in range(1, 11)]
prediction_rows = []

for batch_idx, img_id in enumerate(test_data['id']):
    # Map each output's class index back to its encoder feature name
    current_predictions = [
        attribute_names[f'attr{i+1}'][predictions[k][batch_idx]]
        for i, k in enumerate(key_list)
    ]

    # Clean the predictions
    cleaned_pred = remove_attr_prefix(current_predictions)

    # Ensure exactly 10 entries (padding with dummy_value if needed)
    cleaned_pred += ["dummy_value"] * (10 - len(cleaned_pred))
    prediction_rows.append(cleaned_pred)

    print(f"Image {img_id} predictions:", cleaned_pred)

# Write all predictions in one step; a boolean-mask .loc update per image
# rescans the whole 'id' column every time (quadratic in the number of rows).
prediction_frame = pd.DataFrame(prediction_rows, columns=attr_columns, index=test_data.index)
for column in attr_columns:
    test_data[column] = prediction_frame[column]

### iii) Code for single image (men tshirt)

In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import OneHotEncoder

# Initialize separate encoders for each attribute (men tshirts use five);
# they are only used to map predicted class indices back to label names.
# NOTE(review): refitted here on men_tshirt_df - the index -> label mapping
# is only valid if it matches the encoders used at training time; confirm.
encoder_attr1 = OneHotEncoder(sparse_output=False)
encoder_attr2 = OneHotEncoder(sparse_output=False)
encoder_attr3 = OneHotEncoder(sparse_output=False)
encoder_attr4 = OneHotEncoder(sparse_output=False)
encoder_attr5 = OneHotEncoder(sparse_output=False)

df=men_tshirt_df
# Fit each encoder to its respective attribute column in the training data
encoder_attr1.fit(df['attr_1'].values.reshape(-1, 1))
encoder_attr2.fit(df['attr_2'].values.reshape(-1, 1))
encoder_attr3.fit(df['attr_3'].values.reshape(-1, 1))
encoder_attr4.fit(df['attr_4'].values.reshape(-1, 1))
encoder_attr5.fit(df['attr_5'].values.reshape(-1, 1))


In [None]:
# Function to preprocess the image for prediction
def preprocess_image(image_path):
    """Load one image and turn it into a single-item batch for the model.

    Args:
        image_path (str): Path of the image file.

    Returns:
        np.ndarray: float32 RGB array of shape (1, 224, 224, 3), scaled to [0, 1].

    Raises:
        ValueError: if the image cannot be read.
    """
    img = cv2.imread(image_path)  # Load image (BGR)
    if img is None:
        # cv2.imread signals failure by returning None; fail with the same
        # error the batch preprocessing raises instead of a cv2.resize error.
        raise ValueError(f"Could not load image: {image_path}")
    img = cv2.resize(img, (224, 224))  # Resize to match model input
    img = img.astype(np.float32) / 255.0  # Normalize to float32
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

# Load the model
model = load_model('model_men_tshirt.keras')  # Replace with your actual model path

# Load an image and preprocess it
image_path = "test_images/000196.jpg"  # Change to your image path
preprocessed_image = preprocess_image(image_path)

# Make predictions for each attribute's output layer
predictions = model.predict(preprocessed_image)

# Attribute names using feature names from each encoder
attribute_names = {
    'attr1': encoder_attr1.get_feature_names_out(['attr1']),
    'attr2': encoder_attr2.get_feature_names_out(['attr2']),
    'attr3': encoder_attr3.get_feature_names_out(['attr3']),
    'attr4': encoder_attr4.get_feature_names_out(['attr4']),
    'attr5': encoder_attr5.get_feature_names_out(['attr5'])
}


def _outputs_in_attribute_order(raw_predictions):
    """Return the model outputs as a list ordered attr1, attr2, ...

    Handles the shapes ``model.predict`` can return: a dict keyed by output
    name (sorted by numeric suffix, names without one keep their position),
    a list/tuple with one array per output, or a single array for a
    one-output model.
    """
    if isinstance(raw_predictions, dict):
        names = list(raw_predictions)

        def attribute_number(name):
            suffix = name.rsplit('_', 1)[-1]
            return int(suffix) if suffix.isdigit() else names.index(name) + 1

        return [raw_predictions[name] for name in sorted(names, key=attribute_number)]
    if isinstance(raw_predictions, (list, tuple)):
        return list(raw_predictions)
    return [raw_predictions]


# Decode predictions for each attribute. Indexing the raw result with an
# integer only works for list outputs; a dict-output model raises KeyError.
ordered_outputs = _outputs_in_attribute_order(predictions)
predicted_labels = []
for i, (attr_name, feature_names) in enumerate(attribute_names.items()):
    attribute_prediction = ordered_outputs[i]  # Prediction for the current attribute
    predicted_class_index = np.argmax(attribute_prediction)  # Index of highest probability
    predicted_labels.append(feature_names[predicted_class_index])  # Map index to class name

# Output the predicted attributes
print("Predicted attributes:", predicted_labels)

### iv) Test for batch of images (for men tshirt)

In [None]:
import re

def preprocess_images_batch(image_paths):
    """
    Load and normalise a list of images into one array for batch prediction.

    Each image is resized to 224x224, scaled to float32 in [0, 1] and
    converted from BGR to RGB.

    Args:
        image_paths (list): List of paths to images

    Returns:
        np.array: Batch of preprocessed images

    Raises:
        ValueError: if any image cannot be read.
    """
    batch = []
    for path in image_paths:
        bgr = cv2.imread(path)
        if bgr is None:
            raise ValueError(f"Could not load image: {path}")
        resized = cv2.resize(bgr, (224, 224))
        scaled = resized.astype(np.float32) / 255.0
        batch.append(cv2.cvtColor(scaled, cv2.COLOR_BGR2RGB))

    return np.array(batch)

def clean_prediction_labels(predictions):
    """
    Strip the encoder's 'attrn_' prefix from prediction labels in a nested list.

    A label of the form 'attrn_default_m...' becomes 'default'. For every
    other label only a leading 'attrn_' is removed; text inside the label
    that happens to look like 'attrn_' is left untouched.

    Args:
        predictions (list of list): One inner list of prefixed labels per image.

    Returns:
        list of list: Cleaned labels in the same nested structure.
    """
    cleaned_predictions = []
    for pred_list in predictions:
        cleaned_pred_list = []
        for pred in pred_list:
            # Replace 'attrn_default_n' with 'default'
            if re.match(r'attr\d+_default_\d+', pred):
                cleaned_pred_list.append("default")
            else:
                # Anchor at the start so only the prefix is removed
                cleaned_pred_list.append(re.sub(r'^attr\d+_', '', pred))
        cleaned_predictions.append(cleaned_pred_list)

    return cleaned_predictions



def _outputs_in_attribute_order(raw_predictions):
    """Return the model outputs as a list ordered attr1, attr2, ...

    Handles the shapes ``model.predict`` can return: a dict keyed by output
    name (sorted by numeric suffix, names without one keep their position),
    a list/tuple with one array per output, or a single array for a
    one-output model.
    """
    if isinstance(raw_predictions, dict):
        names = list(raw_predictions)

        def attribute_number(name):
            suffix = name.rsplit('_', 1)[-1]
            return int(suffix) if suffix.isdigit() else names.index(name) + 1

        return [raw_predictions[name] for name in sorted(names, key=attribute_number)]
    if isinstance(raw_predictions, (list, tuple)):
        return list(raw_predictions)
    return [raw_predictions]


def predict_batch(model, image_paths, attribute_names):
    """
    Predict the attribute labels for a batch of images.

    Args:
        model: Loaded Keras model
        image_paths (list): List of image paths
        attribute_names (dict): Maps attribute name to the encoder feature
            names of that attribute, in output order.

    Returns:
        list: One list per image holding the predicted feature name of every
        attribute. Empty when ``image_paths`` is empty.

    Raises:
        IndexError: if the model has fewer outputs than ``attribute_names``.
    """
    if len(image_paths) == 0:
        return []

    # Preprocess all images and predict for the whole batch at once
    batch_images = preprocess_images_batch(image_paths)
    batch_predictions = model.predict(batch_images)

    # Normalise list / dict / single-array results to one array per attribute.
    # A single array used to be indexed down to one scalar score, whose
    # argmax is always 0, so the first class was returned for every image.
    per_attribute = _outputs_in_attribute_order(batch_predictions)

    all_predictions = []
    for img_idx in range(len(image_paths)):
        image_predictions = []
        for attr_idx, feature_names in enumerate(attribute_names.values()):
            scores = per_attribute[attr_idx][img_idx]
            image_predictions.append(feature_names[np.argmax(scores)])
        all_predictions.append(image_predictions)

    return all_predictions

In [None]:
# Load the model
model = load_model('model_men_tshirt.keras')

# Process test data
test_data = pd.read_csv("Men Tshirts_test_data.csv")
test_data['id'] = test_data['id'].astype(str).str.zfill(6)

# Create list of image paths
image_paths = [f"test_images/{img_id}.jpg" for img_id in test_data['id']]

# Attribute names using feature names from each encoder
attribute_names = {
    'attr1': encoder_attr1.get_feature_names_out(['attr1']),
    'attr2': encoder_attr2.get_feature_names_out(['attr2']),
    'attr3': encoder_attr3.get_feature_names_out(['attr3']),
    'attr4': encoder_attr4.get_feature_names_out(['attr4']),
    'attr5': encoder_attr5.get_feature_names_out(['attr5'])
}

# Get predictions for all images at once
predictions = predict_batch(model, image_paths, attribute_names)

attr_columns = [f"attr_{number}" for number in range(1, 11)]
prediction_rows = []

for img_id, cleaned_pred in zip(test_data['id'], clean_prediction_labels(predictions)):
    # Ensure the cleaned_pred list has exactly 10 entries (adding 'dummy_value' as needed)
    cleaned_pred += ["dummy_value"] * (10 - len(cleaned_pred))
    prediction_rows.append(cleaned_pred)

    print(f"Image {img_id} predictions:", cleaned_pred)

# Write all predictions in one step; a boolean-mask .loc update per image
# rescans the whole 'id' column every time (quadratic in the number of rows).
prediction_frame = pd.DataFrame(prediction_rows, columns=attr_columns, index=test_data.index)
for column in attr_columns:
    test_data[column] = prediction_frame[column]


In [None]:
"""test_data.to_csv("predictions.csv",index=False)"""

# Combining data

In [None]:
# Read the predictions accumulated so far. Keep 'id' as text: parsed as an
# integer it loses its leading zeros, and concatenating with the zero-padded
# string ids of test_data would then mix two id formats in one column.
data=pd.read_csv("predictions.csv", dtype={"id": str})

In [None]:
print(data)

In [None]:
print(test_data)

In [None]:
# Append the predictions of the current category to the accumulated ones.
# ignore_index=False keeps each frame's own index labels, so labels can repeat.
combined_df = pd.concat([data, test_data], ignore_index=False)

In [None]:
print(combined_df)

In [None]:
combined_df.to_csv("predictions.csv",index=False)