<a href="https://colab.research.google.com/github/JPleal01/Dio_IA/blob/main/Rede_recomenda%C3%A7%C3%A3o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle

In [None]:
%%writefile kaggle.json
{"username":"<your kaggle username>","key":"<your kaggle api key>"}

In [None]:
import os

# Create ~/.kaggle (no error if it already exists) and move the kaggle.json
# written by the previous cell into it.
# NOTE(review): presumably this is the location the kaggle CLI reads its
# credentials from -- confirm against the Kaggle API documentation.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI (the unzip step below
# expects it as fashion-product-images-small.zip in the working directory).
!kaggle datasets download -d paramaggarwal/fashion-product-images-small

# Extract quietly (-q) into the directory that the loading cell reads from.
!unzip -q fashion-product-images-small.zip -d fashion-product-images-small

In [None]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import pandas as pd

# Locations inside the extracted dataset directory.
dataset_path = 'fashion-product-images-small'
styles_path = os.path.join(dataset_path, 'styles.csv')
images_dir = os.path.join(dataset_path, 'images')

# Load the product metadata (image ids and categories); malformed CSV rows are skipped.
styles_df = pd.read_csv(styles_path, on_bad_lines='skip')

# The four subcategories used by the recommendation program.
selected_subcategories = ['Topwear', 'Bottomwear', 'Shoes', 'Dress']

# Keep only the rows that belong to the selected subcategories.
filtered_df = styles_df[styles_df['subCategory'].isin(selected_subcategories)]

print("Shape of filtered_df:", filtered_df.shape)
print("First 5 rows of filtered_df:")
display(filtered_df.head())

# These three lists are kept strictly parallel: position i of each refers to
# the same product. An entry is appended to all three only after the image
# has been loaded successfully, so nothing ever has to be removed afterwards.
# (Removing items from a list while iterating over it skips elements and can
# leave images and labels misaligned.)
image_file_paths = []
labels = []
preprocessed_images = []

# NOTE(review): every image is held in memory as a 224x224x3 float32 array;
# for a large selection this can exceed available RAM -- confirm it fits.
for image_id, subcategory in zip(filtered_df['id'], filtered_df['subCategory']):
    # Images are assumed to be stored as "<id>.jpg" inside images_dir.
    image_path = os.path.join(images_dir, f'{image_id}.jpg')

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        # The context manager closes the underlying file handle promptly.
        with Image.open(image_path) as img:
            # Force RGB, resize to the network input size, scale to [0, 1].
            resized = img.convert('RGB').resize((224, 224))
            img_array = np.array(resized).astype('float32') / 255.0
    except Exception as e:
        # Best effort: report the unreadable image and move on without it.
        print(f"Error loading or processing image {image_path}: {e}")
        continue

    image_file_paths.append(image_path)
    labels.append(subcategory)
    preprocessed_images.append(img_array)

# Guard the invariant that later cells rely on.
assert len(preprocessed_images) == len(labels) == len(image_file_paths)

# Convert lists to NumPy arrays.
preprocessed_images_np = np.array(preprocessed_images)
labels_np = np.array(labels)

# Print shapes to verify.
print("Shape of preprocessed images:", preprocessed_images_np.shape)
print("Shape of labels:", labels_np.shape)

# Print the unique labels to confirm the selected categories are present.
print("Unique labels (subcategories):", np.unique(labels_np))

In [None]:
import pandas as pd
import os

# Reload the product metadata and list every distinct value of the
# 'subCategory' column, so the subcategories selected for the recommender
# can be checked against what the dataset actually contains.
dataset_path = 'fashion-product-images-small'
styles_path = os.path.join(dataset_path, 'styles.csv')
styles_df = pd.read_csv(styles_path, on_bad_lines='skip')

print(
    "Unique values in 'subCategory' column:",
    styles_df['subCategory'].unique(),
    sep="\n",
)

In [None]:
# 6. Split the preprocessed data into training and testing sets.
# This split must actually run: the model-training and feature-extraction
# cells below read X_train, X_test, y_train and y_test.
# `stratify` keeps the subcategory proportions equal in both splits and
# `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_images_np, labels_np, test_size=0.2, random_state=42, stratify=labels_np)

# Print shapes of the splits to verify
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

# 1. Input tensor matching the 224x224 RGB images produced by the loading cell.
input_shape = (224, 224, 3)
input_tensor = Input(shape=input_shape)

# 2. Load ResNet50 with ImageNet weights, without its top classification layer.
# NOTE(review): the loading cell scales pixels to [0, 1]; ImageNet-pretrained
# ResNet50 is usually paired with its own `preprocess_input` -- confirm that
# the current scaling is intended.
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=input_tensor)

# 3. Freeze every layer of the base model so its weights are not updated
# when a model built on top of it is trained.
for layer in base_model.layers:
    layer.trainable = False

# Wrap the base model's input/output in a model of its own; later cells attach
# new layers to `feature_extractor.output`.
feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)

# Print the model summary to verify the architecture and frozen layers
feature_extractor.summary()

In [None]:
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.models import Model
import tensorflow as tf

# NOTE(review): the next cell repeats this same model construction and
# training; keeping only one of the two avoids training twice.

# Start from the output of the frozen feature extractor.
x = feature_extractor.output

# Add the new classification head on top of the frozen base model's output.
x = Flatten()(x)
x = Dense(256, activation='relu')(x)  # dense layer with 256 units and ReLU activation
x = Dropout(0.5)(x)  # dropout for regularization
# The final Dense layer has one output unit per product category.
num_product_categories = len(np.unique(y_train))
predictions = Dense(num_product_categories, activation='softmax')(x)

# Final model: feature extractor input -> new softmax head.
model = Model(inputs=feature_extractor.input, outputs=predictions)

# Compile for multi-class classification with one-hot targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print the model summary to verify the new layers
model.summary()

# Prepare the labels for training.
# The labels are strings, and `to_categorical` needs integer class indices,
# so the strings are mapped to integers first. Calling `to_categorical`
# directly on the string labels raises an error.
label_to_int = {label: i for i, label in enumerate(np.unique(y_train))}
y_train_int = np.array([label_to_int[label] for label in y_train])
y_test_int = np.array([label_to_int[label] for label in y_test])

# One-hot encode the integer labels.
y_train_one_hot = tf.keras.utils.to_categorical(y_train_int, num_classes=num_product_categories)
y_test_one_hot = tf.keras.utils.to_categorical(y_test_int, num_classes=num_product_categories)

# Train the new head (the base model's layers are frozen).
epochs = 10  # number of epochs
batch_size = 32  # batch size

history = model.fit(X_train, y_train_one_hot,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_test, y_test_one_hot))

In [None]:
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np # Ensure numpy is imported if not already

# Classification head stacked on the frozen feature extractor:
# Flatten -> Dense(256, relu) -> Dropout(0.5) -> Dense(n_classes, softmax).
x = feature_extractor.output
for head_layer in (Flatten(),
                   Dense(256, activation='relu'),
                   Dropout(0.5)):
    x = head_layer(x)

# One softmax output unit per distinct label found in the training split.
class_names = np.unique(y_train)
num_product_categories = len(class_names)
predictions = Dense(num_product_categories, activation='softmax')(x)

# End-to-end model: feature extractor input -> softmax predictions.
model = Model(inputs=feature_extractor.input, outputs=predictions)

# Adam optimizer, categorical cross-entropy on one-hot targets, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the architecture, including the newly added layers.
model.summary()

# String labels -> integer ids (position in the sorted unique labels),
# then integer ids -> one-hot vectors.
label_to_int = dict(zip(class_names, range(num_product_categories)))
y_train_int = np.array([label_to_int[name] for name in y_train])
y_test_int = np.array([label_to_int[name] for name in y_test])

y_train_one_hot = tf.keras.utils.to_categorical(
    y_train_int, num_classes=num_product_categories)
y_test_one_hot = tf.keras.utils.to_categorical(
    y_test_int, num_classes=num_product_categories)

# Training configuration.
epochs = 10
batch_size = 32

# Fit on the training split, validating on the held-out split each epoch.
history = model.fit(
    X_train,
    y_train_one_hot,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test_one_hot),
)

In [None]:
# Feature model: same input as the classifier, but it stops at the
# second-to-last layer, i.e. just before the final softmax layer.
feature_extraction_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output,
)

# Re-join the train and test splits so that features are computed for the
# whole dataset; images and labels are concatenated in the same order.
all_images = np.concatenate([X_train, X_test], axis=0)
all_labels = np.concatenate([y_train, y_test], axis=0)

# One feature vector per image.
image_features = feature_extraction_model.predict(all_images)

print("Shape of extracted features:", image_features.shape)
print("Shape of corresponding labels:", all_labels.shape)

# Keep the features and their labels under the names the similarity and
# recommendation cells use.
stored_features, stored_labels = image_features, all_labels

print("Shape of stored features:", stored_features.shape)
print("Shape of stored labels:", stored_labels.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all stored feature vectors.
# Entry [i, j] is the similarity between item i and item j, so the matrix is
# square with one row and one column per item.
similarity_matrix = cosine_similarity(stored_features)

# Print the shape of the similarity matrix to verify
print("Shape of similarity matrix:", similarity_matrix.shape)

In [None]:
# The similarity matrix computed in the previous cell is kept in memory in the
# variable `similarity_matrix`; it is not saved to a file.

# Informational message only: this print does not itself check the variable.
print("Similarity matrix is stored in the variable 'similarity_matrix'.")

In [None]:
import numpy as np

def recommend_products(query_product_index, n_recommendations,
                       similarity=None, labels=None):
    """
    Recommends the products most similar to a query product.

    Args:
        query_product_index: Row index of the query product in the similarity
            matrix. Negative indices count from the end.
        n_recommendations: Maximum number of recommendations to return.
        similarity: Optional square matrix of pairwise similarity scores.
            Defaults to the notebook-level `similarity_matrix`.
        labels: Optional sequence of labels aligned with the rows of
            `similarity`. Defaults to the notebook-level `stored_labels`.

    Returns:
        A list with the labels of the most similar products, most similar
        first. The query product itself is never included. Fewer than
        `n_recommendations` labels are returned when there are not enough
        other products.

    Raises:
        ValueError: If `n_recommendations` is negative.
        IndexError: If `query_product_index` is out of range.
    """
    # Fall back to the arrays computed earlier in the notebook.
    if similarity is None:
        similarity = similarity_matrix
    if labels is None:
        labels = stored_labels

    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")

    n_items = len(similarity)
    if not -n_items <= query_product_index < n_items:
        raise IndexError(
            f"query_product_index {query_product_index} is out of range "
            f"for {n_items} products"
        )
    # Normalize a negative index so it can be compared with ranked positions.
    query = query_product_index % n_items

    # Similarity of every product to the query product.
    scores = np.asarray(similarity[query])

    # Rank from most to least similar. A stable sort keeps tied scores in
    # index order, which makes the result deterministic.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query explicitly. It is not guaranteed to be ranked first:
    # another product with identical features has the same score.
    ranked = ranked[ranked != query]

    return [labels[i] for i in ranked[:n_recommendations]]

# Example run: ask for the three products most similar to the first one.
sample_query_index = 0
num_recommendations = 3

recommended_items = recommend_products(
    query_product_index=sample_query_index,
    n_recommendations=num_recommendations,
)
print(f"Recommended products for product at index {sample_query_index}: {recommended_items}")