In [26]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

data_path = '/content/drive/MyDrive/dataset/FinalFashionDataset.csv'
data = pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'FinalFashioDataset.csv'

In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Select relevant columns
df = data[['user_id', 'id', 'gender', 'mastercategory', 'subcategory', 'articletype',
           'basecolour', 'season', 'year', 'usage', 'ratings', 'Price (USD)']]

# Encode user and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['id'] = item_encoder.fit_transform(df['id'])

# One-hot encode additional categorical columns
df = pd.get_dummies(df, columns=['gender', 'mastercategory', 'subcategory',
                                  'articletype', 'basecolour', 'season', 'usage'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = user_encoder.fit_transform(df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = item_encoder.fit_transform(df['id'])


In [30]:
# Normalize continuous features
scaler = MinMaxScaler()
df[['year', 'Price (USD)']] = scaler.fit_transform(df[['year', 'Price (USD)']])

# Split data into train and test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Prepare input arrays
X_train = [train['user_id'].values, train['id'].values, train.drop(columns=['user_id', 'id', 'ratings']).values]
X_test = [test['user_id'].values, test['id'].values, test.drop(columns=['user_id', 'id', 'ratings']).values]
y_train = train['ratings'].values
y_test = test['ratings'].values

In [31]:
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate, BatchNormalization
from keras.regularizers import l2

# Define embedding size and regularization
embedding_size = 50
reg = l2(1e-6)  # L2 regularization

# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=len(user_encoder.classes_),
                           output_dim=embedding_size, embeddings_regularizer=reg,
                           name='user_embedding')(user_input)
user_vec = Flatten(name='flatten_user')(user_embedding)

# Item input and embedding
item_input = Input(shape=(1,), name='item_input')
item_embedding = Embedding(input_dim=len(item_encoder.classes_),
                           output_dim=embedding_size, embeddings_regularizer=reg,
                           name='item_embedding')(item_input)
item_vec = Flatten(name='flatten_item')(item_embedding)

# Additional features input
additional_input = Input(shape=(X_train[2].shape[1],), name='additional_input')

# Concatenate all inputs
concat = Concatenate()([user_vec, item_vec, additional_input])

# Add dense layers with batch normalization and dropout
dense1 = Dense(256, activation='relu', kernel_regularizer=reg)(concat)
batch_norm1 = BatchNormalization()(dense1)
dropout1 = Dropout(0.4)(batch_norm1)

dense2 = Dense(128, activation='relu', kernel_regularizer=reg)(dropout1)
batch_norm2 = BatchNormalization()(dense2)
dropout2 = Dropout(0.3)(batch_norm2)

dense3 = Dense(64, activation='relu', kernel_regularizer=reg)(dropout2)
batch_norm3 = BatchNormalization()(dense3)
dropout3 = Dropout(0.2)(batch_norm3)

output = Dense(1, activation='linear', name='output')(dropout3)

# Compile the model
model = Model(inputs=[user_input, item_input, additional_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


In [32]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np

# Convert datasets to TensorFlow tensors
X_train_user = tf.convert_to_tensor(X_train[0], dtype=tf.int32)
X_train_item = tf.convert_to_tensor(X_train[1], dtype=tf.int32)
X_train_features = tf.convert_to_tensor(X_train[2], dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)

X_test_user = tf.convert_to_tensor(X_test[0], dtype=tf.int32)
X_test_item = tf.convert_to_tensor(X_test[1], dtype=tf.int32)
X_test_features = tf.convert_to_tensor(X_test[2], dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

# Define hyperparameters
batch_size = 256
epochs = 50
learning_rate = 0.001

# Define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Get the number of batches
num_batches = len(X_train_user) // batch_size

# Create an instance of the MeanSquaredError loss function
mse_loss = tf.keras.losses.MeanSquaredError()

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    epoch_loss = 0.0

    # Shuffle the training data
    indices = np.arange(len(X_train_user))
    np.random.shuffle(indices)
    X_train_user = tf.gather(X_train_user, indices)
    X_train_item = tf.gather(X_train_item, indices)
    X_train_features = tf.gather(X_train_features, indices)
    y_train_tensor = tf.gather(y_train_tensor, indices)

    # Mini-batch gradient descent
    for i in tqdm(range(num_batches)):
        start = i * batch_size
        end = start + batch_size

        # Create mini-batches
        batch_user = X_train_user[start:end]
        batch_item = X_train_item[start:end]
        batch_features = X_train_features[start:end]
        batch_labels = y_train_tensor[start:end]

        with tf.GradientTape() as tape:
           # Forward pass
           predictions = model([batch_user, batch_item, batch_features], training=True)

           # Compute loss using the instantiated object
           loss = mse_loss(batch_labels, predictions) # Changed line to call the object

        # Compute gradients and update weights
        gradients = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))

        # Accumulate loss for monitoring
        epoch_loss += loss.numpy()

    # Calculate average loss for the epoch
    epoch_loss /= num_batches
    print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}")

    # Evaluate the model on the test set (optional)
    test_predictions = model([X_test_user, X_test_item, X_test_features], training=False)
    # Call the mse function directly
    test_loss = tf.reduce_mean(tf.keras.losses.mse(y_test_tensor, test_predictions))
    print(f"Test Loss: {test_loss:.4f}")


Epoch 1/50


100%|██████████| 114/114 [00:20<00:00,  5.54it/s]


Epoch 1 Loss: 11.4302
Test Loss: 4.9446
Epoch 2/50


100%|██████████| 114/114 [00:23<00:00,  4.92it/s]


Epoch 2 Loss: 3.8289
Test Loss: 1.3012
Epoch 3/50


100%|██████████| 114/114 [00:21<00:00,  5.37it/s]


Epoch 3 Loss: 1.9645
Test Loss: 1.0855
Epoch 4/50


100%|██████████| 114/114 [00:24<00:00,  4.70it/s]


Epoch 4 Loss: 1.5466
Test Loss: 0.9480
Epoch 5/50


100%|██████████| 114/114 [00:20<00:00,  5.51it/s]


Epoch 5 Loss: 1.2089
Test Loss: 0.8903
Epoch 6/50


100%|██████████| 114/114 [00:21<00:00,  5.40it/s]


Epoch 6 Loss: 0.9043
Test Loss: 0.9316
Epoch 7/50


100%|██████████| 114/114 [00:21<00:00,  5.42it/s]


Epoch 7 Loss: 0.7043
Test Loss: 0.9144
Epoch 8/50


100%|██████████| 114/114 [00:21<00:00,  5.33it/s]


Epoch 8 Loss: 0.5804
Test Loss: 0.8867
Epoch 9/50


100%|██████████| 114/114 [00:21<00:00,  5.26it/s]


Epoch 9 Loss: 0.4951
Test Loss: 0.8841
Epoch 10/50


100%|██████████| 114/114 [00:20<00:00,  5.43it/s]


Epoch 10 Loss: 0.4191
Test Loss: 0.8756
Epoch 11/50


100%|██████████| 114/114 [00:22<00:00,  5.18it/s]


Epoch 11 Loss: 0.3604
Test Loss: 0.8594
Epoch 12/50


100%|██████████| 114/114 [00:21<00:00,  5.40it/s]


Epoch 12 Loss: 0.3245
Test Loss: 0.8571
Epoch 13/50


100%|██████████| 114/114 [00:22<00:00,  5.04it/s]


Epoch 13 Loss: 0.2888
Test Loss: 0.8565
Epoch 14/50


100%|██████████| 114/114 [00:20<00:00,  5.49it/s]


Epoch 14 Loss: 0.2662
Test Loss: 0.8482
Epoch 15/50


100%|██████████| 114/114 [00:22<00:00,  5.15it/s]


Epoch 15 Loss: 0.2411
Test Loss: 0.8483
Epoch 16/50


100%|██████████| 114/114 [00:21<00:00,  5.38it/s]


Epoch 16 Loss: 0.2252
Test Loss: 0.8442
Epoch 17/50


100%|██████████| 114/114 [00:21<00:00,  5.32it/s]


Epoch 17 Loss: 0.2060
Test Loss: 0.8403
Epoch 18/50


100%|██████████| 114/114 [00:21<00:00,  5.27it/s]


Epoch 18 Loss: 0.1930
Test Loss: 0.8439
Epoch 19/50


100%|██████████| 114/114 [00:21<00:00,  5.36it/s]


Epoch 19 Loss: 0.1846
Test Loss: 0.8306
Epoch 20/50


100%|██████████| 114/114 [00:22<00:00,  5.17it/s]


Epoch 20 Loss: 0.1696
Test Loss: 0.8287
Epoch 21/50


100%|██████████| 114/114 [00:20<00:00,  5.48it/s]


Epoch 21 Loss: 0.1631
Test Loss: 0.8302
Epoch 22/50


100%|██████████| 114/114 [00:22<00:00,  5.15it/s]


Epoch 22 Loss: 0.1560
Test Loss: 0.8364
Epoch 23/50


100%|██████████| 114/114 [00:20<00:00,  5.48it/s]


Epoch 23 Loss: 0.1464
Test Loss: 0.8316
Epoch 24/50


100%|██████████| 114/114 [00:21<00:00,  5.23it/s]


Epoch 24 Loss: 0.1423
Test Loss: 0.8247
Epoch 25/50


100%|██████████| 114/114 [00:20<00:00,  5.45it/s]


Epoch 25 Loss: 0.1339
Test Loss: 0.8306
Epoch 26/50


100%|██████████| 114/114 [00:21<00:00,  5.41it/s]


Epoch 26 Loss: 0.1317
Test Loss: 0.8290
Epoch 27/50


100%|██████████| 114/114 [00:22<00:00,  5.16it/s]


Epoch 27 Loss: 0.1286
Test Loss: 0.8199
Epoch 28/50


100%|██████████| 114/114 [00:20<00:00,  5.51it/s]


Epoch 28 Loss: 0.1236
Test Loss: 0.8247
Epoch 29/50


100%|██████████| 114/114 [00:21<00:00,  5.21it/s]


Epoch 29 Loss: 0.1234
Test Loss: 0.8233
Epoch 30/50


100%|██████████| 114/114 [00:20<00:00,  5.51it/s]


Epoch 30 Loss: 0.1172
Test Loss: 0.8271
Epoch 31/50


100%|██████████| 114/114 [00:22<00:00,  5.14it/s]


Epoch 31 Loss: 0.1150
Test Loss: 0.8233
Epoch 32/50


100%|██████████| 114/114 [00:21<00:00,  5.42it/s]


Epoch 32 Loss: 0.1135
Test Loss: 0.8204
Epoch 33/50


100%|██████████| 114/114 [00:21<00:00,  5.30it/s]


Epoch 33 Loss: 0.1078
Test Loss: 0.8236
Epoch 34/50


100%|██████████| 114/114 [00:21<00:00,  5.26it/s]


Epoch 34 Loss: 0.1085
Test Loss: 0.8286
Epoch 35/50


100%|██████████| 114/114 [00:20<00:00,  5.47it/s]


Epoch 35 Loss: 0.1071
Test Loss: 0.8226
Epoch 36/50


100%|██████████| 114/114 [00:22<00:00,  5.11it/s]


Epoch 36 Loss: 0.1034
Test Loss: 0.8299
Epoch 37/50


100%|██████████| 114/114 [00:20<00:00,  5.47it/s]


Epoch 37 Loss: 0.1035
Test Loss: 0.8119
Epoch 38/50


100%|██████████| 114/114 [00:22<00:00,  5.07it/s]


Epoch 38 Loss: 0.1022
Test Loss: 0.8236
Epoch 39/50


100%|██████████| 114/114 [00:21<00:00,  5.42it/s]


Epoch 39 Loss: 0.0979
Test Loss: 0.8204
Epoch 40/50


100%|██████████| 114/114 [00:21<00:00,  5.19it/s]


Epoch 40 Loss: 0.0973
Test Loss: 0.8213
Epoch 41/50


100%|██████████| 114/114 [00:21<00:00,  5.34it/s]


Epoch 41 Loss: 0.0963
Test Loss: 0.8113
Epoch 42/50


100%|██████████| 114/114 [00:21<00:00,  5.42it/s]


Epoch 42 Loss: 0.0981
Test Loss: 0.8288
Epoch 43/50


100%|██████████| 114/114 [00:21<00:00,  5.19it/s]


Epoch 43 Loss: 0.0951
Test Loss: 0.8235
Epoch 44/50


100%|██████████| 114/114 [00:21<00:00,  5.34it/s]


Epoch 44 Loss: 0.0932
Test Loss: 0.8206
Epoch 45/50


100%|██████████| 114/114 [00:22<00:00,  5.03it/s]


Epoch 45 Loss: 0.0906
Test Loss: 0.8250
Epoch 46/50


100%|██████████| 114/114 [00:21<00:00,  5.40it/s]


Epoch 46 Loss: 0.0924
Test Loss: 0.8318
Epoch 47/50


100%|██████████| 114/114 [00:22<00:00,  5.13it/s]


Epoch 47 Loss: 0.0894
Test Loss: 0.8403
Epoch 48/50


100%|██████████| 114/114 [00:20<00:00,  5.57it/s]


Epoch 48 Loss: 0.0861
Test Loss: 0.8261
Epoch 49/50


100%|██████████| 114/114 [00:21<00:00,  5.19it/s]


Epoch 49 Loss: 0.0844
Test Loss: 0.8275
Epoch 50/50


100%|██████████| 114/114 [00:21<00:00,  5.26it/s]


Epoch 50 Loss: 0.0845
Test Loss: 0.8253


In [33]:
# Prepare input arrays
X_train = [train['user_id'].values, train['id'].values, train.drop(columns=['user_id', 'id', 'ratings']).values]
X_test = [test['user_id'].values, test['id'].values, test.drop(columns=['user_id', 'id', 'ratings']).values]
y_train = train['ratings'].values
y_test = test['ratings'].values

# ----> Convert X_test elements to TensorFlow tensors <----
X_test = [tf.convert_to_tensor(X_test[0], dtype=tf.int32),
          tf.convert_to_tensor(X_test[1], dtype=tf.int32),
          tf.convert_to_tensor(X_test[2], dtype=tf.float32)]

# Evaluate on test data
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss (MSE): {loss:.4f}")
print(f"Test Mean Absolute Error (MAE): {mae:.4f}")


Test Loss (MSE): 0.8298
Test Mean Absolute Error (MAE): 0.6780


In [44]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_items(input_user_id, input_item_id, num_recommendations=5, filters={}):
    """
    Recommends items based on cosine similarity with the input item,
    personalized for the user, and optionally filtered by conditions.

    Args:
        input_user_id: The ID of the user.
        input_item_id: The ID of the input item.
        num_recommendations: The number of recommendations to generate.
        filters: A dictionary of filters for additional conditions
                 (e.g., {'masterCategory': 'Clothing', 'season': 'Winter'}).

    Returns:
        A DataFrame containing the recommendations.
    """
    # Get item index for the given item ID
    item_idx = item_encoder.transform([input_item_id])[0]

    # Extract item embeddings from the model
    item_embedding_weights = model.get_layer('item_embedding').get_weights()[0]

    # Get embedding for the input item
    input_item_embedding = item_embedding_weights[item_idx].reshape(1, -1)

    # Compute cosine similarity with all items
    similarities = cosine_similarity(input_item_embedding, item_embedding_weights)[0]

    # Get top-N recommendations by similarity score
    top_indices = similarities.argsort()[-(num_recommendations * 10 + 1):][::-1][1:]  # More for filtering
    recommended_item_ids = item_encoder.inverse_transform(top_indices)

    # Retrieve recommended item details
    recommendations = df[df['id'].isin(recommended_item_ids)]

    # Apply additional filters
    for key, value in filters.items():
        # Check if the filter key is in the recommendations columns
        if key in recommendations.columns:
            recommendations = recommendations[recommendations[key] == value]
        # Handle the case where the filter key is 'mastercategory'
        # and the DataFrame might have 'masterCategory' instead
        elif key == 'mastercategory' and 'masterCategory' in recommendations.columns:
            recommendations = recommendations[recommendations['masterCategory'] == value]

    # Ensure items are not the same as the input item
    recommendations = recommendations[recommendations['id'] != input_item_id]

    # Filter by user preferences if `user_id` is part of the dataset
    if 'user_id' in df.columns:
        user_preferences = df[df['user_id'] == input_user_id]
         # Handle the case where the DataFrame might have 'masterCategory' instead of 'mastercategory'
        if 'mastercategory' in user_preferences.columns:
            preferred_categories = user_preferences['mastercategory'].unique()
        elif 'masterCategory' in user_preferences.columns:
            preferred_categories = user_preferences['masterCategory'].unique()
        else:
            preferred_categories = []  # Handle case where neither column exists


        # Recommend items only within the user's preferred categories
        if 'masterCategory' in recommendations.columns:
            recommendations = recommendations[recommendations['mastercategory'].isin(preferred_categories)]

    # Sort by similarity score, ratings, and price (if columns exist)
    if 'ratings' in recommendations.columns and 'Price (USD)' in recommendations.columns:
        recommendations['similarity_score'] = recommendations['id'].map(dict(zip(recommended_item_ids, similarities[top_indices])))
        recommendations = recommendations.sort_values(by=['similarity_score', 'ratings', 'Price (USD)'], ascending=[False, False, True])

    # Select top recommendations and desired columns
    recommendations = recommendations.head(num_recommendations)
    available_cols = ['id', 'ratings', 'Price (USD)'] + \
                     [col for col in ['mastercategory', 'subcategory', 'articletype', 'season', 'basecolour']
                      if col in recommendations.columns]
    recommendations = recommendations[available_cols]

    # Add a combined 'Category' column for masterCategory and subCategory
    if 'mastercategory' in recommendations.columns and 'subcategory' in recommendations.columns:
        recommendations['Category'] = recommendations['mastercategory'] + ' - ' + recommendations['subcategory']

    return recommendations

# Example usage with filters
filters = {
    'mastercategory': 'Clothing',
    'season': 'Winter'
}
recommendations = recommend_items(input_user_id=101, input_item_id=3940, num_recommendations=5, filters=filters)
print(recommendations)


          id  ratings  Price (USD)
8502   27353        3           32
35087  17607        5           44
35538  35788        4           41
16498   6083        4           32
34483   5679        1           17


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Predict on test set
y_pred = model.predict(X_test)

# Convert predictions to binary classes
y_pred_classes = (y_pred > 0.5).astype(int)

# Calculate metrics
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_test, y_pred_classes)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")


[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Precision: 0.0111
Recall: 0.1054
F1-score: 0.0201
Accuracy: 0.1054


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
