In [74]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Flatten, Dense, Concatenate, Multiply, Dropout, Activation, Dot
)
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
from tqdm import tqdm
import re
# Load datasets
user_activity = pd.read_csv('../vibrent/user_activity_triplets.csv', sep=';', engine='python', encoding='utf-8', on_bad_lines='skip', dtype=str, header=0)
outfits = pd.read_csv('../vibrent/outfits.csv', sep=';', engine='python', encoding='utf-8', on_bad_lines='skip', dtype=str, header=0)
picture_triplets = pd.read_csv('../vibrent/picture_triplets.csv', sep=';', engine='python', encoding='utf-8', on_bad_lines='skip', dtype=str, header=0)


## Create mappings because customer.id and outfit.id are strings and not in order.
# Map IDs to indices
user_id_mapping = {id: idx for idx, id in enumerate(user_activity['customer.id'].unique())}
item_id_mapping = {id: idx for idx, id in enumerate(user_activity['outfit.id'].unique())}
# Apply mappings
user_activity['user_idx'] = user_activity['customer.id'].map(user_id_mapping)
user_activity['item_idx'] = user_activity['outfit.id'].map(item_id_mapping)

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocess metadata


## Clean descriptions
## First makes all lowercase, then fills NA with empty string before replacing everything not a-z with empty string before removing leading and trailing whitespace
outfits['description'] = outfits['description'].str.lower().fillna('').replace(r'[^a-z\s]', '', regex=True).str.strip()

# Initialize TF-IDF
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,  # Limit vocabulary to 500 terms
    stop_words='english',  # Remove stopwords
    ngram_range=(1, 2)  # Include unigrams and bigrams
)

# Generate TF-IDF matrix
tfidf_features = tfidf_vectorizer.fit_transform(outfits['description']).toarray()

In [76]:
# Preprocess metadata

# Define the cleaning function. Outfit tags have many brackets we want to remove. 
clean_str = lambda x: re.sub(r"[\'\[\]]", "", x).strip()

#Remove brackets and split the tags on the comma
outfits['tags_list'] = outfits['outfit_tags'].apply(lambda x: [clean_str(tag) for tag in x.split(',')])

# Create a matrix of all the unique tags and outfits.
mlb = MultiLabelBinarizer()
tag_features = mlb.fit_transform(outfits['tags_list'])

## Outfit Group mapped to a numerical value
group_encoder = LabelEncoder()
outfits['group_encoded'] = group_encoder.fit_transform(outfits['group'])

## Numerical Features
numerical_features = ['retailPrice', 'pricePerWeek', 'pricePerMonth']
# FIll NA with median values
outfits[numerical_features] = outfits[numerical_features].apply(pd.to_numeric, errors='coerce').apply(lambda col: col.fillna(col.median()))

# Scale the numerical features
scaler = MinMaxScaler()
scaled_numerical_features = scaler.fit_transform(outfits[numerical_features])

## Combine all item features by cocatenation
item_features = np.concatenate([
    tag_features,
    outfits['group_encoded'].values.reshape(-1, 1),
    scaled_numerical_features,
    tfidf_features
], axis=1)

# Output the dimensionality
print("Final feature matrix dimensions:", item_features.shape)

item_feature_dim = item_features.shape[1]

Final feature matrix dimensions: (15649, 1156)


In [77]:
# Map outfit IDs to item indices, same as we did with user_activity
outfits['item_idx'] = outfits['id'].map(item_id_mapping)

# Create item feature matrix aligned with item indices (item_idx)
num_items = len(item_id_mapping)
item_features_aligned = np.zeros((num_items, item_feature_dim))

for idx, row in outfits.iterrows():
    item_idx = row['item_idx']
    if np.isnan(item_idx):
        continue
    item_idx = int(item_idx)
    item_features_aligned[item_idx] = item_features[idx]

# Generate negative samples, essentially negative sample = outfit user has not interacted with
def generate_negatives(df, num_negatives):
    user_item_set = set(zip(df['user_idx'], df['item_idx']))
    all_items = set(df['item_idx'].unique())
    negatives = []

    for user in tqdm(df['user_idx'].unique()):
        items_rented = set(df[df['user_idx'] == user]['item_idx'])
        non_interacted_items = list(all_items - items_rented)
        sampled_negatives = np.random.choice(
            non_interacted_items, size=num_negatives, replace=True
        )
        negatives.extend([(user, item, 0) for item in sampled_negatives]) # 0 is used in testing and validation

    return negatives

number_negative_per_positive = 5
num_negatives = 28*number_negative_per_positive # Approximately 28 is the average number of rented items per customer.

# Positive samples
user_activity['label'] = 1 # Set all rented items to be 1 for testing and validation
positive_samples = user_activity[['user_idx', 'item_idx', 'label']] # Use mappings and label.

# Negative samples
negative_samples = generate_negatives(user_activity, num_negatives)
negative_df = pd.DataFrame(negative_samples, columns=['user_idx', 'item_idx', 'label'])

# Combine and shuffle the dataset containing both positive and negative samples.
data = pd.concat([positive_samples, negative_df])
data = data.sample(frac=1).reset_index(drop=True)

100%|██████████| 2293/2293 [00:02<00:00, 847.31it/s]


In [78]:
# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Prepare training data
train_user = train_data['user_idx'].values
train_item = train_data['item_idx'].values
train_label = train_data['label'].values
train_item_metadata = item_features_aligned[train_item]

# Prepare testing data
test_user = test_data['user_idx'].values
test_item = test_data['item_idx'].values
test_label = test_data['label'].values
test_item_metadata = item_features_aligned[test_item]

In [80]:
import keras_tuner as kt
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from tensorflow.keras.regularizers import l2

num_users = len(user_id_mapping)

def build_model(hp):
    # Hyperparameters to tune, the embedding dimensions, number of layers in MLP, dropout rate and learning rate
    gmf_embedding_dim = hp.Choice('gmf_embedding_dim', values=[12, 18, 24, 30]) # Too big = missing contextual data, too small = loosing valuable data
    mlp_embedding_dim = hp.Choice('mlp_embedding_dim', values=[32, 40, 48, 56]) # Too big = missing contextual data, too small = loosing valuable data
    num_layers = hp.Int('num_layers', min_value=1, max_value=3) # Input layer + 1-3 hidden layers + output layer
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.2, step=0.05) # To prevent overfitting, randomly turn of neurons
    learning_rate = hp.Choice('learning_rate', values=[5e-3, 1e-3, 5e-4, 1e-4]) # Find optimal learning rate
    
    # Inputs (a user, an outfit with its metadata)
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')
    item_metadata_input = Input(shape=(item_feature_dim,), name='item_metadata_input')
    
    # Embeddings
    user_embedding_gmf = Embedding(num_users, gmf_embedding_dim, name='user_embedding_dim_gmf', embeddings_regularizer=l2(1e-5))(user_input)
    item_embedding_gmf = Embedding(num_items, gmf_embedding_dim, name='item_embedding_gmf', embeddings_regularizer=l2(1e-5))(item_input)
    user_embedding_mlp = Embedding(num_users, mlp_embedding_dim, name='user_embedding_dim_mlp', embeddings_regularizer=l2(1e-4))(user_input)
    item_embedding_mlp = Embedding(num_items, mlp_embedding_dim, name='item_embedding_mlp', embeddings_regularizer=l2(1e-4))(item_input)

    
    # Flatten embeddings
    user_latent_gmf = Flatten()(user_embedding_gmf)
    item_latent_gmf = Flatten()(item_embedding_gmf)
    user_latent_mlp = Flatten()(user_embedding_mlp)
    item_latent_mlp = Flatten()(item_embedding_mlp)
    
    # Integrate metadata into MLP part
    item_latent_mlp = Concatenate()([item_latent_mlp, item_metadata_input]) #With metadata
    #item_latent_mlp = item_latent_mlp #Without Metadata

    
    # GMF and MLP interaction layers

    #mf_vector = Dot(axes=1)([user_latent_gmf, item_latent_gmf]) # Dot product is a viable option to Multiply which does element wise product

    # GMF
    gmf_vector = Multiply()([user_latent_gmf, item_latent_gmf])
    gmf_vector = Dropout(dropout_rate)(gmf_vector)  # Add dropout to GMF component
    #gmf_vector = Activation('sigmoid')(gmf_vector) # Paper wanter us to use sigmoid activation function for the generalized mf vector. However we got better results not using it. 

    # Concatenate user and item for mlp
    mlp_vector = Concatenate()([user_latent_mlp, item_latent_mlp])
    
    # MLP layers
    for i in range(num_layers):
        units = hp.Int(f'units_{i}', min_value=4, max_value=12, step=2) # Random, tries to find optimal in HPT.
        mlp_vector = Dense(units, activation='relu')(mlp_vector) # ReLu worked best
        mlp_vector = Dropout(dropout_rate)(mlp_vector)  # Dropout after each dense layer
    
    # Combine GMF and MLP parts
    neu_vector = Concatenate()([gmf_vector, mlp_vector])
    neu_vector = Dropout(dropout_rate)(neu_vector)  # Additional dropout before the final dense layer
    
    # Output layer using sigmoid to scale the results from 0-1
    output = Dense(1, activation='sigmoid', name='output')(neu_vector)
    
    # Compile model
    model = Model(inputs=[user_input, item_input, item_metadata_input], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy', #
        metrics=[
            'accuracy',
            tf.keras.metrics.AUC(name='auc'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )
    
    return model

# Define the tuner
tuner = kt.RandomSearch(
    build_model,
    objective=kt.Objective('val_auc', direction='max'),
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='neumf_tuning_v2_v8' #'neumf_tuning_v2_v7 or v8' best one yet
)

# Prepare training and validation data
train_inputs = [train_user, train_item, train_item_metadata]
train_labels = train_label

val_inputs = [test_user, test_item, test_item_metadata]
val_labels = test_label

# Assign class weights because of imbalanced dataset
positive_class_weight_multiplier = number_negative_per_positive*1.5  # Adjust this as needed
class_weights = {0: 1.0, 1: positive_class_weight_multiplier * len(train_label) / (2 * sum(train_label))}

# Start the hyperparameter search
tuner.search(
    train_inputs,
    train_labels,
    epochs=10,
    validation_data=(val_inputs, val_labels),
    class_weight=class_weights,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters
model = tuner.hypermodel.build(best_hps)

# Train the model
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=256,
    epochs=25,
    validation_data=(val_inputs, val_labels),
    class_weight=class_weights,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
)


Reloading Tuner from tuner_dir/neumf_tuning_v2_v8/tuner0.json
Epoch 1/25




[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.2103 - auc: 0.5033 - loss: 5.1081 - precision: 0.1674 - recall: 0.9335 - val_accuracy: 0.1676 - val_auc: 0.8044 - val_loss: 1.1541 - val_precision: 0.1676 - val_recall: 1.0000
Epoch 2/25
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.1655 - auc: 0.7312 - loss: 1.9778 - precision: 0.1655 - recall: 1.0000 - val_accuracy: 0.1676 - val_auc: 0.8171 - val_loss: 1.2708 - val_precision: 0.1676 - val_recall: 1.0000
Epoch 3/25
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.1673 - auc: 0.7767 - loss: 1.8926 - precision: 0.1673 - recall: 1.0000 - val_accuracy: 0.1676 - val_auc: 0.8225 - val_loss: 1.3206 - val_precision: 0.1676 - val_recall: 1.0000
Epoch 4/25
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.1676 - auc: 0.8224 - loss: 1.8367 - precision: 0.1676 - recall: 1.0000 - v

In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

In [None]:
from sklearn.metrics import precision_recall_curve, roc_auc_score

# Get predicted probabilities for the test set
y_probs = model.predict([test_user, test_item, test_item_metadata])
# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(test_label, y_probs)
# Find threshold with the best F1 score
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Evaluate the model on the test set
test_pred_probs = model.predict(val_inputs, batch_size=1024).flatten()
test_pred = (test_pred_probs >= optimal_threshold).astype(int)

# Confusion Matrix
cm = confusion_matrix(val_labels, test_pred)
tn, fp, fn, tp = cm.ravel()

print(f"""
Confusion Matrix (Test Set):
[[TN FP]
 [FN TP]]
{cm}

True Negatives: {tn}
False Positives: {fp}
False Negatives: {fn}
True Positives: {tp}
""")

# Classification Report
print(classification_report(val_labels, test_pred, digits=4))

# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(val_labels, test_pred_probs)
auc_score = roc_auc_score(val_labels, test_pred_probs)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = {:.2f})'.format(auc_score))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.savefig("ROC Curve")
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Calculate the AUC score
auc_score = roc_auc_score(val_labels, test_pred_probs)

# Print the AUC score
print(f"AUC Score: {auc_score}")

In [None]:
# Plot accuracy
plt.figure(figsize=(8, 4))
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Neural MF Model Accuracy with metadata')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.savefig("Accuracy")
plt.show()

# Plot loss
plt.figure(figsize=(8, 4))
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Neural MF Model Loss with metadata')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.savefig("Model loss")
plt.show()

In [None]:
import seaborn as sns
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, 
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig('CMatrix')
plt.show()