In [None]:
import pandas as pd
import random
from faker import Faker
import os
from dotenv import load_dotenv
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.foundation_models import Embeddings
import csv

In [3]:
# Faker instance. NOTE(review): no cell visible in this notebook references
# `fake` - confirm it is still needed before keeping the dependency.
fake = Faker()

# Cities a generated shoe row can be assigned to.
ont_cities = ["Toronto", "Ottawa"]

# Option lists the generator samples from (one value per attribute per row).
brands = [
    'Zentrax',
    'FootFlex',
    'StrideOne',
    'Loopic',
    'RunXpress',
]
types = ['Running', 'Walking']
classes = ['Men', 'Women']
materials = ['Synthetic', 'Knit']
colors = ['Black', 'White']
arch_supports = ['High', 'Flat']
weather_resistances = ['Waterproof', 'Resistant']

# Whole sizes 6-12 (ints) followed by the matching half sizes 6.5-12.5 (floats).
whole_sizes = list(range(6, 13))
half_sizes = [whole + 0.5 for whole in whole_sizes]
sizes = whole_sizes + half_sizes

# Store identifiers 1 through 20 inclusive.
store_ids = range(1, 21)

# Helper: build a fake product name
def create_product_name(brand, shoe_type):
    """Return ``"<brand> <series> <shoe_type>"`` with a randomly picked series word.

    The series word is drawn with ``random.choice`` from a fixed list of five
    marketing-style labels.
    """
    series = random.choice(['Ultra', 'Flex', 'Pro', 'X', 'Max'])
    return f"{brand} {series} {shoe_type}"

# Helper: build a fake keyword string
def generate_keywords(shoe_type, material):
    """Return a comma-separated keyword string for a shoe.

    The string starts with the lower-cased shoe type and material, followed by
    three distinct descriptors sampled at random from a fixed pool.
    """
    leading = [shoe_type.lower(), material.lower()]
    descriptor_pool = ['lightweight', 'durable', 'breathable', 'cushioned', 'supportive', 'flexible']
    sampled = random.sample(descriptor_pool, 3)
    return ', '.join(leading + sampled)

def generate_shoe_data(n=500, seed=None):
    """Build a DataFrame of ``n`` randomly generated shoe inventory rows.

    Every attribute is drawn independently with the ``random`` module from the
    notebook-level option lists (``brands``, ``types``, ``classes``,
    ``materials``, ``sizes``, ``colors``, ``arch_supports``,
    ``weather_resistances``, ``store_ids`` and ``ont_cities``). Each row gets a
    unique SKU made of the first three letters of the brand (upper-cased), a
    dash, and a random integer between 1000 and 9999.

    Args:
        n: Number of rows to generate.
        seed: Optional value passed to ``random.seed`` before generation so the
            dataset can be reproduced. ``None`` (the default) leaves the global
            random state untouched.

    Returns:
        pandas.DataFrame with the columns SKU, PRODUCT_NAME, BRAND, CLASS, TYPE,
        MATERIAL, COLOR, WEATHER_RESISTANCE, ARCH_SUPPORT, SIZE, PRICE, RATING,
        STORE_ID and CITY (also present when ``n`` is 0).

    Raises:
        ValueError: If ``n`` is larger than the number of distinct SKUs that
            can exist for the configured brands.
        RuntimeError: If the randomly chosen brand has already used every SKU
            available for its prefix.
    """
    sku_min, sku_max = 1000, 9999
    skus_per_prefix = sku_max - sku_min + 1

    # Without this check the uniqueness loop below would spin forever once the
    # SKU space is exhausted.
    distinct_prefixes = {brand[:3].upper() for brand in brands}
    capacity = len(distinct_prefixes) * skus_per_prefix
    if n > capacity:
        raise ValueError(
            f"Cannot generate {n} unique SKUs; only {capacity} are possible."
        )

    if seed is not None:
        random.seed(seed)

    columns = [
        'SKU', 'PRODUCT_NAME', 'BRAND', 'CLASS', 'TYPE', 'MATERIAL', 'COLOR',
        'WEATHER_RESISTANCE', 'ARCH_SUPPORT', 'SIZE', 'PRICE', 'RATING',
        'STORE_ID', 'CITY',
    ]
    data = []
    used_skus = set()
    used_per_prefix = {}

    for _ in range(n):
        brand = random.choice(brands)
        shoe_type = random.choice(types)
        shoe_class = random.choice(classes)
        material = random.choice(materials)
        size = random.choice(sizes)
        color = random.choice(colors)
        arch = random.choice(arch_supports)
        weather = random.choice(weather_resistances)
        store_id = random.choice(store_ids)
        city = random.choice(ont_cities)

        price = round(random.uniform(29.99, 149.99), 2)
        rating = round(random.uniform(3.0, 5.0), 1)
        product_name = create_product_name(brand, shoe_type)

        # Brands are picked at random, so a single prefix can run out of SKUs
        # even when the overall capacity check passed.
        prefix = brand[:3].upper()
        if used_per_prefix.get(prefix, 0) >= skus_per_prefix:
            raise RuntimeError(
                f"All {skus_per_prefix} SKUs for prefix '{prefix}' are already in use."
            )

        # Ensure SKU uniqueness: redraw the numeric part until it is unused.
        while True:
            sku = f"{prefix}-{random.randint(sku_min, sku_max)}"
            if sku not in used_skus:
                used_skus.add(sku)
                break
        used_per_prefix[prefix] = used_per_prefix.get(prefix, 0) + 1

        data.append({
            'SKU': sku,
            'PRODUCT_NAME': product_name,
            'BRAND': brand,
            'CLASS': shoe_class,
            'TYPE': shoe_type,
            'MATERIAL': material,
            'COLOR': color,
            'WEATHER_RESISTANCE': weather,
            'ARCH_SUPPORT': arch,
            'SIZE': size,
            'PRICE': price,
            'RATING': rating,
            'STORE_ID': store_id,
            'CITY': city
        })

    # Passing columns explicitly keeps the schema stable when n is 0.
    return pd.DataFrame(data, columns=columns)

# Generate the synthetic dataset (500 rows).
df_shoes = generate_shoe_data(n=500)
# Saving the raw dataset is currently disabled; uncomment to write it out:
# df_shoes.to_csv("shoes.csv", index=False)
# print("Dataset saved as 'shoes.csv'")

In [4]:
# Preview the first five generated rows.
df_shoes.head(n=5)

Unnamed: 0,SKU,PRODUCT_NAME,BRAND,CLASS,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,SIZE,PRICE,RATING,STORE_ID,CITY
0,ZEN-5999,Zentrax X Walking,Zentrax,Women,Walking,Synthetic,White,Resistant,High,8.0,108.65,4.3,1,Ottawa
1,RUN-4723,RunXpress Max Walking,RunXpress,Women,Walking,Synthetic,White,Waterproof,High,8.5,103.22,3.0,20,Ottawa
2,ZEN-8968,Zentrax Pro Running,Zentrax,Men,Running,Synthetic,Black,Waterproof,High,7.5,100.25,3.7,13,Toronto
3,RUN-7569,RunXpress X Walking,RunXpress,Men,Walking,Synthetic,Black,Waterproof,Flat,7.5,83.84,3.2,4,Ottawa
4,RUN-1083,RunXpress X Walking,RunXpress,Women,Walking,Synthetic,Black,Resistant,Flat,12.5,139.7,3.8,2,Toronto


In [5]:
# Columns whose values are combined into the text that gets embedded.
embedding_cols = [
    'TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
]

In [6]:
# Preview only the columns that feed the embedding text.
df_shoes.loc[:, embedding_cols].head(n=5)

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT
0,Walking,Synthetic,White,Resistant,High
1,Walking,Synthetic,White,Waterproof,High
2,Running,Synthetic,Black,Waterproof,High
3,Walking,Synthetic,Black,Waterproof,Flat
4,Walking,Synthetic,Black,Resistant,Flat


# Generating embedding vectors for the shoes

Combine all embedding columns into a single string for each row, including column names

In [7]:
def _combine_embedding_fields(row):
    """Render one row's embedding columns as 'NAME: value' pairs joined by ' [SEP] '."""
    labelled = []
    for col_name in embedding_cols:
        labelled.append(f"{col_name}: {row[col_name]}")
    return ' [SEP] '.join(labelled)


df_shoes['COMBINED'] = df_shoes.apply(_combine_embedding_fields, axis=1)

In [8]:
# Show the embedding inputs next to the combined text built from them.
cols_to_show = [*embedding_cols, 'COMBINED']
df_shoes.loc[:, cols_to_show].head(n=5)

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,COMBINED
0,Walking,Synthetic,White,Resistant,High,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...
1,Walking,Synthetic,White,Waterproof,High,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...
2,Running,Synthetic,Black,Waterproof,High,TYPE: Running [SEP] MATERIAL: Synthetic [SEP] ...
3,Walking,Synthetic,Black,Waterproof,Flat,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...
4,Walking,Synthetic,Black,Resistant,Flat,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...


In [9]:
# Full (untruncated) combined text of the first row.
df_shoes['COMBINED'].iloc[0]

'TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] COLOR: White [SEP] WEATHER_RESISTANCE: Resistant [SEP] ARCH_SUPPORT: High'

Set up the connection to the watsonx.ai (`wx.ai`) embedding API

In [10]:
# Load settings from the .env file in the current working directory.
# override=True lets values from that file replace variables already set in
# the process environment.
load_dotenv(os.path.join(os.getcwd(), ".env"), override=True)

# Fail fast with an actionable message instead of handing an empty API key or
# project id to the SDK.
api_key = os.getenv("WATSONX_APIKEY", "")
project_id = os.getenv("WATSONX_PROJECT", "")
missing_vars = [
    name
    for name, value in (("WATSONX_APIKEY", api_key), ("WATSONX_PROJECT", project_id))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Define them in the .env file or in the environment."
    )

credentials = Credentials(
    url="https://us-south.ml.cloud.ibm.com",
    api_key=api_key,
)

client = APIClient(credentials)
client.set.default_project(project_id)

# Embedding client used later to vectorise the COMBINED text of each shoe.
embeddings = Embeddings(
    model_id=client.foundation_models.EmbeddingModels.MULTILINGUAL_E5_LARGE,
    credentials=credentials,
    project_id=project_id,
)

Generate an embedding vector for every row, then show a few sample rows with their embedding vectors

In [12]:
row_combined = df_shoes['COMBINED'].tolist()

# Many rows share the same COMBINED text, so embed each distinct text once and
# reuse the vector instead of sending every row to the service.
unique_texts = list(dict.fromkeys(row_combined))
unique_vectors = embeddings.embed_documents(texts=unique_texts)
if len(unique_vectors) != len(unique_texts):
    raise RuntimeError(
        f"Expected {len(unique_texts)} embedding vectors, got {len(unique_vectors)}."
    )

# One vector per row, in row order.
vector_by_text = dict(zip(unique_texts, unique_vectors))
shoe_vectors = [vector_by_text[text] for text in row_combined]

# Store each vector as "[v1, v2, ...]" text.
df_shoes['EMBEDDING'] = ['[' + ', '.join(map(str, vector)) + ']' for vector in shoe_vectors]

# COMBINED was only an intermediate input for the embedding call.
df_shoes = df_shoes.drop(columns=['COMBINED'])

In [13]:
# Show the embedding inputs next to the resulting embedding text.
cols_to_show = [*embedding_cols, 'EMBEDDING']
df_shoes.loc[:, cols_to_show].head(n=5)

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,EMBEDDING
0,Walking,Synthetic,White,Resistant,High,"[0.037853196, 0.0047503645, -0.042236183, -0.0..."
1,Walking,Synthetic,White,Waterproof,High,"[0.035730753, 0.0032045494, -0.039743163, -0.0..."
2,Running,Synthetic,Black,Waterproof,High,"[0.023130681, 0.0013610307, -0.042692233, -0.0..."
3,Walking,Synthetic,Black,Waterproof,Flat,"[0.02354851, 0.004738677, -0.03867846, -0.0353..."
4,Walking,Synthetic,Black,Resistant,Flat,"[0.02477072, 0.0077166175, -0.041874062, -0.02..."


In [15]:
# df_shoes.iloc[0]['EMBEDDING']

In [26]:
# Inspect the DataFrame's column labels before saving.
df_shoes.columns

Index(['SKU', 'PRODUCT_NAME', 'BRAND', 'CLASS', 'TYPE', 'MATERIAL', 'COLOR',
       'WEATHER_RESISTANCE', 'ARCH_SUPPORT', 'SIZE', 'PRICE', 'RATING',
       'STORE_ID', 'CITY', 'EMBEDDING'],
      dtype='object')

Save the shoes dataframe into a .csv file

In [16]:
# Write the dataset without the index column. QUOTE_NONNUMERIC quotes every
# non-numeric field, including the comma-containing EMBEDDING strings.
output_csv = 'shoes-vectors.csv'
df_shoes.to_csv(output_csv, index=False, quoting=csv.QUOTE_NONNUMERIC)