In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create dataset -> DataFrame
### **Inference dataset for fashion recommendation**
---

## Define Value Pools

In [30]:
# Value pools sampled uniformly when generating synthetic catalogue items.
BRANDS = ["ZARA", "Adidas", "Nike", "HnM", "Polo", "Tommy Hilfiger"]

# category -> subcategories that belong to it.
SUBCATEGORIES = dict(
    tops=[
        "printed_tshirts",
        "solid_tshirts",
        "printed_hoodies",
        "casual_shirts",
        "formal_shirts",
    ],
    bottoms=["formal_pants", "jeans", "men_cargos"],
)

# Subcategories grouped by the single sleeve value each one may take.
# Bottoms have no sleeves, hence None.
_SLEEVES_BY_GROUP = (
    (("printed_tshirts", "solid_tshirts"), "short"),
    (("printed_hoodies", "casual_shirts", "formal_shirts"), "long"),
    (("formal_pants", "jeans", "men_cargos"), None),
)

# subcategory -> list of allowed sleeve types (a fresh one-element list per key).
SLEEVE_BY_SUBCATEGORY = {
    name: [sleeve]
    for names, sleeve in _SLEEVES_BY_GROUP
    for name in names
}

# Remaining categorical attributes, independent of subcategory.
SEASONS = ["summer", "winter", "all-season"]
FABRICS = ["cotton", "denim", "polyester", "fleece", "linen"]
OCCASIONS = ["casual", "office", "party"]
FORMALITY = ["low", "medium", "high"]
SIZES = ["XS", "S", "M", "L", "XL", "XXL"]

In [31]:
import random

# Helper Functions (Realistic Numbers)
def random_length(subcategory):
    """Draw a random integer garment length for ``subcategory``.

    The value is sampled uniformly (both bounds inclusive) from a range
    that depends on the subcategory; it is stored as ``length_cm`` on
    generated items.

    Args:
        subcategory: Subcategory name, e.g. ``"jeans"``.

    Returns:
        An ``int`` within the subcategory's range, or ``None`` when the
        subcategory is not recognised.
    """
    if subcategory in ("printed_tshirts", "solid_tshirts"):
        low, high = 65, 75
    elif subcategory == "printed_hoodies":
        low, high = 70, 80
    elif subcategory in ("casual_shirts", "formal_shirts"):
        low, high = 68, 78
    elif subcategory in ("formal_pants", "jeans", "men_cargos"):
        low, high = 95, 110
    else:
        # Unknown subcategory: no length can be assigned.
        return None
    return random.randint(low, high)

def random_price(subcategory):
    """Draw a random integer price for ``subcategory``.

    The price is sampled from a normal distribution centred on the
    subcategory's base price with a standard deviation of 15% of that
    base, then truncated to ``int``.

    Args:
        subcategory: Subcategory name; must be one of the keys below.

    Returns:
        The sampled price as an ``int``.

    Raises:
        KeyError: If ``subcategory`` has no base price.
    """
    base_price_by_subcategory = dict(
        printed_tshirts=150_000,
        solid_tshirts=130_000,
        printed_hoodies=320_000,
        casual_shirts=250_000,
        formal_shirts=300_000,
        formal_pants=350_000,
        jeans=330_000,
        men_cargos=280_000,
    )
    mean_price = base_price_by_subcategory[subcategory]
    spread = mean_price * 0.15
    sampled = np.random.normal(mean_price, spread)
    return int(sampled)

# Generate ONE Random Item (Core Logic)

In [32]:
def generate_item(idx):
    """Build one synthetic catalogue item as a plain dict.

    A category is picked uniformly, then a subcategory within it. The
    sleeve type, length and price depend on the subcategory; every other
    categorical attribute is drawn uniformly from its value pool, and the
    engagement counters are drawn from exponential distributions.

    Args:
        idx: Integer used to build the zero-padded ``item_id``
            (``TNC_000001`` for ``idx=1``).

    Returns:
        A dict with one entry per dataset column.
    """
    category = random.choice(list(SUBCATEGORIES))
    subcategory = random.choice(SUBCATEGORIES[category])

    item = {
        "item_id": f"TNC_{idx:06d}",
        "category": category,
        "subcategory": subcategory,
    }
    item["sleeve_type"] = random.choice(SLEEVE_BY_SUBCATEGORY[subcategory])

    # Categorical attributes that do not depend on the subcategory.
    uniform_fields = (
        ("season", SEASONS),
        ("fabric", FABRICS),
        ("occasion", OCCASIONS),
        ("formality_level", FORMALITY),
        ("size_range", SIZES),
        ("brand", BRANDS),
    )
    for field, pool in uniform_fields:
        item[field] = random.choice(pool)

    # Engagement counters: exponential draws truncated to int, each with its own scale.
    counter_scales = (
        ("view_count", 800),
        ("click_count", 150),
        ("purchase_count", 30),
    )
    for field, scale in counter_scales:
        item[field] = int(np.random.exponential(scale))

    item["length_cm"] = random_length(subcategory)
    item["price"] = random_price(subcategory)
    item["stocks"] = random.randint(0, 200)
    return item

##  Generate 300,000 Rows

In [33]:
from tqdm import trange

# Fixed seed so the synthetic dataset is identical on every Restart & Run All.
# generate_item draws from both the stdlib `random` module and NumPy's legacy
# global RNG, so both generators have to be seeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of synthetic items; item ids run from 1 to N_ROWS inclusive.
N_ROWS = 300_000
data = [generate_item(i) for i in trange(1, N_ROWS + 1)]
df = pd.DataFrame(data)

100%|██████████| 300000/300000 [00:02<00:00, 138397.63it/s]


## Enforce Data Types

In [34]:
# Columns that must hold numbers.
NUMERIC_COLS = [
    "view_count", "click_count", "purchase_count",
    "length_cm", "price", "stocks",
]

# Columns that must hold text (stored with the pandas "string" dtype).
OBJECT_COLS = [
    "item_id", "category", "subcategory", "sleeve_type", "season",
    "fabric", "occasion", "formality_level", "size_range", "brand",
]

# Convert column-wise; errors="coerce" turns unparseable values into NaN
# instead of raising.
numeric_part = df[NUMERIC_COLS].apply(pd.to_numeric, errors="coerce")
df[NUMERIC_COLS] = numeric_part

# Missing values (e.g. a None sleeve_type) become <NA> under the string dtype.
string_part = df[OBJECT_COLS].astype("string")
df[OBJECT_COLS] = string_part

In [37]:
# index=False: df has the default RangeIndex, which carries no information
# (item_id already identifies each row). Writing it would add an unnamed first
# column that shows up as "Unnamed: 0" when the CSV is read back.
df.to_csv("fashion_store.csv", index=False)

In [36]:
# Missing-value count per column. sleeve_type is expected to be missing for
# every item in the "bottoms" category, whose sleeve pool is [None].
null_counts = df.isnull().sum()
null_counts

item_id                 0
category                0
subcategory             0
sleeve_type        150577
season                  0
fabric                  0
occasion                0
formality_level         0
size_range              0
brand                   0
view_count              0
click_count             0
purchase_count          0
length_cm               0
price                   0
stocks                  0
dtype: int64