In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
from typing import Literal
from embedding_manager import *

EMBEDDING_TEXT_TYPE = "long"
USE_FLATTEN_EMBEDDING = False
USE_METADATA = True
MODEL: Literal["xgboost", "random_forest", "catboost"] = "catboost"

In [2]:
data = pd.read_csv("data/transformed_attribute_data.csv")
data

Unnamed: 0,cod_modelo_color,cane_height_type,closure_placement,heel_shape_type,knit_structure,length_type,neck_lapel_type,silhouette_type,sleeve_length_type,toecap_type,waist_type,woven_structure
0,81_1034451,INVALID,INVALID,INVALID,INVALID,INVALID,Hawaiano/Bowling,Regular,Corta,INVALID,INVALID,INVALID
1,81_1034525,INVALID,INVALID,INVALID,INVALID,INVALID,Hawaiano/Bowling,Regular,Corta,INVALID,INVALID,INVALID
2,81_1035318,INVALID,INVALID,INVALID,INVALID,Largo,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID
3,81_1035321,INVALID,INVALID,INVALID,INVALID,Largo,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID
4,81_1035361,INVALID,INVALID,INVALID,INVALID,Largo,Capucha,INVALID,INVALID,INVALID,INVALID,INVALID
...,...,...,...,...,...,...,...,...,...,...,...,...
33438,86_9893076,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33439,86_9893077,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33440,86_9893868,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33441,86_9893898,INVALID,Cierre Delantero,INVALID,INVALID,Standard,Pico,Recto,INVALID,INVALID,INVALID,Medio


In [3]:
meta_df = pd.read_csv("data/product_data.csv")
meta_df

Unnamed: 0,cod_modelo_color,cod_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename,des_color
0,83_1124642,82,Female,Kids,KIDS,TRICOT,Tops,Sweaters and Cardigans,Sweater,Sweater,83_1124642_17074019-82_B.jpg,ROSA LIGHT
1,86_1215223,01,Female,Kids,KIDS,WOVEN,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,86_1215223_47014392-01_B.jpg,BLANCO
2,84_1167695,70,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,84_1167695_27005150-70_B.jpg,ROJO
3,82_1108473,01,Female,Teen,KIDS,JEANS,Bottoms,Jeans,Jeans,Jeans,82_1108473_87076320-01_B.jpg,BLANCO
4,83_1137778,37,Male,Adult,MAN,CIRCULAR,Tops,Sweaters and Cardigans,Sweatshirts,Sweatshirt,83_1137778_17014765-37_.jpg,KHAKI
...,...,...,...,...,...,...,...,...,...,...,...,...
61479,84_1171997,12,Female,Adult,WOMAN,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,84_1171997_27004019-12_B.jpg,AMARILLO
61480,84_1179998,50,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,84_1179998_27028257-50_B.jpg,CELESTE
61481,83_1131136,52,Female,Baby,KIDS,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,"Hats, scarves and gloves",Headband,83_1131136_17062885-52_B.jpg,AZUL
61482,82_1103532,56,Female,Adult,WOMAN,WOVEN,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Jumpsuit,Jumpsuit,82_1103532_87075684-56_B.jpg,NAVY


In [4]:
def add_metadata_to_df(df):
    meta_df = pd.read_csv("data/product_data.csv")
    meta_df = meta_df.drop(columns=["des_filename"]).drop_duplicates()

    RELEVANT_META = ["des_product_family", "des_line", "des_fabric"]
    meta_df = meta_df[RELEVANT_META + ["cod_modelo_color"]]

    df = df.merge(meta_df, on="cod_modelo_color", how="left")

    return df

def get_df_processed(data, attr_slice, N_min=5, embedding_kind="long"):
    df = data[["cod_modelo_color", attr_slice]].copy()

    df = add_embeddings_to_df(df)
    df = add_attr_sim(df, embedding_kind=embedding_kind)
    df = add_subattr_sim(df, attr_slice, embedding_kind=embedding_kind)

    # drop all attributes that appear less than N_min times
    drop_vals = df[attr_slice].value_counts()[df[attr_slice].value_counts() < N_min].index
    df = df[~df[attr_slice].isin(drop_vals)]

    if USE_METADATA:
        df = add_metadata_to_df(df)

    df = df.drop(columns=["cod_modelo_color"])

    # convert to categorical all string columns
    for col in df.columns:
        if df[col].dtype == "object" and col != "embedding":
            df[col] = df[col].astype("category")

    return df

In [5]:
df = get_df_processed(data, "closure_placement")
df

Unnamed: 0,closure_placement,embedding,closure_placement_sim,Cuello_sim,Sin cierre_sim,Cierre Delantero_sim,Cierre Trasero_sim,Cierre Hombro_sim,Lateral_sim,des_product_family,des_line,des_fabric
0,INVALID,"[-0.17474917, 0.16217516, 0.40260923, -0.86747...",0.182966,0.178880,0.217037,0.180620,0.139369,0.183897,0.194742,Shirt,MAN,WOVEN
1,INVALID,"[-0.24003744, 0.06450614, 0.8397347, -1.057063...",0.181320,0.165888,0.230283,0.168859,0.170516,0.174156,0.193994,Shirt,MAN,WOVEN
2,INVALID,"[-0.18954061, 0.13276234, 1.1418515, -1.095528...",0.177768,0.175540,0.216890,0.180656,0.135699,0.204574,0.186123,Shirt,MAN,WOVEN
3,INVALID,"[-0.4249166, -0.15395549, 0.89751333, -0.96329...",0.166384,0.149718,0.206450,0.171543,0.141575,0.181411,0.181713,Shirt,MAN,WOVEN
4,INVALID,"[-0.26688784, -0.051651932, 0.7943201, -0.1294...",0.195209,0.208943,0.213915,0.226990,0.153924,0.195322,0.215576,Jackets,MAN,WOVEN
...,...,...,...,...,...,...,...,...,...,...,...,...
61477,INVALID,"[0.17486537, -0.17013556, 0.49440274, -0.78616...",0.179577,0.185574,0.241253,0.197587,0.222677,0.204423,0.195709,Tops,WOMAN,CIRCULAR
61478,Cierre Delantero,"[-0.06623721, 0.6834561, 0.6908933, -0.3276321...",0.216187,0.203205,0.233912,0.215987,0.173976,0.185537,0.226511,Vest,WOMAN,WOVEN
61479,Cierre Delantero,"[0.1940485, 0.1506823, 0.34528264, -0.7869217,...",0.259711,0.275195,0.267674,0.283169,0.302348,0.271179,0.257393,Vest,WOMAN,WOVEN
61480,Cierre Delantero,"[0.24107921, -0.58900714, 0.58061886, 0.381187...",0.209612,0.170613,0.218666,0.198337,0.153136,0.156128,0.222702,Blazers,WOMAN,WOVEN


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

all_attrs = data.columns[1:]
models = {}
accuracies = {}
label_encoders = {}

N_folds = 5

for attr in all_attrs:
    df = get_df_processed(data, attr, embedding_kind=EMBEDDING_TEXT_TYPE)
    # df = df[df[attr] != "INVALID"].reset_index(drop=True)

    # define the features and target

    if USE_FLATTEN_EMBEDDING:
        df = pd.concat([df, df["embedding"].apply(pd.Series)], axis=1)

    X = df.drop(columns=[attr, "embedding"])
    y = df[attr]

    # encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

    # define the model, enable categorical features
    if MODEL == "xgboost":
        model = xgb.XGBClassifier(tree_method="hist", enable_categorical=True)
    elif MODEL == "random_forest":
        model = RandomForestClassifier()
    elif MODEL == "catboost":
        categorical_columns = X.select_dtypes(include=['category']).columns.tolist()
        model = CatBoostClassifier(iterations=500, 
                           learning_rate=0.1, 
                           depth=10, 
                           loss_function='MultiClass',
                           cat_features=categorical_columns,
                           verbose=False)
        
    model.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)

    # calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {attr}: {accuracy}")

    models[attr] = model
    label_encoders[attr] = label_encoder
    accuracies[attr] = accuracy

mean_accuracy = np.mean(list(accuracies.values()))
print(f"===  Mean accuracy: {mean_accuracy}  ===")

Accuracy for cane_height_type: 0.9940635927461983


In [None]:
test_path = "data/test_data.csv"
test_data = pd.read_csv(test_path)

df_test = test_data[["cod_modelo_color", "attribute_name", "test_id"]]

df_test = add_embeddings_to_df(df_test)

nan_rows = df_test[df_test["embedding"].isna()]

df_test = df_test[~df_test["embedding"].isna()].reset_index(drop=True)

In [None]:
test_path = "data/test_data.csv"
test_data = pd.read_csv(test_path)

df_test = test_data[["cod_modelo_color", "attribute_name", "test_id"]]

df_test = add_embeddings_to_df(df_test)

nan_rows = df_test[df_test["embedding"].isna()]

df_test = df_test[~df_test["embedding"].isna()].reset_index(drop=True)

# flatten the embeddings into multiple columns
if USE_FLATTEN_EMBEDDING:
    df_test = pd.concat([df, df["embedding"].apply(pd.Series)], axis=1)

# group by attribute_name, add the subattr_sim and attr_sim and predict the attribute
df_test_grouped = {attr: group.drop(columns=["cod_modelo_color"]) for attr, group in df_test.groupby("attribute_name") }

# out_res contains test_id: predicted_attribute
out_res = {}

for attr, group in df_test_grouped.items():

    group.rename(columns={"attribute_name": attr}, inplace=True)

    group = add_attr_sim(group)
    group = add_subattr_sim(group, attr)

    model = models[attr]


    y_pred = model.predict(group.drop(columns=["test_id", "embedding", attr]))
    y_pred = label_encoders[attr].inverse_transform(y_pred)
    out_res.update({test_id: pred for test_id, pred in zip(group["test_id"], y_pred)})


# add the nan rows back and set the attribute to "INVALID"
out_res.update({test_id: "INVALID" for test_id in nan_rows["test_id"]})

In [None]:
# out df has cols ["test_id","des_value"]
out_df = pd.DataFrame.from_dict(out_res, orient="index", columns=["des_value"]).reset_index()
out_df.rename(columns={"index": "test_id"}, inplace=True)
out_df.to_csv("submissions/xgboost_text_encodings.csv", index=False)

(71819, 2)