In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
from embedding_manager import *

# Embedding text variant used for feature building; forwarded as the
# `embedding_kind` argument of get_df_processed in the training cell.
EMBEDDING_TEXT_TYPE = "long"

In [2]:
# Load the transformed attribute table: a `cod_modelo_color` identifier column
# followed by one column per attribute.
data = pd.read_csv("data/transformed_attribute_data.csv")
# Bare trailing expression: rendered as the cell's rich output.
data

Unnamed: 0,cod_modelo_color,cane_height_type,closure_placement,heel_shape_type,knit_structure,length_type,neck_lapel_type,silhouette_type,sleeve_length_type,toecap_type,waist_type,woven_structure
0,81_1034451,INVALID,INVALID,INVALID,INVALID,INVALID,Hawaiano/Bowling,Regular,Corta,INVALID,INVALID,INVALID
1,81_1034525,INVALID,INVALID,INVALID,INVALID,INVALID,Hawaiano/Bowling,Regular,Corta,INVALID,INVALID,INVALID
2,81_1035318,INVALID,INVALID,INVALID,INVALID,Largo,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID
3,81_1035321,INVALID,INVALID,INVALID,INVALID,Largo,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID
4,81_1035361,INVALID,INVALID,INVALID,INVALID,Largo,Capucha,INVALID,INVALID,INVALID,INVALID,INVALID
...,...,...,...,...,...,...,...,...,...,...,...,...
33438,86_9893076,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33439,86_9893077,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33440,86_9893868,INVALID,INVALID,INVALID,INVALID,Standard,Redondo,Slim,Tirante Fino,INVALID,INVALID,Ligero
33441,86_9893898,INVALID,Cierre Delantero,INVALID,INVALID,Standard,Pico,Recto,INVALID,INVALID,INVALID,Medio


In [3]:
def get_df_processed(data, attr_slice, N_min=5, embedding_kind="long"):
    """Build the modelling frame for a single attribute column.

    The ``cod_modelo_color`` and ``attr_slice`` columns of ``data`` are copied
    and passed, in order, through ``add_embeddings_to_df``, ``add_attr_sim``
    and ``add_subattr_sim``. The identifier column is then removed and rows
    whose ``attr_slice`` value is too rare are filtered out.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cod_modelo_color`` and the ``attr_slice`` column.
    attr_slice : str
        Name of the attribute column to keep and model.
    N_min : int, default 5
        Values of ``attr_slice`` occurring fewer than ``N_min`` times
        (counted after the helper calls) have their rows removed.
    embedding_kind : str, default "long"
        Forwarded unchanged to ``add_attr_sim`` and ``add_subattr_sim``.

    Returns
    -------
    pandas.DataFrame
        The helper-augmented frame without ``cod_modelo_color``. The columns
        added by the helpers are defined in ``embedding_manager`` and are not
        visible here; the row count may differ from ``data``'s.
    """
    subset = data[["cod_modelo_color", attr_slice]].copy()

    enriched = add_embeddings_to_df(subset)
    enriched = add_attr_sim(enriched, embedding_kind=embedding_kind)
    enriched = add_subattr_sim(enriched, attr_slice, embedding_kind=embedding_kind)
    enriched = enriched.drop(columns=["cod_modelo_color"])

    # Count each label once, then discard the rows carrying a label that
    # shows up fewer than N_min times.
    label_counts = enriched[attr_slice].value_counts()
    rare_labels = label_counts[label_counts < N_min].index
    is_rare = enriched[attr_slice].isin(rare_labels)

    return enriched[~is_rare]

In [4]:
# Sanity check on one attribute: compare the raw table's shape with the
# processed frame's. The embedding kind is passed explicitly so this check
# uses the same configuration as the training loop.
df = get_df_processed(data, "woven_structure", embedding_kind=EMBEDDING_TEXT_TYPE)
print(data.shape)
print(df.shape)

(33443, 12)
(61482, 7)


In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Every column except the product identifier is an attribute to model.
# Selecting by name (rather than by position) keeps this correct even if the
# column order of the CSV changes.
all_attrs = data.columns.drop("cod_modelo_color")

# Hold-out configuration shared by every per-attribute split.
TEST_FRACTION = 0.05
SPLIT_SEED = 42

# Per-attribute results, keyed by attribute name. `models` and
# `label_encoders` are read again by the prediction cell.
models = {}
accuracies = {}
label_encoders = {}

for attr in all_attrs:
    df = get_df_processed(data, attr, embedding_kind=EMBEDDING_TEXT_TYPE)
    # NOTE: rows labelled "INVALID" are intentionally not filtered out here,
    # so "INVALID" remains one of the classes the model can predict.

    # Features are everything except the target column and the "embedding" column.
    X = df.drop(columns=[attr, "embedding"])
    y = df[attr]

    # XGBoost needs integer class ids; keep the encoder to map predictions back.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Stratified split so every class is represented in the small hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=TEST_FRACTION,
        random_state=SPLIT_SEED,
        stratify=y_encoded,
    )

    # The class count comes from the fitted encoder, i.e. exactly the set of
    # ids present in y_encoded.
    model = xgb.XGBClassifier(
        objective="multi:softmax",
        num_class=len(label_encoder.classes_),
    )
    model.fit(X_train, y_train)

    # Score on the hold-out rows.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {attr}: {accuracy}")

    models[attr] = model
    label_encoders[attr] = label_encoder
    accuracies[attr] = accuracy

# Unweighted mean over attributes (each attribute counts equally).
mean_accuracy = np.mean(list(accuracies.values()))
print(f"===  Mean accuracy: {mean_accuracy}  ===")

Accuracy for cane_height_type: 0.9938211382113821
Accuracy for closure_placement: 0.6751219512195122
Accuracy for heel_shape_type: 0.9892682926829268
Accuracy for knit_structure: 0.8695934959349594
Accuracy for length_type: 0.6230894308943089
Accuracy for neck_lapel_type: 0.6662329212752115
Accuracy for silhouette_type: 0.5559531554977228
Accuracy for sleeve_length_type: 0.8009756097560976
Accuracy for toecap_type: 0.9811382113821138
Accuracy for waist_type: 0.816910569105691
Accuracy for woven_structure: 0.5134959349593496
===  Mean accuracy: 0.771418246447207  ===


In [11]:
test_path = "data/test_data.csv"
test_data = pd.read_csv(test_path)

# Copy the column slice so the helper below works on an independent frame
# rather than on a selection of `test_data`.
df_test = test_data[["cod_modelo_color", "attribute_name", "test_id"]].copy()

df_test = add_embeddings_to_df(df_test)

# One frame per attribute. The "attribute_name" column is renamed to the
# attribute itself so the frame has a column named `attr`, which is what
# add_subattr_sim is pointed at below.
df_test_grouped = {
    attr: group.drop(columns=["cod_modelo_color"]).rename(columns={"attribute_name": attr})
    for attr, group in df_test.groupby("attribute_name")
}

# out_res contains test_id: predicted_attribute
out_res = {}

for attr, group in df_test_grouped.items():
    has_embedding = group["embedding"].notnull()
    group_not_null = group[has_embedding].copy()
    group_null = group[~has_embedding].copy()

    # Skip feature building and prediction when no row of this attribute has
    # an embedding; only the "INVALID" fallback below applies then.
    if not group_not_null.empty:
        # Build the features with the same embedding kind as in training so
        # the model sees inputs consistent with what it was fitted on.
        group_not_null = add_attr_sim(group_not_null, embedding_kind=EMBEDDING_TEXT_TYPE)
        group_not_null = add_subattr_sim(group_not_null, attr, embedding_kind=EMBEDDING_TEXT_TYPE)

        model = models[attr]

        # Drop the non-feature columns, predict class ids, then map the ids
        # back to the original labels.
        y_pred = model.predict(group_not_null.drop(columns=["test_id", "embedding", attr]))
        y_pred = label_encoders[attr].inverse_transform(y_pred)
        out_res.update(dict(zip(group_not_null["test_id"], y_pred)))

    # Rows without an embedding cannot be featurised; answer "INVALID" for them.
    out_res.update({test_id: "INVALID" for test_id in group_null["test_id"]})

In [None]:
# Submission table with columns ["test_id", "des_value"]: the keys of out_res
# become the "test_id" column, the predicted labels become "des_value".
out_df = (
    pd.DataFrame.from_dict(out_res, orient="index", columns=["des_value"])
    .rename_axis("test_id")
    .reset_index()
)
# Written without the pandas index, so the file holds exactly the two columns.
out_df.to_csv("submissions/xgboost_text_encodings.csv", index=False)

(71819, 2)