In [1]:
# Lets automate the process
import pickle as pkl
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import os
import catboost

device = "cuda" if torch.cuda.is_available() else "cpu"


def create_train_dataset():
    with open('embeddings/embeddings_fclip.pkl', 'rb') as f:
        embeddings = pkl.load(f)
        
    keys = list(embeddings.keys())
    new_keys = []
    for key in keys:
        new_key = key.split('_')[:2]
        new_key = '_'.join(new_key)
        new_keys.append(new_key)
        
    # create a dictionary with the new keys
    new_embeddings = {}
    for i, key in enumerate(new_keys):
        new_embeddings[key] = embeddings[keys[i]]
        
    pd_embeddings = pd.DataFrame.from_dict(new_embeddings, orient='index')
    pd_embeddings.reset_index(inplace=True)
    
    test_data = pd.read_csv('data/transformed_attribute_data.csv')
    test_data_cat = test_data.columns
    
    augmented_data = pd.read_csv('data/product_data.csv')
    
    test_data = pd.merge(test_data, pd_embeddings, left_on='cod_modelo_color', right_on='index', how='left')
    test_data = pd.merge(test_data, augmented_data[["cod_modelo_color", "des_product_family", "des_line", "des_fabric"]], left_on='cod_modelo_color', right_on='cod_modelo_color', how='left')
    test_data.columns = test_data.columns.astype(str)
    
    return test_data, test_data_cat[1:]

def train_valid_invalid_classifier(train_dataset, cat, model='xgboost'):
    # drop rows that have unique y labels to avoid problems with stratify
    for label in train_dataset[cat].unique():
        if len(train_dataset[train_dataset[cat] == label]) == 1:
            train_dataset = train_dataset[train_dataset[cat] != label]
    
    enc = LabelEncoder()
    enc.fit(train_dataset[cat])
    
    train, test = train_test_split(train_dataset, test_size=0.2, random_state=234, stratify=train_dataset[cat])
    
    X_train = train.drop([cat], axis=1)
    Y_train = enc.transform(train[cat])

    if model == 'xgboost':
        model = xgb.XGBClassifier(objective='binary:logistic', enable_categorical=True, device=device)    
    elif model == 'catboost':
        cat_features = ["des_product_family", "des_line", "des_fabric"]

        if device == 'cuda':
            model = catboost.CatBoostClassifier(iterations=1000, task_type='GPU', devices='0:1', cat_features=cat_features, verbose=False)
        else:
            model = catboost.CatBoostClassifier(iterations=1000, cat_features=cat_features, verbose=False)

    model.fit(X_train, Y_train)

    X_test = test.drop([cat], axis=1)
    Y_test = enc.transform(test[cat])

    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(Y_test, y_pred)
    
    return model, enc, accuracy

class Net(nn.Module):
    def __init__(self, input_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

        # dropout
        self.dropout = nn.Dropout(0.2)


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

def train_type_classifier(train_dataset, cat, epochs=100, Model=Net, use_metadata_classifier=False):
    # drop rows that have unique y labels to avoid problems with stratify
    for label in train_dataset[cat].unique():
        if len(train_dataset[train_dataset[cat] == label]) == 1:
            train_dataset = train_dataset[train_dataset[cat] != label]

    if use_metadata_classifier:
        # one hot encode the metadata
        for md in ["des_product_family", "des_line", "des_fabric"]:
            train_dataset = pd.concat([train_dataset, pd.get_dummies(train_dataset[md], prefix=md)], axis=1)
            train_dataset.drop(md, axis=1, inplace=True)
    
    enc = LabelEncoder()
    enc.fit(train_dataset[cat])
    
    train, test = train_test_split(train_dataset, test_size=0.2, random_state=234, stratify=train_dataset[cat])
    
    X_train = train.drop([cat], axis=1)
    Y_train = enc.transform(train[cat])
    
    X_test = test.drop([cat], axis=1)
    Y_test = enc.transform(test[cat])
    
    
    input_size = X_train.shape[1]
    output_size = len(enc.classes_)
    net = Model(input_size, output_size)

    net.to(device)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=1e-5)
    
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32, device=device)
    Y_train_tensor = torch.tensor(Y_train, dtype=torch.long, device=device)

    
    train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for _ in range(epochs):
        for _, data in enumerate(train_loader):
            inputs, labels = data
            optimizer.zero_grad()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
    outputs = net(torch.tensor(X_test.values, dtype=torch.float32).to(device))
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(Y_test, predicted.cpu().numpy())
            
    return net, enc, accuracy


def train_and_save(Model=Net, binary_model="xgboost", use_metadata_classifier=False):
    train_dataset, categories = create_train_dataset()
    
    
    metadata = ["des_product_family", "des_line", "des_fabric"]
    
    enc_cat_dict = {}
    enc_val_dict = {}

    metrics = {}
    
    for cat in categories:

        categories_to_drop = [x for x in categories if x != cat]
        train_dataset_valid_invalid = train_dataset.drop(categories_to_drop, axis=1).copy()
        train_dataset_valid_invalid.loc[train_dataset_valid_invalid[cat] != 'INVALID', cat] = 'VALID'
    
        train_dataset_valid_invalid.drop(['index', 'cod_modelo_color'], inplace=True, axis=1)
        
        for ct in train_dataset_valid_invalid.columns:
            if train_dataset_valid_invalid[ct].dtype == 'object':
                train_dataset_valid_invalid[ct] = train_dataset_valid_invalid[ct].astype('category')
    
        model_valid_invalid, enc_val, accuracy_val = train_valid_invalid_classifier(train_dataset_valid_invalid, cat, model=binary_model)
    
        train_dataset_valid = train_dataset[train_dataset[cat] != 'INVALID'].copy()
    
        train_dataset_valid.drop(['index', 'cod_modelo_color'] + categories_to_drop, axis=1, inplace=True)

        if not use_metadata_classifier:
            train_dataset_valid.drop(metadata, axis=1, inplace=True)
    
        model_silhouette, enc_cat, accuracy_class = train_type_classifier(train_dataset_valid, cat, Model=Model, use_metadata_classifier=use_metadata_classifier)
        
        # count the invalids
        invalids = train_dataset[cat].value_counts()['INVALID']
        
        # divide by the total number of samples
        invalids /= len(train_dataset)
    
        if not os.path.exists('models'):
            os.makedirs('models')
        with open(f'models/{cat}_valid_invalid.pkl', 'wb') as f:
            pkl.dump(model_valid_invalid, f)
            
        torch.save(model_silhouette.state_dict(), f'models/{cat}_classifier.pt')

        # save the encoders
        if not os.path.exists('models/encoder'):
            os.makedirs('models/encoder')
        with open(f'models/encoder/{cat}_valid_invalid.pkl', 'wb') as f:
            pkl.dump(enc_val, f)
        with open(f'models/encoder/{cat}_classifier.pkl', 'wb') as f:
            pkl.dump(enc_cat, f)
        
        print(f'{cat} done')
        print(f"Valid/Invalid accuracy: {accuracy_val}, number of invalids: {invalids}")
        print(f"classification accuracy: {accuracy_class}, number of valids: {1-invalids}")
        print(f"Total accuracy: {accuracy_val*invalids + accuracy_class*(1-invalids)}")

        metrics[cat] = {
            'valid_invalid_accuracy': accuracy_val,
            'classification_accuracy': accuracy_class,
            'invalids_ratio': invalids,
            'total_accuracy': accuracy_val*invalids + accuracy_class*(1-invalids)
        }
        
        enc_val_dict[cat] = enc_val
        enc_cat_dict[cat] = enc_cat
        
    return enc_val_dict, enc_cat_dict, metrics


In [2]:
class Net1(nn.Module):
    def __init__(self, input_size, output_size):
        super(Net1, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

        # dropout
        self.dropout = nn.Dropout(0.2)


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
    
class WideNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(WideNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class AttentionNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(AttentionNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.attention = nn.Linear(64, 1)  # Single scalar for attention
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        attention_weights = torch.softmax(self.attention(x), dim=0)
        x = x * attention_weights  # Apply attention
        x = self.fc3(x)
        return x


class ResNet2(nn.Module):
    def __init__(self, input_size, output_size):
        super(ResNet2, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        res = x
        x = torch.relu(self.bn2(self.fc2(x)))
        x += res
        x = self.dropout(x)
        x = self.fc3(x)
        return x


In [None]:
experiments = {
    'Net1_no_metadata_catboost': {"Model": Net1, "use_metadata_classifier": False, "binary_model": "catboost"},
    'Net1_no_metadata_xgboost': {"Model": Net1, "use_metadata_classifier": False, "binary_model": "xgboost"},
    'Net1_metadata_catboost': {"Model": Net1, "use_metadata_classifier": True, "binary_model": "catboost"},
    'Net1_metadata_xgboost': {"Model": Net1, "use_metadata_classifier": True, "binary_model": "xgboost"},
}

overall_metrics = {}
overall_accuracy = {}

print(f"using device: {device}")

for exp in experiments:
    try:
        print(f"{'='*10} Experiment: {exp} {'='*10}")
        enc_val_dic, enc_cat_dic, metrics = train_and_save(**experiments[exp])
        print()
        overall_metrics[exp] = metrics
        overall_accuracy[exp] = sum([x['total_accuracy'] for x in metrics.values()]) / len(metrics)

        print(f"\n{'='*30} Experiment {exp} done, overall accuracy: {overall_accuracy[exp]} {'='*30}\n")

    except Exception as e:
        print(f"Error in experiment {exp}: {e}")
        overall_metrics[exp] = None



using device: cuda


  from .autonotebook import tqdm as notebook_tqdm


cane_height_type done
Valid/Invalid accuracy: 0.9998373587053753, number of invalids: 0.9917051590657732
classification accuracy: 0.9313725490196079, number of valids: 0.008294840934226833
Total accuracy: 0.9992694539994397
closure_placement done
Valid/Invalid accuracy: 0.9481987476620314, number of invalids: 0.537050289506213
classification accuracy: 0.9278060776392061, number of valids: 0.46294971049378697
Total accuracy: 0.9387579669787691
heel_shape_type done
Valid/Invalid accuracy: 0.9996747174107505, number of invalids: 0.9628521241298549
classification accuracy: 0.9540481400437637, number of valids: 0.03714787587014512
Total accuracy: 0.9979797869783421
knit_structure done
Valid/Invalid accuracy: 0.9987801902903147, number of invalids: 0.84438227831631
classification accuracy: 0.9111807732497388, number of valids: 0.15561772168369004
Total accuracy: 0.9851481685896409
length_type done
Valid/Invalid accuracy: 0.9421810197609173, number of invalids: 0.15800858760002603
classificat

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




cane_height_type done
Valid/Invalid accuracy: 0.9998373587053753, number of invalids: 0.9917051590657732
classification accuracy: 0.9313725490196079, number of valids: 0.008294840934226833
Total accuracy: 0.9992694539994397
closure_placement done
Valid/Invalid accuracy: 0.9769049361632919, number of invalids: 0.537050289506213
classification accuracy: 0.9286843492007729, number of valids: 0.46294971049378697
Total accuracy: 0.9545812293891534
heel_shape_type done
Valid/Invalid accuracy: 0.9996747174107505, number of invalids: 0.9628521241298549
classification accuracy: 0.9606126914660832, number of valids: 0.03714787587014512
Total accuracy: 0.9982236461197216
knit_structure done
Valid/Invalid accuracy: 0.9989428315849395, number of invalids: 0.84438227831631
classification accuracy: 0.9148380355276907, number of valids: 0.15561772168369004
Total accuracy: 0.985854634839839
length_type done
Valid/Invalid accuracy: 0.971863056029926, number of invalids: 0.15800858760002603
classificatio

In [None]:
overall_accuracy = {exp: sum([metrics[cat]['total_accuracy'] for cat in metrics]) / len(metrics) for exp, metrics in overall_metrics.items() if metrics is not None}
overall_accuracy

{'Net1_no_metadata': 0.8727587020318133,
 'Net1_metadata': 0.8729628044015801,
 'WideNet_no_metadata': 0.8780592104493408,
 'WideNet_metadata': 0.8781274567441822,
 'AttentionNet_no_metadata': 0.6007183214147359,
 'AttentionNet_metadata': 0.5959071808307935}

In [None]:
from embedding_manager import *
def get_df_processed(data, attr_slice, N_min=5, embedding_kind="long"):
    df = data[["cod_modelo_color", attr_slice]].copy()

    df = add_embeddings_to_df(df)
    df = add_attr_sim(df, embedding_kind=embedding_kind)
    df = add_subattr_sim(df, attr_slice, embedding_kind=embedding_kind)

    df = df.drop(columns=["cod_modelo_color"])

    # drop all attributes that appear less than N_min times
    drop_vals = df[attr_slice].value_counts()[df[attr_slice].value_counts() < N_min].index
    df = df[~df[attr_slice].isin(drop_vals)]

    return df

In [None]:
test_data = pd.read_csv('data/test_data.csv')
test_data.head()

Unnamed: 0,cod_modelo_color,des_filename,cod_color,des_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,attribute_name,test_id
0,88_49711373,88_49711373_67080432-99_.jpg,99,NEGRO,Female,Adult,WOMAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Boots,cane_height_type,88_49711373_cane_height_type
1,88_49718802,88_49718802_67030656-99_.jpg,99,NEGRO,Male,Adult,MAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Ankle Boots,cane_height_type,88_49718802_cane_height_type
2,88_49709572,88_49709572_67030418-01_B.jpg,1,BLANCO,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49709572_cane_height_type
3,88_49722701,88_49722701_67066002-02_.jpg,2,OFFWHITE,Female,Baby,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49722701_cane_height_type
4,88_49724926,88_49724926_67056330-02_B.jpg,2,OFFWHITE,Male,Newborn,KIDS,WOVEN,Tops,Shirts,Shirt,Shirt,cane_height_type,88_49724926_cane_height_type


In [None]:
test_data = test_data[["cod_modelo_color", "attribute_name", "test_id"]]

In [None]:
test_data.head()

Unnamed: 0,cod_modelo_color,attribute_name,test_id
0,88_49711373,cane_height_type,88_49711373_cane_height_type
1,88_49718802,cane_height_type,88_49718802_cane_height_type
2,88_49709572,cane_height_type,88_49709572_cane_height_type
3,88_49722701,cane_height_type,88_49722701_cane_height_type
4,88_49724926,cane_height_type,88_49724926_cane_height_type


In [None]:
test_data = add_embeddings_to_df(test_data)

In [None]:
test_data.shape

(71819, 4)

In [None]:
test_data.head()

Unnamed: 0,cod_modelo_color,attribute_name,test_id,embedding
0,88_49711373,cane_height_type,88_49711373_cane_height_type,"[-0.029145204, -0.21737033, 1.058375, 0.636950..."
1,88_49718802,cane_height_type,88_49718802_cane_height_type,"[0.17777005, -0.2483542, 1.0272813, 0.82432556..."
2,88_49709572,cane_height_type,88_49709572_cane_height_type,"[-0.12836745, 0.033788733, 0.1546534, -0.42457..."
3,88_49722701,cane_height_type,88_49722701_cane_height_type,"[0.15622969, 1.0889915, 0.3226521, -0.3091727,..."
4,88_49724926,cane_height_type,88_49724926_cane_height_type,"[0.061679937, 0.101062894, 0.2176968, -0.86592..."


In [None]:
# transform nan values in embeddings to  empty list
test_data['embedding'] = test_data['embedding'].apply(lambda x: [] if x is np.nan else x)
# flatten the embeddings
test_data = pd.concat([test_data, pd.DataFrame(test_data['embedding'].values.tolist())], axis=1)
test_data = test_data.drop('embedding', axis=1)

In [None]:
test_data.head()

Unnamed: 0,cod_modelo_color,attribute_name,test_id,0,1,2,3,4,5,6,...,502,503,504,505,506,507,508,509,510,511
0,88_49711373,cane_height_type,88_49711373_cane_height_type,-0.029145,-0.21737,1.058375,0.636951,-0.299452,0.407804,0.232239,...,-0.306208,-0.983478,-0.392704,0.056684,0.186838,0.021899,0.345426,0.139764,0.356491,-0.047467
1,88_49718802,cane_height_type,88_49718802_cane_height_type,0.17777,-0.248354,1.027281,0.824326,-0.158293,-0.046536,0.31792,...,-0.250989,-1.183641,-0.374794,-0.714303,0.398507,0.204974,-0.083016,-0.197524,0.534386,-0.217184
2,88_49709572,cane_height_type,88_49709572_cane_height_type,-0.128367,0.033789,0.154653,-0.424574,0.345731,-0.196093,-0.812618,...,-0.194153,-0.384735,-0.201257,-0.770077,0.747774,0.342686,0.180367,0.30984,0.522051,-0.009449
3,88_49722701,cane_height_type,88_49722701_cane_height_type,0.15623,1.088992,0.322652,-0.309173,0.281805,0.330705,-0.60836,...,-0.230646,0.206264,-0.143281,-0.503224,0.58427,0.275445,0.355704,0.523011,0.370885,0.018902
4,88_49724926,cane_height_type,88_49724926_cane_height_type,0.06168,0.101063,0.217697,-0.86593,0.345202,-0.088818,-0.446839,...,-0.118073,-0.007982,-0.042167,-0.964999,1.115877,0.066354,0.15177,0.079699,0.291407,-0.132981


In [None]:
_, categories = create_train_dataset()

In [None]:
test_data[categories] = [0] * len(categories)

In [None]:
test_data.head()

Unnamed: 0,cod_modelo_color,attribute_name,test_id,0,1,2,3,4,5,6,...,closure_placement,heel_shape_type,knit_structure,length_type,neck_lapel_type,silhouette_type,sleeve_length_type,toecap_type,waist_type,woven_structure
0,88_49711373,cane_height_type,88_49711373_cane_height_type,-0.029145,-0.21737,1.058375,0.636951,-0.299452,0.407804,0.232239,...,0,0,0,0,0,0,0,0,0,0
1,88_49718802,cane_height_type,88_49718802_cane_height_type,0.17777,-0.248354,1.027281,0.824326,-0.158293,-0.046536,0.31792,...,0,0,0,0,0,0,0,0,0,0
2,88_49709572,cane_height_type,88_49709572_cane_height_type,-0.128367,0.033789,0.154653,-0.424574,0.345731,-0.196093,-0.812618,...,0,0,0,0,0,0,0,0,0,0
3,88_49722701,cane_height_type,88_49722701_cane_height_type,0.15623,1.088992,0.322652,-0.309173,0.281805,0.330705,-0.60836,...,0,0,0,0,0,0,0,0,0,0
4,88_49724926,cane_height_type,88_49724926_cane_height_type,0.06168,0.101063,0.217697,-0.86593,0.345202,-0.088818,-0.446839,...,0,0,0,0,0,0,0,0,0,0


In [None]:
test_data.columns = test_data.columns.astype(str)
test_data.fillna(0, inplace=True)

for cat in categories:
    with open(f'models/{cat}_valid_invalid.pkl', 'rb') as f:
        model = pkl.load(f)
        
        
    X_data = test_data.drop(['cod_modelo_color', 'attribute_name', 'test_id'] + list(categories), axis=1)
    Y_data = model.predict(X_data)
    
    Y_data = enc_val_dic[cat].inverse_transform(Y_data)
    
    test_data[cat] = Y_data
    

ValueError: feature_names mismatch: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '220', '221', '222', '223', '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234', '235', '236', '237', '238', '239', '240', '241', '242', '243', '244', '245', '246', '247', '248', '249', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259', '260', '261', '262', '263', '264', '265', '266', '267', '268', '269', '270', '271', '272', '273', '274', '275', '276', '277', '278', '279', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '290', '291', '292', '293', '294', '295', '296', '297', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378', '379', '380', '381', '382', '383', '384', '385', '386', '387', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '410', '411', '412', '413', '414', '415', '416', '417', '418', '419', '420', '421', '422', '423', '424', '425', '426', '427', '428', '429', '430', '431', '432', '433', '434', '435', '436', '437', '438', '439', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '510', '511', 'des_product_family', 'des_line', 'des_fabric'] ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '220', '221', '222', '223', '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234', '235', '236', '237', '238', '239', '240', '241', '242', '243', '244', '245', '246', '247', '248', '249', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259', '260', '261', '262', '263', '264', '265', '266', '267', '268', '269', '270', '271', '272', '273', '274', '275', '276', '277', '278', '279', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '290', '291', '292', '293', '294', '295', '296', '297', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378', '379', '380', '381', '382', '383', '384', '385', '386', '387', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '410', '411', '412', '413', '414', '415', '416', '417', '418', '419', '420', '421', '422', '423', '424', '425', '426', '427', '428', '429', '430', '431', '432', '433', '434', '435', '436', '437', '438', '439', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '510', '511']
expected des_fabric, des_product_family, des_line in input data

In [None]:
attri_to_size = {
    "cane_height_type": 6,
    "closure_placement": 6,
    "heel_shape_type": 11,
    "knit_structure": 5,
    "length_type": 12,
    "neck_lapel_type": 33,
    "silhouette_type": 33,
    "sleeve_length_type": 6,
    "toecap_type": 4,
    "waist_type": 4,
    "woven_structure": 4
}


In [None]:
for cat in categories:
    net = Net(512, attri_to_size[cat])
    net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
    net.eval()

    X_data = test_data[test_data[cat] != 'INVALID'].drop(['cod_modelo_color', 'attribute_name', 'test_id'] + list(categories), axis=1)
    
    X_data = torch.tensor(X_data.values, dtype=torch.float32)
    outputs = net(X_data)
    _, predicted = torch.max(outputs, 1)
    
    # change the encoded values back to the original values
    test_data.loc[test_data[cat] != 'INVALID', cat] = enc_dic[cat].inverse_transform(predicted.numpy())
    
    # test_data.loc[test_data[cat] == 'VALID', cat] = predicted.numpy()
    

  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))
  net.load_state_dict(torch.load(f'models/{cat}_classifier.pt'))


In [None]:
test_data.head(20)

Unnamed: 0,cod_modelo_color,attribute_name,test_id,0,1,2,3,4,5,6,...,closure_placement,heel_shape_type,knit_structure,length_type,neck_lapel_type,silhouette_type,sleeve_length_type,toecap_type,waist_type,woven_structure
0,88_49711373,cane_height_type,88_49711373_cane_height_type,-0.029145,-0.21737,1.058375,0.636951,-0.299452,0.407804,0.232239,...,INVALID,Plano,INVALID,INVALID,INVALID,INVALID,INVALID,Redonda,INVALID,INVALID
1,88_49718802,cane_height_type,88_49718802_cane_height_type,0.17777,-0.248354,1.027281,0.824326,-0.158293,-0.046536,0.31792,...,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID
2,88_49709572,cane_height_type,88_49709572_cane_height_type,-0.128367,0.033789,0.154653,-0.424574,0.345731,-0.196093,-0.812618,...,INVALID,INVALID,INVALID,Standard,Redondo,Recto,Corta,INVALID,INVALID,Ligero
3,88_49722701,cane_height_type,88_49722701_cane_height_type,0.15623,1.088992,0.322652,-0.309173,0.281805,0.330705,-0.60836,...,INVALID,INVALID,INVALID,Corto,INVALID,Recto,Corta,INVALID,INVALID,Ligero
4,88_49724926,cane_height_type,88_49724926_cane_height_type,0.06168,0.101063,0.217697,-0.86593,0.345202,-0.088818,-0.446839,...,INVALID,INVALID,INVALID,Standard,Redondo,Regular,Corta,INVALID,INVALID,Ligero
5,88_49716781,cane_height_type,88_49716781_cane_height_type,0.100022,0.694086,1.676687,-0.228202,0.273254,-0.192943,-0.269248,...,INVALID,INVALID,Punto grueso,Standard,INVALID,Slim,INVALID,INVALID,INVALID,INVALID
6,88_49714943,cane_height_type,88_49714943_cane_height_type,-0.493105,-1.111359,0.924655,-0.322128,-0.015221,-0.140814,-0.389272,...,INVALID,INVALID,INVALID,Largo,INVALID,Slim,Larga,INVALID,INVALID,INVALID
7,88_49708509,cane_height_type,88_49708509_cane_height_type,-0.105043,-0.161602,0.612901,-0.440658,-0.219802,-0.039285,0.068691,...,INVALID,INVALID,INVALID,Largo,INVALID,Evase,INVALID,INVALID,INVALID,INVALID
8,88_49725582,cane_height_type,88_49725582_cane_height_type,0.517701,0.135636,0.796815,0.214312,0.081013,0.364985,0.105662,...,INVALID,INVALID,INVALID,Largo,INVALID,Recto,INVALID,INVALID,Regular Waist,INVALID
9,88_49728896,cane_height_type,88_49728896_cane_height_type,0.030769,0.464083,1.665426,-0.448196,-0.05131,0.121785,-0.175868,...,INVALID,INVALID,Punto medio,Crop,Pico,Recto,Tirante Ancho,INVALID,INVALID,INVALID


In [None]:
subm = pd.read_csv('data/test_data.csv')
subm.head()

Unnamed: 0,cod_modelo_color,des_filename,cod_color,des_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,attribute_name,test_id
0,88_49711373,88_49711373_67080432-99_.jpg,99,NEGRO,Female,Adult,WOMAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Boots,cane_height_type,88_49711373_cane_height_type
1,88_49718802,88_49718802_67030656-99_.jpg,99,NEGRO,Male,Adult,MAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Ankle Boots,cane_height_type,88_49718802_cane_height_type
2,88_49709572,88_49709572_67030418-01_B.jpg,1,BLANCO,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49709572_cane_height_type
3,88_49722701,88_49722701_67066002-02_.jpg,2,OFFWHITE,Female,Baby,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49722701_cane_height_type
4,88_49724926,88_49724926_67056330-02_B.jpg,2,OFFWHITE,Male,Newborn,KIDS,WOVEN,Tops,Shirts,Shirt,Shirt,cane_height_type,88_49724926_cane_height_type


In [None]:
subm = subm[["cod_modelo_color", "attribute_name", "test_id"]]

In [None]:
subm.head()

Unnamed: 0,cod_modelo_color,attribute_name,test_id
0,88_49711373,cane_height_type,88_49711373_cane_height_type
1,88_49718802,cane_height_type,88_49718802_cane_height_type
2,88_49709572,cane_height_type,88_49709572_cane_height_type
3,88_49722701,cane_height_type,88_49722701_cane_height_type
4,88_49724926,cane_height_type,88_49724926_cane_height_type


In [None]:
output_file = 'submissions/submission_jon.csv'

with open(output_file, 'w') as f:
    f.write('test_id,des_value\n')
    for i, row in subm.iterrows():
        f.write(f'{row["test_id"]},')
        f.write(f'{test_data.loc[test_data["cod_modelo_color"] == row["cod_modelo_color"], row["attribute_name"]].values[0]}\n')

In [None]:
output_file = 'submissions/submission_jon.csv'
temp = pd.read_csv(output_file, encoding='latin-1')

In [None]:
temp.to_csv(output_file, index=False, encoding="utf-8")

In [None]:
pd.read_csv(output_file)

Unnamed: 0,test_id,des_value
0,88_49711373_cane_height_type,Alta
1,88_49718802_cane_height_type,Baja
2,88_49709572_cane_height_type,INVALID
3,88_49722701_cane_height_type,INVALID
4,88_49724926_cane_height_type,INVALID
...,...,...
71814,88_49727540_knit_structure,INVALID
71815,88_49733648_knit_structure,INVALID
71816,88_49735572_knit_structure,INVALID
71817,88_49713624_knit_structure,INVALID
