# Baseline

## Imports

In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import os
import re
import ast
import joblib
from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from sentence_transformers import SentenceTransformer

from lightautoml.tasks import Task
import lightautoml
from lightautoml.automl.presets.tabular_presets import TabularAutoML

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preprocess atributes 

In [5]:
def process_attributes(attributes, test):
    attributes = attributes[(attributes.variantid.isin(test.variantid1) | attributes.variantid.isin(test.variantid2))]
    model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
    attributes['characters'] = attributes.characteristic_attributes_mapping.progress_apply(lambda x: model.encode(x))
    attributes[['variantid', 'characters']].to_parquet('data/processed/attributes.parquet')

## Preprocess text

In [None]:
def clean_description(text, model, tokenizer):
    text = re.sub(r'<[^>]+>', '', text)
    emoji_pattern = re.compile("[" 
                               u"\U0001F600-\U0001F64F"  
                               u"\U0001F300-\U0001F5FF"  
                               u"\U0001F680-\U0001F6FF"  
                               u"\U0001F1E0-\U0001F1FF"  
                               u"\U00002702-\U000027B0"  
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = re.sub(r'[$#@]', '', text)
    text = re.sub(r'[^\w\s\-.]', '', text)
    
    return model.encode(text)

def process_text(df, test):
    model = SentenceTransformer("cointegrated/rubert-tiny2")

    df = df[(df.variantid.isin(test.variantid1) | df.variantid.isin(test.variantid2))].fillna('')
    df.description = df.description.apply(lambda x: clean_description(x, model))
    
    df[['variantid', 'name_bert_64', 'description']].to_parquet('data/processed/text.parquet')

## Preprocess images

In [None]:
def process_resnet(resnet, test):
    resnet = resnet[(resnet.variantid.isin(test.variantid1) | resnet.variantid.isin(test.variantid2))].fillna('')
    
    resnet['len_emb_not_main'] = resnet.pic_embeddings_resnet_v1.apply(lambda x: len(x))   
    resnet['pca_not_main'] = resnet.apply(lambda x: pca_transform(x.pic_embeddings_resnet_v1) if x.len_emb_not_main > 0 else np.zeros(128), axis=1)
    resnet.main_pic_embeddings_resnet_v1 = resnet.main_pic_embeddings_resnet_v1.apply(lambda x: x[0])
    resnet[['variantid', 'main_pic_embeddings_resnet_v1', 'pca_not_main']].to_parquet('data/processed/resnet.parquet')

def pca_transform(row):
    n = row.shape[0]
    x = np.concatenate(row, axis=0).reshape(n, 128).T
    pca = PCA(n_components=1)
    x_transformed = pca.fit_transform(x)
    
    return np.concatenate(x_transformed)

## Make data

In [None]:
attributes = pd.read_parquet('data/raw/attributes.parquet')
resnet = pd.read_parquet('data/raw/resnet.parquet')
text_and_bert = pd.read_parquet('data/raw/text_and_bert.parquet')

train = pd.read_parquet('data/raw/train.parquet')
test = pd.read_parquet('data/raw/test.parquet')
train = train[~((train.variantid1.isin(test.variantid1)) | (train.variantid2.isin(test.variantid2)))]
train.to_parquet('data/processed/train.parquet')

In [None]:
process_resnet(resnet, test)
process_text(text_and_bert, test)
process_attributes(attributes, test)

## Merge data

In [None]:
attributes = pd.read_parquet('data/processed/attributes.parquet')
resnet = pd.read_parquet('data/processed/resnet.parquet')
text_and_bert = pd.read_parquet('data/processed/text.parquet')

train = pd.read_parquet('data/processed/train.parquet')

In [4]:
temp1 = text_and_bert.rename(columns={'variantid': 'variantid1', 'description': 'description1', 'name_bert_64': 'name1'})
temp1 = train.merge(temp1, how='inner', on='variantid1')
temp2 = text_and_bert.rename(columns={'variantid': 'variantid2', 'description': 'description2', 'name_bert_64': 'name2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [5]:
temp1 = resnet.rename(columns={'variantid': 'variantid1', 'main_pic_embeddings_resnet_v1': 'main1', 'pca_not_main': 'not_main1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')
temp2 = resnet.rename(columns={'variantid': 'variantid2', 'main_pic_embeddings_resnet_v1': 'main2', 'pca_not_main': 'not_main2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [6]:
temp1 = attributes.rename(columns={'variantid': 'variantid1', 'characters': 'characters1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')
temp2 = attributes.rename(columns={'variantid': 'variantid2', 'characters': 'characters2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [7]:
df.to_parquet('data/processed/merged.parquet')

## Multimodal model -> binary classification

### Dataloaders

In [None]:
df = df.read_parquet('data/processed/merged.parquet')

In [7]:
class MultimodalDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.n = len(df)

    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        cur_df = self.df.iloc[idx]
        main1, main2 = torch.Tensor(cur_df.main1), torch.Tensor(cur_df.main2) #main images
        not_main1, not_main2 = torch.Tensor(cur_df.not_main1), torch.Tensor(cur_df.not_main2) # not main images
        attr1, attr2 = torch.tensor(cur_df.characters1), torch.tensor(cur_df.characters2) # attributes
        description1, description2 = torch.Tensor(cur_df.description1), torch.Tensor(cur_df.description2) # descriptions
        name1, name2 = torch.Tensor(cur_df.name1), torch.Tensor(cur_df.name2) # names

        labels = torch.tensor(cur_df.target)
        
        return main1, main2, not_main1, not_main2, attr1, attr2, description1, description2, name1, name2, labels

In [8]:
image_size = 128
description_size = 312
name_size = 64
attribute_size = 512
embedding_size = 256

In [9]:
train_ratio, validation_ratio, test_ratio = 0.75, 0.15, 0.10

train, test = train_test_split(df, test_size=(1 - train_ratio), stratify=df.target)
val, test = train_test_split(test, test_size=test_ratio/(test_ratio + validation_ratio), stratify=test.target)

train_dataset, val_dataset, test_dataset = MultimodalDataset(train), MultimodalDataset(val), MultimodalDataset(test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Model params and architecture

In [10]:
class MultimodalModel(nn.Module):
    def __init__(self, image_size=128, description_size=312, name_size=64, attribute_size=512, embedding_size=256, batch_size=32):
        super(MultimodalModel, self).__init__()
        
        self.main_image = nn.Sequential( nn.Linear(image_size, embedding_size), nn.ReLU() ) # [b, 128] -> [b, 256] -> [b, 256, 1]
        self.not_main_image = nn.Sequential( nn.Linear(image_size, embedding_size), nn.ReLU() ) # [b, 128] -> [b, 256] -> [b, 256, 1]
        self.image_embedding = nn.Sequential( nn.Conv1d(embedding_size, 128, kernel_size=2), nn.ReLU() ) # [b, 256, 2] -> [b, 128, 1]

        self.name = nn.Sequential( nn.Linear(name_size, embedding_size), nn.ReLU() ) # [b, 64] -> [b, 256] -> [b, 256, 1]
        self.description = nn.Sequential( nn.Linear(description_size, embedding_size), nn.ReLU() ) # [b, 312] -> [b, 256] -> [b, 256, 1]
        self.text_embedding = nn.Sequential( nn.Conv1d(embedding_size, 128, kernel_size=2), nn.ReLU() ) # [b, 256, 2] -> [b, 128, 1]
        
        self.tabular_embedding = nn.Sequential(
            nn.Linear(attribute_size, embedding_size), # [b, 512] -> [b, 256]
            nn.ReLU(),
            nn.Linear(embedding_size, 128), # [b, 256] -> [b, 128]
            nn.ReLU(),
        ) # # [b, 128] -> [b, 128, 1]
        
        self.embedding = nn.Sequential(
            nn.Conv1d(128, 64, kernel_size=3), # [b, 128, 3] ->  [b, 64, 1] -> [b, 64]
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(64*2, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 2)
        )
            
    def forward(self, main1, main2,
                not_main1, not_main2,
                attr1, attr2, description1,
                description2, name1, name2):

        main_img_emb1, main_img_emb2 = torch.unsqueeze(self.main_image(main1), 2), torch.unsqueeze(self.main_image(main2), 2)
        img_emb1, img_emb2 = torch.unsqueeze(self.not_main_image(not_main1), 2), torch.unsqueeze(self.not_main_image(not_main2), 2)
        
        image_emb1 = self.image_embedding(torch.cat((main_img_emb1, img_emb1), dim=2))
        image_emb2 = self.image_embedding(torch.cat((main_img_emb2, img_emb2), dim=2))

        name_emb1, name_emb2 = torch.unsqueeze(self.name(name1), 2), torch.unsqueeze(self.name(name2), 2)
        desc_emb1, desc_emb2 = torch.unsqueeze(self.description(description1), 2), torch.unsqueeze(self.description(description2), 2)
        text_emb1 = self.text_embedding(torch.cat((name_emb1, desc_emb1), dim=2))
        text_emb2 = self.text_embedding(torch.cat((name_emb2, desc_emb2), dim=2))

        tab_emb1, tab_emb2 = torch.unsqueeze(self.tabular_embedding(attr1), 2), torch.unsqueeze(self.tabular_embedding(attr2), 2)
                
        combined1 = self.embedding(torch.cat((image_emb1, text_emb1, tab_emb1), dim=2))
        combined2 = self.embedding(torch.cat((image_emb2, text_emb2, tab_emb2), dim=2))
                                   
        combined = torch.cat((torch.squeeze(combined1, 2), torch.squeeze(combined2, 2)), 1)
        return self.fc(combined)

In [14]:
# def contrastive_loss(distance, label, margin=1.0):
#     loss = (1 - label) * torch.pow(distance, 2) + \
#            (label) * torch.pow(torch.clamp(margin - distance, min=0.0), 2)
#     return torch.mean(loss)

In [14]:
model = MultimodalModel().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()
scheduler1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)

### Training 

In [15]:
num_epochs = 10

for epoch in range(num_epochs):
    train_auc, valid_auc = 0.0, 0.0
    train_losses, validation_losses = 0.0, 0.0
    
    n1, n2 = len(train_loader), len(val_loader)
    print(f'Epoch {epoch}')
    model.train()
    
    for i, data in enumerate(tqdm(train_loader)):
        main1, main2, not_main1, not_main2, \
        attr1, attr2, description1, description2, \
        name1, name2, labels = data
        
        optimizer.zero_grad()
        output = model(main1.to(device), main2.to(device),
                not_main1.to(device), not_main2.to(device),
                attr1.to(device), attr2.to(device), description1.to(device),
                description2.to(device), name1.to(device), name2.to(device))

        #distance = nn.functional.pairwise_distance(output1, output2)
        loss = criterion(output, labels.to(device))
        loss.backward()
        optimizer.step()
        
        y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
        preds = (y_pred > 0.5).astype(int)
        precision, recall, _ = precision_recall_curve(labels, preds)
        train_auc += auc(recall, precision)
        train_losses += loss.item()
        
    print(f'Training PR-AUC: {train_auc / n1}. Loss: {train_losses / n1}')
    
    model.eval()
    
    with torch.no_grad():
        for i, data in enumerate(tqdm(val_loader)):
            main1, main2, not_main1, not_main2, \
            attr1, attr2, description1, description2, \
            name1, name2, labels = data
            
            output = model(main1.to(device), main2.to(device),
                not_main1.to(device), not_main2.to(device),
                attr1.to(device), attr2.to(device), description1.to(device),
                description2.to(device), name1.to(device), name2.to(device))
            
            loss = criterion(output, labels.to(device))
            y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
            preds = (y_pred > 0.5).astype(int)
            precision, recall, _ = precision_recall_curve(labels, preds)
            valid_auc += auc(recall, precision)
            validation_losses += loss.item()

    print(f'Validation PR-AUC: {valid_auc / n2}. Loss: {validation_losses / n2}')

    scheduler1.step()
    scheduler2.step()

Epoch 0


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8122291663285738. Loss: 0.5001284942341234


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8383350108600488. Loss: 0.45312686761583254
Epoch 1


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8394318343247825. Loss: 0.4511466701186657


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8428385424944361. Loss: 0.4368260700989257
Epoch 2


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8470553378927832. Loss: 0.4341748803712436


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8457451212061587. Loss: 0.42841810517756224
Epoch 3


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8531292842151008. Loss: 0.4194670418698639


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.852860141901173. Loss: 0.42525227484386857
Epoch 4


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8587729179645414. Loss: 0.4060947069344967


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8500946700083609. Loss: 0.4201602909385962
Epoch 5


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.864624340527097. Loss: 0.3924646028319893


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8512774420373559. Loss: 0.4156061605920224
Epoch 6


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8720175099495444. Loss: 0.37505712212874237


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8618935861776558. Loss: 0.3985903608208814
Epoch 7


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8790522485556045. Loss: 0.3588877206657029


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8647188369933233. Loss: 0.39614918848699054
Epoch 8


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8840405627434961. Loss: 0.3452025564528483


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8699260380255219. Loss: 0.3942302744733534
Epoch 9


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.888389055406888. Loss: 0.3340813013032531


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8682331856774359. Loss: 0.3964038596161535


In [16]:
torch.save(model.state_dict(), 'binary_classification.pth')

### Test

In [17]:
test_auc = 0
test_losses = 0

with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader)):
        main1, main2, not_main1, not_main2, \
        attr1, attr2, description1, description2, \
        name1, name2, labels = data
        
        output = model(main1.to(device), main2.to(device),
            not_main1.to(device), not_main2.to(device),
            attr1.to(device), attr2.to(device), description1.to(device),
            description2.to(device), name1.to(device), name2.to(device))
        
        loss = criterion(output, labels.to(device))
        y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
        preds = (y_pred > 0.5).astype(int)
        precision, recall, _ = precision_recall_curve(labels, preds)
        test_auc += auc(recall, precision)
        test_losses += loss.item()

n3 = len(test_loader)
print(f'Test PR-AUC: {test_auc / n3}. Loss: {test_losses / n3}')

  0%|          | 0/1825 [00:00<?, ?it/s]

## LAMA

### Make distances

In [None]:
def dot_product(emb1, emb2, length, n):
    dot = []
    for i in range(n):
        dot.append(np.dot(emb1[i], emb2[i]) / np.sqrt(length))
    return np.array(dot)

In [None]:
def distances(df):
    n = len(df)
    len_image = 128
    len_description = 312
    len_name = 64
    len_chars = 512

    emb1 = np.concatenate(df.main1.to_numpy(), axis=0).reshape(n, len_image)
    emb2 = np.concatenate(df.main2.to_numpy(), axis=0).reshape(n, len_image)
    embedding1, embedding2 = torch.Tensor(emb1), torch.Tensor(emb2)
    df['main_cos_distance'] = F.cosine_similarity(embedding1, embedding2).numpy()
    df['main_eucl_distance'] = F.pairwise_distance(embedding1, embedding2).numpy()
    df['main_dot_distance'] = dot_product(emb1, emb2, len_image, n)

    emb1 = np.concatenate(df.not_main1.to_numpy(), axis=0).reshape(n, len_image)
    emb2 = np.concatenate(df.not_main2.to_numpy(), axis=0).reshape(n, len_image)
    embedding1, embedding2 = torch.Tensor(emb1), torch.Tensor(emb2)
    df['not_main_cos_distance'] = F.cosine_similarity(embedding1, embedding2).numpy()
    df['not_main_eucl_distance'] = F.pairwise_distance(embedding1, embedding2).numpy()
    df['not_main_dot_distance'] = dot_product(emb1, emb2, len_image, n)
    
    emb1 = np.concatenate(df.name1.to_numpy(), axis=0).reshape(n, len_name)
    emb2 = np.concatenate(df.name2.to_numpy(), axis=0).reshape(n, len_name)
    name1, name2 = torch.Tensor(emb1), torch.Tensor(emb2)
    df['name_cos_distance'] =  F.cosine_similarity(name1, name2).numpy()
    df['name_eucl_distance'] = F.pairwise_distance(name1, name2).numpy()
    df['name_dot_distance'] = dot_product(emb1, emb2, len_name, n)

    emb1 = np.concatenate(df.description1.to_numpy(), axis=0).reshape(n, len_description)
    emb2 = np.concatenate(df.description2.to_numpy(), axis=0).reshape(n, len_description)
    description1, description2 = torch.Tensor(emb1), torch.Tensor(emb2)
    df['description_cos_distance'] = F.cosine_similarity(description1, description2).numpy()
    df['description_eucl_distance'] = F.pairwise_distance(description1, description2).numpy()
    df['description_dot_distance'] = dot_product(emb1, emb2, len_description, n)

    emb1 = np.concatenate(df.characters1.to_numpy(), axis=0).reshape(n, len_chars)
    emb2 = np.concatenate(df.characters2.to_numpy(), axis=0).reshape(n, len_chars)
    chars1, chars2 = torch.Tensor(emb1), torch.Tensor(emb2)
    df['chars_cos_distance'] = F.cosine_similarity(description1, description2).numpy()
    df['chars_eucl_distance'] = F.pairwise_distance(description1, description2).numpy()
    df['chars_dot_distance'] = dot_product(emb1, emb2, len_chars, n)

    return df[['variantid1', 'variantid2', 'target', 'main_cos_distance', 'main_eucl_distance', 'main_dot_distance', 'not_main_cos_distance', 'not_main_eucl_distance', 'not_main_dot_distance', \
              'name_cos_distance', 'name_eucl_distance', 'name_dot_distance', 'description_cos_distance', 'description_eucl_distance', 'description_dot_distance', \
              'chars_cos_distance', 'chars_eucl_distance', 'chars_dot_distance']]

In [None]:
df = df.read_parquet('data/processed/merged.parquet')
df = distances(df)

### Train LightAutoml

In [23]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TARGET_NAME = 'target'

In [24]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [32]:
train_data, test_data = train_test_split(
    df,
    test_size=TEST_SIZE,
    stratify=df['target'],
    random_state=RANDOM_STATE
)

In [33]:
columns = train_data.columns.values[3:]
scaler = StandardScaler()

train_data[columns] = scaler.fit_transform(train_data[columns])
test_data[columns] = scaler.fit_transform(test_data[columns])

In [34]:
roles = {'target': 'target',
        'drop': ['variantid1', 'variantid2']}

task = Task('binary', loss='logloss', metric='auc')

In [None]:
automl = TabularAutoML(
    task=task,
    gpu_ids='0',
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

In [None]:
train_predictions = automl.fit_predict(train_data, roles=roles, verbose=1)
test_predictions = automl.predict(test_data)
not_nan = np.any(~np.isnan(train_predictions.data), axis=1)

In [None]:
train_labels, train_preds = train_data[roles['target']].values[not_nan], train_predictions.data[not_nan][:, 0]
test_labels, test_preds = test_data[roles['target']].values, test_predictions.data[:, 0]

In [None]:
joblib.dump(automl, 'LAMl.pkl')
#automl=joblib.load(‘model.pkl’)

In [None]:
train_preds = (train_preds > 0.5).astype(int)
test_preds = (test_preds > 0.5).astype(int)

In [None]:
def pr_auc(labels, preds):
    precision, recall, _ = precision_recall_curve(labels, preds)
    return auc(recall, precision)

In [None]:
print("Train PR-AUC: ", pr_auc(train_labels, train_preds))
print("Test PR-AUC: ", pr_auc(test_labels, test_preds))