# Baseline

## Imports

In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import os
import re
import ast
import joblib
from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from sentence_transformers import SentenceTransformer

from dataclasses import dataclass
import wandb
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preprocess atributes 

In [8]:
def convert_cats(row, all_cats):
    d = list(ast.literal_eval(row).values())
    all_cats.append(d)
    return d

In [21]:
attributes = pd.read_parquet('data/raw/attributes.parquet')
train = pd.read_parquet('data/raw/train.parquet')
test = pd.read_parquet('data/raw/test.parquet')
train = train[~((train.variantid1.isin(test.variantid1)) | (train.variantid2.isin(test.variantid2)))]
attributes = attributes[(attributes.variantid.isin(train.variantid1)) | (attributes.variantid.isin(train.variantid2))]

In [22]:
all_cats = []
attributes['vector'] = attributes.categories.progress_apply(lambda x: convert_cats(x, all_cats))

  0%|          | 0/2154258 [00:00<?, ?it/s]

In [5]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
attributes['characters'] = attributes.characteristic_attributes_mapping.progress_apply(lambda x: model.encode(x))

In [4]:
def cats_to_emb(row):
    return np.array([cat_dict[r] for r in row])

In [7]:
cat = list(set(np.concatenate(attributes.vector.values)))
cat_dict = {cat[i]: i for i in range(len(cat))}

In [8]:
attributes['vector'] = attributes.vector.progress_apply(cats_to_emb)
attributes[['variantid', 'vector', 'characters']].to_parquet('attributes.parquet')

  0%|          | 0/2154258 [00:00<?, ?it/s]

## Preprocess text

### make description embeddings

In [11]:
# df = text_and_bert[['variantid', 'embedding']]

In [12]:
model = SentenceTransformer("cointegrated/rubert-tiny2")
embeddings = model.encode(text['clean_desc'].tolist(), batch_size=32, show_progress_bar=True)
text['desc_embs'] = list(embeddings)

In [14]:
temp1 = text_df.rename(columns={'variantid': 'variantid1', 'embedding': 'embedding1'})
temp1 = train_df.merge(temp1, how='inner', on='variantid1')

temp2 = text_df.rename(columns={'variantid': 'variantid2', 'embedding': 'embedding2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

## Images

In [5]:
resnet_df = resnet_df[(resnet_df.variantid.isin(train_df.variantid1) | resnet_df.variantid.isin(train_df.variantid2))]

In [6]:
resnet_df = resnet_df.fillna('')

In [7]:
resnet_df['len_emb_not_main'] = resnet_df.pic_embeddings_resnet_v1.apply(lambda x: len(x))

In [21]:
resnet_df['len_emb_main'] = resnet_df.main_pic_embeddings_resnet_v1.apply(lambda x: len(x))

In [8]:
resnet_df['pic_embeddings_resnet_v1'] = resnet_df.apply(lambda x: x.pic_embeddings_resnet_v1.sum() if x.len_emb_not_main > 0 else [0], axis=1)

In [29]:
resnet_df.main_pic_embeddings_resnet_v1 = resnet_df.main_pic_embeddings_resnet_v1.apply(lambda x: x[0])

In [32]:
resnet_df['main_pic_embeddings_resnet_v1'] += resnet_df['pic_embeddings_resnet_v1']

## Multimodal

### prepare data

In [2]:
attributes = pd.read_parquet('attributes.parquet')
resnet = pd.read_parquet('resnet.parquet')
text_and_bert = pd.read_parquet('text_and_bert.parquet')

train = pd.read_parquet('data/raw/train.parquet')
test = pd.read_parquet('data/raw/test.parquet')
train = train[~((train.variantid1.isin(test.variantid1)) | (train.variantid2.isin(test.variantid2)))]

In [3]:
attributes = attributes.drop(['vector'], axis=1)

In [4]:
temp1 = text_and_bert.rename(columns={'variantid': 'variantid1', 'description': 'description1', 'name_bert_64': 'name1'})
temp1 = train.merge(temp1, how='inner', on='variantid1')
temp2 = text_and_bert.rename(columns={'variantid': 'variantid2', 'description': 'description2', 'name_bert_64': 'name2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [5]:
temp1 = resnet.rename(columns={'variantid': 'variantid1', 'main_pic_embeddings_resnet_v1': 'main1', 'pca_not_main': 'not_main1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')
temp2 = resnet.rename(columns={'variantid': 'variantid2', 'main_pic_embeddings_resnet_v1': 'main2', 'pca_not_main': 'not_main2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [6]:
temp1 = attributes.rename(columns={'variantid': 'variantid1', 'characters': 'characters1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')
temp2 = attributes.rename(columns={'variantid': 'variantid2', 'characters': 'characters2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [7]:
# df.to_parquet('merged.parquet')

### train model

In [7]:
class MultimodalDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.n = len(df)

    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        cur_df = self.df.iloc[idx]
        main1, main2 = torch.Tensor(cur_df.main1), torch.Tensor(cur_df.main2) #main images
        not_main1, not_main2 = torch.Tensor(cur_df.not_main1), torch.Tensor(cur_df.not_main2) # not main images
        attr1, attr2 = torch.tensor(cur_df.characters1), torch.tensor(cur_df.characters2) # attributes
        description1, description2 = torch.Tensor(cur_df.description1), torch.Tensor(cur_df.description2) # descriptions
        name1, name2 = torch.Tensor(cur_df.name1), torch.Tensor(cur_df.name2) # names

        labels = torch.tensor(cur_df.target)
        
        return main1, main2, not_main1, not_main2, attr1, attr2, description1, description2, name1, name2, labels

In [8]:
image_size = 128
description_size = 312
name_size = 64
attribute_size = 512
embedding_size = 256

In [9]:
train_ratio, validation_ratio, test_ratio = 0.75, 0.15, 0.10

train, test = train_test_split(df, test_size=(1 - train_ratio), stratify=df.target)
val, test = train_test_split(test, test_size=test_ratio/(test_ratio + validation_ratio), stratify=test.target)

train_dataset, val_dataset, test_dataset = MultimodalDataset(train), MultimodalDataset(val), MultimodalDataset(test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [10]:
class MultimodalModel(nn.Module):
    def __init__(self, image_size=128, description_size=312, name_size=64, attribute_size=512, embedding_size=256, batch_size=32):
        super(MultimodalModel, self).__init__()
        
        self.main_image = nn.Sequential( nn.Linear(image_size, embedding_size), nn.ReLU() ) # [b, 128] -> [b, 256] -> [b, 256, 1]
        self.not_main_image = nn.Sequential( nn.Linear(image_size, embedding_size), nn.ReLU() ) # [b, 128] -> [b, 256] -> [b, 256, 1]
        self.image_embedding = nn.Sequential( nn.Conv1d(embedding_size, 128, kernel_size=2), nn.ReLU() ) # [b, 256, 2] -> [b, 128, 1]

        self.name = nn.Sequential( nn.Linear(name_size, embedding_size), nn.ReLU() ) # [b, 64] -> [b, 256] -> [b, 256, 1]
        self.description = nn.Sequential( nn.Linear(description_size, embedding_size), nn.ReLU() ) # [b, 312] -> [b, 256] -> [b, 256, 1]
        self.text_embedding = nn.Sequential( nn.Conv1d(embedding_size, 128, kernel_size=2), nn.ReLU() ) # [b, 256, 2] -> [b, 128, 1]
        
        self.tabular_embedding = nn.Sequential(
            nn.Linear(attribute_size, embedding_size), # [b, 512] -> [b, 256]
            nn.ReLU(),
            nn.Linear(embedding_size, 128), # [b, 256] -> [b, 128]
            nn.ReLU(),
        ) # # [b, 128] -> [b, 128, 1]
        
        self.embedding = nn.Sequential(
            nn.Conv1d(128, 64, kernel_size=3), # [b, 128, 3] ->  [b, 64, 1] -> [b, 64]
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(64*2, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 2)
        )
            
    def forward(self, main1, main2,
                not_main1, not_main2,
                attr1, attr2, description1,
                description2, name1, name2):

        main_img_emb1, main_img_emb2 = torch.unsqueeze(self.main_image(main1), 2), torch.unsqueeze(self.main_image(main2), 2)
        img_emb1, img_emb2 = torch.unsqueeze(self.not_main_image(not_main1), 2), torch.unsqueeze(self.not_main_image(not_main2), 2)
        
        image_emb1 = self.image_embedding(torch.cat((main_img_emb1, img_emb1), dim=2))
        image_emb2 = self.image_embedding(torch.cat((main_img_emb2, img_emb2), dim=2))

        name_emb1, name_emb2 = torch.unsqueeze(self.name(name1), 2), torch.unsqueeze(self.name(name2), 2)
        desc_emb1, desc_emb2 = torch.unsqueeze(self.description(description1), 2), torch.unsqueeze(self.description(description2), 2)
        text_emb1 = self.text_embedding(torch.cat((name_emb1, desc_emb1), dim=2))
        text_emb2 = self.text_embedding(torch.cat((name_emb2, desc_emb2), dim=2))

        tab_emb1, tab_emb2 = torch.unsqueeze(self.tabular_embedding(attr1), 2), torch.unsqueeze(self.tabular_embedding(attr2), 2)
                
        combined1 = self.embedding(torch.cat((image_emb1, text_emb1, tab_emb1), dim=2))
        combined2 = self.embedding(torch.cat((image_emb2, text_emb2, tab_emb2), dim=2))
                                   
        combined = torch.cat((torch.squeeze(combined1, 2), torch.squeeze(combined2, 2)), 1)
        return self.fc(combined)

In [14]:
# def contrastive_loss(distance, label, margin=1.0):
#     loss = (1 - label) * torch.pow(distance, 2) + \
#            (label) * torch.pow(torch.clamp(margin - distance, min=0.0), 2)
#     return torch.mean(loss)

In [14]:
model = MultimodalModel().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()
scheduler1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)

In [15]:
num_epochs = 10

for epoch in range(num_epochs):
    train_auc, valid_auc = 0.0, 0.0
    train_losses, validation_losses = 0.0, 0.0
    
    n1, n2 = len(train_loader), len(val_loader)
    print(f'Epoch {epoch}')
    model.train()
    
    for i, data in enumerate(tqdm(train_loader)):
        main1, main2, not_main1, not_main2, \
        attr1, attr2, description1, description2, \
        name1, name2, labels = data
        
        optimizer.zero_grad()
        output = model(main1.to(device), main2.to(device),
                not_main1.to(device), not_main2.to(device),
                attr1.to(device), attr2.to(device), description1.to(device),
                description2.to(device), name1.to(device), name2.to(device))

        #distance = nn.functional.pairwise_distance(output1, output2)
        loss = criterion(output, labels.to(device))
        loss.backward()
        optimizer.step()
        
        y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
        preds = (y_pred > 0.5).astype(int)
        precision, recall, _ = precision_recall_curve(labels, preds)
        train_auc += auc(recall, precision)
        train_losses += loss.item()
        
    print(f'Training PR-AUC: {train_auc / n1}. Loss: {train_losses / n1}')
    
    model.eval()
    
    with torch.no_grad():
        for i, data in enumerate(tqdm(val_loader)):
            main1, main2, not_main1, not_main2, \
            attr1, attr2, description1, description2, \
            name1, name2, labels = data
            
            output = model(main1.to(device), main2.to(device),
                not_main1.to(device), not_main2.to(device),
                attr1.to(device), attr2.to(device), description1.to(device),
                description2.to(device), name1.to(device), name2.to(device))
            
            loss = criterion(output, labels.to(device))
            y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
            preds = (y_pred > 0.5).astype(int)
            precision, recall, _ = precision_recall_curve(labels, preds)
            valid_auc += auc(recall, precision)
            validation_losses += loss.item()

    print(f'Validation PR-AUC: {valid_auc / n2}. Loss: {validation_losses / n2}')

    scheduler1.step()
    scheduler2.step()

Epoch 0


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8122291663285738. Loss: 0.5001284942341234


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8383350108600488. Loss: 0.45312686761583254
Epoch 1


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8394318343247825. Loss: 0.4511466701186657


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8428385424944361. Loss: 0.4368260700989257
Epoch 2


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8470553378927832. Loss: 0.4341748803712436


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8457451212061587. Loss: 0.42841810517756224
Epoch 3


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8531292842151008. Loss: 0.4194670418698639


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.852860141901173. Loss: 0.42525227484386857
Epoch 4


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8587729179645414. Loss: 0.4060947069344967


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8500946700083609. Loss: 0.4201602909385962
Epoch 5


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.864624340527097. Loss: 0.3924646028319893


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8512774420373559. Loss: 0.4156061605920224
Epoch 6


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8720175099495444. Loss: 0.37505712212874237


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8618935861776558. Loss: 0.3985903608208814
Epoch 7


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8790522485556045. Loss: 0.3588877206657029


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8647188369933233. Loss: 0.39614918848699054
Epoch 8


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.8840405627434961. Loss: 0.3452025564528483


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8699260380255219. Loss: 0.3942302744733534
Epoch 9


  0%|          | 0/13685 [00:00<?, ?it/s]

Training PR-AUC: 0.888389055406888. Loss: 0.3340813013032531


  0%|          | 0/2737 [00:00<?, ?it/s]

Validation PR-AUC: 0.8682331856774359. Loss: 0.3964038596161535


In [16]:
torch.save(model.state_dict(), 'binary_classification.pth')

In [17]:
test_auc = 0
test_losses = 0

with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader)):
        main1, main2, not_main1, not_main2, \
        attr1, attr2, description1, description2, \
        name1, name2, labels = data
        
        output = model(main1.to(device), main2.to(device),
            not_main1.to(device), not_main2.to(device),
            attr1.to(device), attr2.to(device), description1.to(device),
            description2.to(device), name1.to(device), name2.to(device))
        
        loss = criterion(output, labels.to(device))
        y_pred = np.argmax(F.softmax(output).cpu().detach().numpy(), axis=1)
        preds = (y_pred > 0.5).astype(int)
        precision, recall, _ = precision_recall_curve(labels, preds)
        test_auc += auc(recall, precision)
        test_losses += loss.item()

  0%|          | 0/1825 [00:00<?, ?it/s]

In [18]:
n3 = len(test_loader)
print(f'Test PR-AUC: {test_auc / n3}. Loss: {test_losses / n3}')

Test PR-AUC: 0.8684373859031199. Loss: 0.3944242146080487


## LAMA

In [16]:
temp1 = resnet_df.rename(columns={'variantid': 'variantid1'})#, 'summed_embeddings': 'embedding1'})
temp1 = train_df.merge(temp1, how='inner', on='variantid1')

temp2 = resnet_df.rename(columns={'variantid': 'variantid2'})#, 'summed_embeddings': 'embedding2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [17]:
temp1 = text_df.rename(columns={'variantid': 'variantid1'})#, 'clean_desc': 'clean_desc1', 'name': 'name1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')

temp2 = text_df.rename(columns={'variantid': 'variantid2'})#, 'clean_desc': 'clean_desc2', 'name': 'name2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [11]:
a = np.concatenate(attr_df.vector, axis=0).reshape(2252569, 24)
for i in range(24):
    attr_df[str(i+1)] = a[:, i]

In [12]:
attr_df = attr_df.drop('vector', axis=1)

In [18]:
temp1 = attr_df.rename(columns={'variantid': 'variantid1'})
temp1 = df.merge(temp1, how='inner', on='variantid1')

temp2 = attr_df.rename(columns={'variantid': 'variantid2'})
df = temp1.merge(temp2, how='inner', on='variantid2')

In [19]:
for i in range(24):
    mask_eq =  (df[f'{i+1}_x'] == df[f'{i+1}_y']).astype(int)
    df[f'{i+1}'] = mask_eq
    df[f'{i+1}'][(df[f'{i+1}_x'] == 'Not reported') | (df[f'{i+1}_y'] == 'Not reported')] = 0.5
    
    df = df.drop([f'{i+1}_x', f'{i+1}_y'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{i+1}'][(df[f'{i+1}_x'] == 'Not reported') | (df[f'{i+1}_y'] == 'Not reported')] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{i+1}'][(df[f'{i+1}_x'] == 'Not reported') | (df[f'{i+1}_y'] == 'Not reported')] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{i+1}'][(df[f'{i+1}_x'] == 'Not reported') | (df[f'{i+1}_y'] == 'Not reported')] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

In [23]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TARGET_NAME = 'target'

In [24]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [25]:
embedding1 = torch.Tensor(np.concatenate(df.summed_embeddings_x	.to_numpy(), axis=0).reshape(len(df), 128))
embedding2 = torch.Tensor(np.concatenate(df.summed_embeddings_y.to_numpy(), axis=0).reshape(len(df), 128))

distance = F.pairwise_distance(embedding1, embedding2).numpy()
df['image_distance'] = distance
df = df.drop(['summed_embeddings_x', 'summed_embeddings_y'], axis=1)

In [26]:
name1 = torch.Tensor(np.concatenate(df.name_bert_64_x.to_numpy(), axis=0).reshape(len(df), 64))
name2 = torch.Tensor(np.concatenate(df.name_bert_64_y.to_numpy(), axis=0).reshape(len(df), 64))

distance = F.pairwise_distance(name1, name2).numpy()
df['name_distance'] = distance
df = df.drop(['name_bert_64_x', 'name_bert_64_y'], axis=1)

In [27]:
description1 = torch.Tensor(np.concatenate(df.description_rubert_312_x.to_numpy(), axis=0).reshape(len(df), 312))
description2 = torch.Tensor(np.concatenate(df.description_rubert_312_y.to_numpy(), axis=0).reshape(len(df), 312))

distance = F.pairwise_distance(description1, description2).numpy()
df['description_distance'] = distance
df = df.drop(['description_rubert_312_x', 'description_rubert_312_y'], axis=1)

In [29]:
# df = pd.read_parquet('distances2.parquet')

In [31]:
df = df.drop(['variantid1', 'variantid2'], axis=1)

In [32]:
train_data, test_data = train_test_split(
    df,
    test_size=TEST_SIZE,
    stratify=df['target'],
    random_state=RANDOM_STATE
)

In [33]:
X_train, y_train = train_data.drop('target', axis=1), train_data['target']
X_test, y_test = test_data.drop('target', axis=1), test_data['target']

In [34]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [35]:
cols = train_data.columns.values[1:]

In [36]:
train_data[cols] = X_train
test_data[cols] = X_test

In [37]:
roles = {'target': 'target',
        #'drop': ['variantid1', 'variantid2']
        }

In [38]:
task = Task('binary', loss='logloss', metric='auc')

In [42]:
automl = TabularAutoML(
    task=task,
    # timeout = TIMEOUT,
    gpu_ids='0',
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

In [43]:
%%time
out_of_fold_predictions = automl.fit_predict(train_data, roles=roles, verbose=1)
predictions = automl.predict(test_data)
not_nan = np.any(~np.isnan(out_of_fold_predictions.data), axis=1)

print('Check scores:')
print('OOF score: {}'.format(roc_auc_score(train_labels, train_preds)))
print('TEST score: {}'.format(roc_auc_score(test_labels, test_preds)))

[16:27:23] Stdout logging level is INFO.
[16:27:23] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[16:27:23] Task: binary

[16:27:23] Start automl preset with listed constraints:
[16:27:23] - time: 3600.00 seconds
[16:27:23] - CPU: 4 cores
[16:27:23] - memory: 16 GB

[16:27:23] [1mTrain data shape: (934163, 28)[0m

[16:27:30] Layer [1m1[0m train process start. Time left 3593.05 secs
[16:27:31] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[16:28:23] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7887975745803916[0m
[16:28:23] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[16:28:23] Time left 3539.63 secs

[16:29:26] [1mSelector_LightGBM[0m fitting and predicting completed
[16:29:26] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[16:34:49] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.917510978991514[0m
[16:34:49] [1mLvl_0_Pipe_1_Mod_0_LightGBM

In [48]:
train_labels, train_preds = train_data[roles['target']].values[not_nan], out_of_fold_predictions.data[not_nan][:, 0]
test_labels, test_preds = test_data[roles['target']].values, predictions.data[:, 0]

In [44]:
joblib.dump(automl, 'model_best.pkl')
#automl=joblib.load(‘model.pkl’)

['model_best.pkl']

In [58]:
automl2 = joblib.load('model_best.pkl')

In [50]:
predictions = automl2.predict(test_data)

In [47]:
def pr_auc(labels, y_pred):
    fpr, tpr, thresholds = roc_curve(labels, y_pred)
    return auc(fpr, tpr)

In [56]:
pr_auc(test_labels, predictions.data[:, 0])

0.9134703502080135