In [152]:
import sys
import time

import optuna
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from fastprogress import master_bar, progress_bar

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

%matplotlib inline
import matplotlib.pyplot as plt

sys.path.append('../src')
from utils import DataHandler

In [153]:
dh = DataHandler()

In [154]:
val_df = pd.read_feather('../data/team/preds_val/X_val_wo_lec.feather')
y_true = val_df['answered_correctly'].values

usecols = ['row_id', 'timestamp', 'user_id', 'content_id', 'task_container_id', 'answered_correctly', 'oof_cat', 'oof_trans_new']

val_df['timestamp'] = val_df['timestamp'] // 60_000

In [160]:
te_dict = dh.load('../data/processed/te_content_id_by_answered_correctly.pkl')
te_df = pd.DataFrame.from_dict(te_dict).sort_index().iloc[:13523]
q2te = np.mean(te_df.values, axis=1)
q2te = {id_: v for id_, v in enumerate(q2te)}

val_df['te'] = val_df['content_id'].map(q2te)
usecols += ['te']

In [161]:
oof_cat_df = pd.read_csv('../data/team/preds_val/preds_val_0166e411e51e48a8b6639b4da2510d61_0.8032595869324429_1209.csv')
oof_trans_df = pd.read_csv('../data/team/preds_val/preds_val_transformer_020_8_20210104221115_0.796492531156136.csv')
oof_trans_seq50_df = pd.read_csv('../data/team/preds_val/preds_val_based_seq50_step75.csv')

oof_trans_new = np.load('./transformer_added_content_cnt.npy')
oof_trans_new = oof_trans_new[np.where(oof_trans_new > 0)]

In [162]:
oof_cat = oof_cat_df['preds'].values
oof_trans = oof_trans_df['preds'].values
oof_trans50 = oof_trans_seq50_df['preds'].values

In [163]:
val_df['oof_cat'] = oof_cat
val_df['oof_trans'] = oof_trans
val_df['oof_trans_new'] = oof_trans_new

In [164]:
val_df['diff_cat'] = np.abs(val_df['answered_correctly'] - val_df['oof_cat'])
val_df['diff_trans'] = np.abs(val_df['answered_correctly'] - val_df['oof_trans'])
val_df['diff_trans_new'] = np.abs(val_df['answered_correctly'] - val_df['oof_trans_new'])

In [165]:
val_df['diff_model'] = np.abs(val_df['diff_cat'] - val_df['diff_trans'])
val_df['diff_model_new'] = np.abs(val_df['diff_cat'] - val_df['diff_trans_new'])
val_df['diff_mean'] = (val_df['diff_cat'] + val_df['diff_trans']) / 2

In [166]:
val_df_sorted = val_df.sort_values(by='diff_mean', ascending=False)

In [167]:
val_df_sorted.head(10)[usecols]

Unnamed: 0,row_id,timestamp,user_id,content_id,task_container_id,answered_correctly,oof_cat,oof_trans_new,te
1653241,66896386,122135,1421546709,3089,1049,0,0.997474,0.996973,0.970974
942590,37630361,68132,803513901,12208,652,0,0.996317,0.993789,0.938363
2013351,82643503,71968,1759786869,10439,2653,0,0.997865,0.993809,0.981983
463971,19243433,12936,413660787,2722,172,0,0.995004,0.997171,0.973463
1496281,60897866,1141668,1292330761,6918,3467,0,0.994616,0.995098,0.898105
1579427,63830932,5635,1357720090,729,97,0,0.997033,0.984726,0.724689
689579,27751599,71270,592041780,10695,2264,0,0.994747,0.995115,0.974119
2154127,88308724,40439,1878706735,7704,1144,0,0.995605,0.992692,0.349255
1236870,50211733,62763,1063761126,70,3640,0,0.996081,0.989495,0.967421
1473757,60095742,344860,1275783392,10011,820,0,0.992914,0.993916,0.930236


# Post Process

In [168]:
val_df['user_task_id'] = val_df['user_id'].astype(str) + '__' + val_df['task_container_id'].astype(str)
val_df['order_task'] = val_df.groupby(['user_id', 'task_container_id']).cumcount()

In [169]:
a = dict(val_df.groupby('user_task_id')['oof_cat'].mean())

In [170]:
val_df['mean_oof_cat_each_user_task_id'] = val_df['user_task_id'].map(a)

In [179]:
val_df[usecols + ['mean_oof_cat_each_user_task_id']].iloc[463971 - 5:].head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,task_container_id,answered_correctly,oof_cat,oof_trans_new,te,mean_oof_cat_each_user_task_id
463966,19243428,12933,413660787,11614,171,1,0.82217,0.825693,0.682441,0.869411
463967,19243429,12933,413660787,11613,171,0,0.819482,0.859057,0.662753,0.869411
463968,19243430,12933,413660787,11612,171,1,0.96658,0.955233,0.918738,0.869411
463969,19243431,12936,413660787,2723,172,1,0.976472,0.987551,0.913437,0.877516
463970,19243432,12936,413660787,2724,172,1,0.661072,0.658414,0.369696,0.877516
463971,19243433,12936,413660787,2722,172,0,0.995004,0.997171,0.973463,0.877516
463972,19243435,12949,413660787,11864,174,1,0.987608,0.973677,0.923118,0.892741
463973,19243436,12949,413660787,11866,174,1,0.959914,0.936176,0.834086,0.892741
463974,19243437,12949,413660787,11863,174,0,0.78169,0.735426,0.628808,0.892741
463975,19243438,12949,413660787,11865,174,1,0.967743,0.957596,0.861261,0.892741


In [206]:
val_df.iloc[392314 - 5:][usecols].head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,task_container_id,answered_correctly,oof_cat,oof_trans_new,te
392309,16048340,686653,344198647,11582,93,1,0.555679,0.616341,0.725369
392310,16048341,686653,344198647,11584,93,1,0.420748,0.353808,0.485445
392311,16048342,686653,344198647,11581,93,0,0.258031,0.305504,0.379803
392312,16048343,686653,344198647,11580,93,1,0.788737,0.79273,0.798418
392313,16048344,686663,344198647,11867,94,0,0.483504,0.523058,0.605359
392314,16048345,686663,344198647,11864,94,1,0.941333,0.969952,0.923118
392315,16048346,686663,344198647,11865,94,1,0.880998,0.903297,0.861261
392316,16048347,686663,344198647,11866,94,1,0.860692,0.852103,0.834086
392317,16048348,686663,344198647,11863,94,1,0.574675,0.609735,0.628808
392318,16048349,686666,344198647,8712,95,0,0.174943,0.185686,0.288973


In [180]:
val_df[val_df['content_id'] == 11864]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,oof_trans_new,diff_cat,diff_trans,diff_trans_new,diff_model,diff_model_new,diff_mean,user_task_id,order_task,mean_oof_cat_each_user_task_id
10692,577623,5078,11637089,11864,0,260,0,1,65200,1,...,0.96132,0.036291,0.034866,0.03868,0.001425,0.002388,0.035579,11637089__260,3,0.757345
19978,970449,97824,19981595,11864,0,144,3,0,800,1,...,0.375546,0.665208,0.497495,0.375546,0.167713,0.289662,0.581352,19981595__144,2,0.417729
85584,3786524,271333,80944123,11864,0,122,3,0,1000,1,...,0.374559,0.406948,0.282199,0.374559,0.124749,0.032388,0.344573,80944123__122,3,0.351374
92945,4005471,24680,85604332,11864,0,117,1,0,1250,1,...,0.247785,0.186568,0.047036,0.247785,0.139532,0.061217,0.116802,85604332__117,1,0.324218
133036,5508914,9408,117734396,11864,0,176,0,1,18000,1,...,0.971735,0.057115,0.018294,0.028265,0.03882,0.02885,0.037705,117734396__176,0,0.770424
216888,8587517,278922,186327280,11864,0,299,0,1,32000,1,...,0.986856,0.025177,0.011395,0.013144,0.013782,0.012032,0.018286,186327280__299,2,0.826991
319413,12947987,669512,280270199,11864,0,251,0,1,80000,1,...,0.951813,0.060106,0.028898,0.048187,0.031208,0.011919,0.044502,280270199__251,3,0.72871
392314,16048345,686663,344198647,11864,0,94,0,1,83600,1,...,0.969952,0.058667,0.028773,0.030048,0.029894,0.028619,0.04372,344198647__94,1,0.748241
451047,18711457,19837,402675290,11864,0,851,0,1,62000,1,...,0.992637,0.008507,0.004895,0.007363,0.003612,0.001144,0.006701,402675290__851,1,0.902557
457725,18978206,61337,407772824,11864,0,463,0,1,84800,1,...,0.882775,0.150814,0.085892,0.117225,0.064922,0.03359,0.118353,407772824__463,4,0.631415


In [139]:
roc_auc_score(y_true, oof_cat)

0.8032595869324429

In [175]:
roc_auc_score(y_true, (val_df['mean_oof_cat_each_user_task_id'].values * 0.1 + oof_cat * 0.8 + val_df['te'] * 0.1))

0.8029228219757284

# Ensemble

In [42]:
y_preds = oof_cat * 0.5 + oof_trans * 0.5

roc_auc_score(y_true, y_preds)

0.8059445715931265

In [43]:
y_preds = oof_cat * 0.5 + oof_trans_new * 0.5

roc_auc_score(y_true, y_preds)

0.8058462447415765

In [6]:
# best_score = 0
# best_weight = 0
# iter_num = 100

# for i in tqdm(range(iter_num)):
#     p1 = i / iter_num
#     p2 = 1 - p1
#     y_preds = oof_cat * p1 + oof_trans * p2

#     score = roc_auc_score(y_true, y_preds)
#     if score > best_score:
#         best_score = score
#         best_weight = p1
        
# print(best_score, best_weight)   # s=0.8063879556253732 w=0.63

In [44]:
best_score = 0
best_weight = 0
iter_num = 100

for i in tqdm(range(iter_num)):
    p1 = i / iter_num
    p2 = 1 - p1
    y_preds = oof_cat * p1 + oof_trans_new * p2

    score = roc_auc_score(y_true, y_preds)
    if score > best_score:
        best_score = score
        best_weight = p1
        
print(best_score, best_weight)   # s=0.8063879556253732 w=0.63

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


0.8063316721131248 0.64


In [45]:
best_score = 0
best_weight1 = 0
best_weight2 = 0
iter_num = 100

oof_list = [oof_cat, oof_trans, oof_trans_new]

def objective(trial):
    p_list = [0 for i in range(len(oof_list))]
    for i in range(len(oof_list) - 1):
        p_list[i] = trial.suggest_discrete_uniform(f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
    p_list[-1] = round(1 - sum(p_list[:-1]), 2)

    y_pred = np.zeros(len(y_true))
    for i in range(len(oof_list)):
        y_pred += oof_list[i] * p_list[i]

    return roc_auc_score(y_true, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60)

[32m[I 2021-01-07 04:53:39,044][0m A new study created in memory with name: no-name-e9f0ecbf-42ce-4666-a42d-2640c18c305f[0m
[32m[I 2021-01-07 04:53:39,808][0m Trial 0 finished with value: 0.8011625412470713 and parameters: {'p0': 0.13, 'p1': 0.58}. Best is trial 0 with value: 0.8011625412470713.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:53:40,566][0m Trial 1 finished with value: 0.8041055442587522 and parameters: {'p0': 0.33, 'p1': 0.04}. Best is trial 1 with value: 0.8041055442587522.[0m
[32m[I 2021-01-07 04:53:41,320][0m Trial 2 finished with value: 0.8032792537276676 and parameters: {'p0': 0.24, 'p1': 0.55}. Best is trial 1 with value: 0.8041055442587522.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:53:42,074][0m Trial 3 finished with value: 0.8058639020568567 and parameters: {'p0': 0.79, 'p1': 0.12}. Best is trial 3 with value: 0.8058639020568567.[0m
[32m[I 2021-01-07 04:53:42,826][0m Trial 4 finished with va

[32m[I 2021-01-07 04:53:57,220][0m Trial 23 finished with value: 0.8045846443517194 and parameters: {'p0': 0.91, 'p1': 0.08}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:53:57,979][0m Trial 24 finished with value: 0.8064824442997625 and parameters: {'p0': 0.54, 'p1': 0.18}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:53:58,738][0m Trial 25 finished with value: 0.8063942873926313 and parameters: {'p0': 0.7000000000000001, 'p1': 0.22}. Best is trial 14 with value: 0.806612283711888.[0m
[32m[I 2021-01-07 04:53:59,496][0m Trial 26 finished with value: 0.8065963620809975 and parameters: {'p0': 0.6, 'p1': 0.16}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:54:00,255][0m Trial 27 finished with value: 0.8052688461423416 and parameters: {'p0': 0.85, 'p1': 0.

  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:54:19,222][0m Trial 52 finished with value: 0.8065643377095265 and parameters: {'p0': 0.56, 'p1': 0.24}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:54:19,981][0m Trial 53 finished with value: 0.8065502127740923 and parameters: {'p0': 0.66, 'p1': 0.2}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:54:20,740][0m Trial 54 finished with value: 0.8065671326349672 and parameters: {'p0': 0.59, 'p1': 0.14}. Best is trial 14 with value: 0.806612283711888.[0m
  low=low, old_high=old_high, high=high, step=q
[32m[I 2021-01-07 04:54:21,500][0m Trial 55 finished with value: 0.8063834714901991 and parameters: {'p0': 0.54, 'p1': 0.11}. Best is trial 14 with value: 0.806612283711888.[0m
[32m[I 2021-01-07 04:54:22,258][0m Trial 56 finished with value: 0.8061090345564579 and

In [46]:
best_params = list(study.best_params.values())
best_weight = best_params + [round(1 - sum(best_params), 2)]

In [47]:
print(f'score: {study.best_value}')   # score: 0.8064009245782751
print(f'weight: {best_weight}')   # weight: [0.63, 0.32, 0.05]

score: 0.806612283711888
weight: [0.62, 0.19, 0.19]


# Stacking

In [10]:
oof_concat_array = np.vstack([oof_cat, oof_trans]).T

val_rate = 0.8
val_row_num = int(len(val_df) * val_rate)
trn_idx = val_df.iloc[:val_row_num].index.values
val_idx = val_df.iloc[val_row_num:].index.values

trn_x = oof_concat_array[trn_idx, :]
val_x = oof_concat_array[val_idx, :]

trn_y = y_true[trn_idx].reshape(-1, 1)
val_y = y_true[val_idx].reshape(-1, 1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
class CustomDataset(Dataset):
    def __init__(self, array, target=None):
        super(CustomDataset, self).__init__()
        self.array = array
        
        self.is_train = False
        self.target = target
        if target is not None:
            self.is_train = True
        
    def __len__(self):
        return len(self.array)
    
    def __getitem__(self, idx):
        feat = self.array[idx, :]
        
        if self.is_train:
            target = self.target[idx]
            return torch.FloatTensor(feat), torch.LongTensor(target)
        else:
            return torch.FloatTensor(feat)

In [12]:
class Mlp(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 256)
        self.drop1 = nn.Dropout(0.1)
        
        self.fc2 = nn.Linear(256, 256)
        self.drop2 = nn.Dropout(0.1)
        
        self.fc3 = nn.Linear(256, 1)
        
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.drop1(x)
        
        x = self.fc2(x)
        x = self.relu(x)
        x = self.drop2(x)
        
        x = self.fc3(x)
        return x

In [13]:
def train(trn_x, val_x, trn_y, val_y, multi_gpu=False):
    model = Mlp()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

    train_dataset = CustomDataset(trn_x, trn_y)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32, num_workers=4)

    valid_dataset = CustomDataset(val_x, val_y)
    valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4)

    best_epoch = -1
    best_val_score = -np.inf
    mb = master_bar(range(30))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []

    for epoch in mb:
        start_time = time.time()

        model, avg_loss = _train_epoch(model, train_loader, criterion, optimizer, mb)
        valid_preds, avg_val_loss = _val_epoch(model, valid_loader, criterion)

        val_score = roc_auc_score(val_y, valid_preds)

        train_loss_list.append(avg_loss)
        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        elapsed = time.time() - start_time
        mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.6f}  avg_val_loss: {avg_val_loss:.6f} val_score: {val_score:.6f} time: {elapsed:.0f}s')

        if val_score > best_val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_valid_preds = valid_preds
            if multi_gpu:
                best_model = model.module.state_dict()
            else:
                best_model = model.state_dict()

    oof[val_x.index, :] = best_valid_preds
    cv += best_val_score * 1   # self.fold_df[col].max()

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    print('\n===================================\n\n')


def _train_epoch(model, train_loader, criterion, optimizer, mb):
    model.train()
    avg_loss = 0.

    for feats, targets in progress_bar(train_loader, parent=mb):
        if type(feats) == dict:
            for k, v in feats.items():
                feats[k] = v.to(device)
        else:
            feats = feats.to(device)
        targets = targets.to(device)

        preds = model(feats)

        loss = criterion(preds, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    del feats, targets; gc.collect()
    return model, avg_loss

def _val_epoch(model, valid_loader, criterion, n_classes):
    model.eval()
    valid_preds = np.zeros((len(valid_loader.dataset), n_classes))

    avg_val_loss = 0.
    valid_batch_size = valid_loader.batch_size

    with torch.no_grad():
        for i, (feats, targets) in enumerate(valid_loader):
            if type(feats) == dict:
                for k, v in feats.items():
                    feats[k] = v.to(device)
            else:
                feats = feats.to(device)
            targets = targets.to(device)

            preds = model(feats)

            loss = criterion(preds, targets)

            valid_preds[i * valid_batch_size: (i + 1) * valid_batch_size, :] = preds.sigmoid().cpu().detach().numpy().reshape(-1, 1)
            avg_val_loss += loss.item() / len(valid_loader)

    return valid_preds, avg_val_loss

In [None]:
train(trn_x, val_x, trn_y, val_y)

█

In [34]:
a = np.random.randint(1, 5, (120))
a

array([4, 1, 1, 1, 4, 1, 2, 4, 3, 4, 3, 1, 2, 3, 1, 1, 3, 1, 2, 3, 2, 3,
       2, 3, 2, 2, 3, 2, 3, 1, 2, 3, 3, 1, 2, 4, 2, 3, 2, 1, 2, 2, 4, 4,
       1, 4, 3, 4, 3, 3, 2, 2, 3, 3, 1, 1, 2, 1, 3, 3, 2, 4, 2, 1, 1, 4,
       2, 1, 3, 2, 4, 4, 1, 2, 4, 4, 3, 2, 1, 4, 2, 4, 4, 3, 1, 2, 1, 4,
       3, 1, 3, 2, 4, 3, 2, 1, 3, 1, 1, 2, 2, 1, 4, 1, 1, 3, 1, 1, 1, 1,
       3, 2, 2, 1, 3, 1, 2, 4, 1, 1])

In [35]:
%%time

counter = np.zeros(len(a))

b = []
for i, v in enumerate(a):
    if v in b:
        counter[i] = 1
    else:
        counter[i] = 0
        b.append(v)

CPU times: user 33 µs, sys: 20 µs, total: 53 µs
Wall time: 55.8 µs


In [36]:
counter

array([0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])

In [37]:
b

[4, 1, 2, 3]