In [1]:
# !nvidia-smi

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
from config import model_config
# from model import VedioRecommender
from dataset import ViewDataSet
import utils

In [4]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from adamp import AdamP

In [5]:
import torch
from torch import nn

In [6]:
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json

### 1) Load Model

In [7]:
from model import VedioRecommenderModel

### 2) Read Data 

In [8]:
df_agg_dataset = utils.open_object("./artifacts/df_agg_dataset.pkl")

In [9]:
from sklearn.utils import shuffle
df_agg_dataset = shuffle(df_agg_dataset,random_state = 33).head(10000)

In [10]:
df_agg_dataset = df_agg_dataset.rename(columns = {'label':'labels'})

In [11]:
df_train,df_test = train_test_split(df_agg_dataset,test_size=0.3,random_state=33,shuffle = False)

train_dataset = ViewDataSet(df_train)
test_dataset = ViewDataSet(df_test)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=True)

In [13]:
for inputs in train_loader:
    break

In [14]:
# inputs

### 3) Training

In [15]:
model = VedioRecommenderModel()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing VedioRecommender: ['predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias']
- This IS expected if you are initializing VedioRecommender from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VedioRecommender from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VedioRecommender were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['feature_embedding_dict.product_lang_name.weight', 'classifier.weight', 'feature_embedding_dict.video_strea

In [28]:
model

VedioRecommender(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
   

In [16]:
# with torch.no_grad():
#     outputs = model(inputs)

In [17]:
# outputs

In [18]:
optimizer = AdamP(model.parameters(),lr=2e-5,
                  betas=(0.9, 0.999), weight_decay=1e-1)

In [19]:
class train_config:
    epoches = 5
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_save_dir = "./artifacts/models"
    train_batch_size = 30
    val_batch_size = int(train_batch_size*1.5)
    eval_steps = (len(train_dataset)//train_batch_size)//2

In [20]:
model = model.to(train_config.device)

In [21]:
train_loader = DataLoader(train_dataset, batch_size=train_config.train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=train_config.val_batch_size, shuffle=True)

In [22]:
def evaluate_full_metrics(model,dataset_loader):

    model.eval()

    loss_list = []
    labels_list = []
    pred_list = []
    prob_list = []

    pbar = tqdm(total = len(dataset_loader),desc = "Model Evaluating",position=0, leave=True)


    for inputs in dataset_loader:

        with torch.no_grad():
            
            inputs = utils.to_device(inputs,train_config.device)
            labels = inputs['labels']
            outputs = model(inputs)
            
            loss_list.append(outputs.loss.item())
            
            labels  = labels.detach().cpu().numpy()
            labels_list.extend(labels)

            probs = torch.sigmoid(outputs.logits)
            prob = probs[:,1]
            prob = prob.detach().cpu().numpy()
            prob_list.extend(prob)
            
            pred = torch.argmax(probs,axis = 1)
            pred = pred.detach().cpu().numpy()
            pred_list.extend(pred)
            
            pbar.update(1)

    pbar.close()
    
    auc = metrics.roc_auc_score(labels_list, prob_list)
    recall, precision, thres = metrics.precision_recall_curve(labels_list, prob_list)
    
    
    f1 = recall*precision*2 / (recall + precision)
    f1_temp = f1
    f1 = np.nan_to_num(f1,nan = -1)

    arg = f1.argmax()
    
    best_thres = thres[arg]
    best_f1 = f1[arg]
    best_recall = recall[arg]
    best_precision = precision[arg]
    
    pred_list = [1 if prob>=best_thres else 0 for prob in prob_list]
    accuracy = metrics.accuracy_score(labels_list,pred_list)
    
    avg_loss = np.mean(loss_list)
    
    result = {"threshold":best_thres,
              "accuracy":accuracy,
              "recall":best_recall,
              "precision":best_precision,
              "f1":best_f1,'auc':auc,
              'eval_loss':avg_loss} 

    return result,prob_list

In [23]:
evaluate_result,prob_list = evaluate_full_metrics(model,test_loader)

Model Evaluating: 100%|██████████| 67/67 [00:37<00:00,  1.78it/s]
  f1 = recall*precision*2 / (recall + precision)


#### before training

In [24]:
display(pd.DataFrame([evaluate_result]))

Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss
0,0.636642,0.481,0.224479,0.863727,0.356346,0.627266,0.946061


In [25]:
def save_model(model, model_save_dir,step,model_metrics):
    model_save_dir = os.path.join(model_save_dir,f"checkpoint-{step}")
    model_name = "pytorch_model.bin"
    train_state_name = "training_state.json"
    os.makedirs(model_save_dir,exist_ok=True)
    
    model_path = os.path.join(model_save_dir,model_name)
    train_state_path = os.path.join(model_save_dir,train_state_name)

    torch.save(model,model_path)
    
    if model_metrics is not None:
        with open(train_state_path,mode = 'w',encoding = 'utf-8-sig') as f:
            model_metrics = {str(k):str(v) for k,v in model_metrics.items()} 
            json.dump(model_metrics,f,indent=4)

In [26]:
total_pbar = tqdm(total = len(train_loader)*train_config.epoches,desc = "Model Training",position=0, leave=True)

total_batch = 0 
for epoch in range(train_config.epoches):
    print("*"*50 + f"epoch: {epoch + 1}" + "*"*50)
    
    train_losses = []
    
    for inputs in train_loader:
        model = model.train()
        inputs = utils.to_device(inputs,train_config.device)
        labels = inputs['labels']
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = outputs.loss 
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
        if (total_batch+1) % train_config.eval_steps ==0:
            
            model_metrics,_ = evaluate_full_metrics(model,test_loader)
            train_loss = np.mean(train_losses)
            model_metrics['train_loss'] = train_loss
            model_metrics["steps"] = total_batch+1
        
            save_model(model,train_config.model_save_dir,total_batch+1,model_metrics)
            df_metrics_temp = pd.DataFrame([model_metrics])
            display(df_metrics_temp)
            
            model = model.train()
            
        total_batch +=1
        total_pbar.update(1)
        
total_pbar.close()

Model Training:   0%|          | 0/1170 [00:00<?, ?it/s]

**************************************************epoch: 1**************************************************


Model Evaluating: 100%|██████████| 67/67 [00:36<00:00,  1.86it/s]]


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.492642,0.975,0.962882,0.883768,0.92163,0.978599,0.106742,0.166805,116


Model Evaluating: 100%|██████████| 67/67 [00:36<00:00,  1.82it/s]]  


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.478402,0.975667,0.975446,0.875752,0.922914,0.981337,0.106907,0.146027,232


Model Training:  20%|██        | 234/1170 [04:33<1:36:37,  6.19s/it]

**************************************************epoch: 2**************************************************


Model Evaluating: 100%|██████████| 67/67 [00:36<00:00,  1.81it/s]]  


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.340879,0.976333,0.94958,0.905812,0.927179,0.97741,0.099843,0.100453,348


Model Evaluating: 100%|██████████| 67/67 [00:36<00:00,  1.84it/s]]  


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.553251,0.976667,0.965293,0.891784,0.927083,0.980742,0.088164,0.106881,464


Model Training:  40%|████      | 468/1170 [09:08<39:14,  3.35s/it]  

**************************************************epoch: 3**************************************************


Model Evaluating: 100%|██████████| 67/67 [00:37<00:00,  1.81it/s]]


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.678457,0.977333,0.965443,0.895792,0.929314,0.983929,0.093564,0.091943,580


Model Evaluating: 100%|██████████| 67/67 [00:37<00:00,  1.79it/s]]  


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.720445,0.976333,0.981982,0.873747,0.924708,0.985455,0.097917,0.091623,696


Model Training:  60%|██████    | 702/1170 [13:44<15:53,  2.04s/it]  

**************************************************epoch: 4**************************************************


Model Evaluating: 100%|██████████| 67/67 [00:36<00:00,  1.83it/s]]


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.692164,0.977667,0.986486,0.877756,0.92895,0.983156,0.085977,0.067864,812


Model Training:  71%|███████   | 829/1170 [16:10<05:02,  1.13it/s]  

KeyboardInterrupt: 

In [27]:
model_metrics

{'threshold': 0.692164,
 'accuracy': 0.9776666666666667,
 'recall': 0.9864864864864865,
 'precision': 0.8777555110220441,
 'f1': 0.928950159066808,
 'auc': 0.9831562365033947,
 'eval_loss': 0.08597704187146764,
 'train_loss': 0.06786412145857784,
 'steps': 812}