In [1]:
# !nvidia-smi

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
from config import model_config
from model import VedioRecommender
from dataset import ViewDataSet
import utils

In [4]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from adamp import AdamP

In [5]:
import torch
from torch import nn

In [6]:
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json

### 1) Load Model

In [7]:
model = VedioRecommender(model_config)

### 2) Read Data 

In [8]:
df_agg_dataset = utils.open_object("./artifacts/df_agg_dataset.pkl")

In [9]:
df_agg_dataset.head()

Unnamed: 0,episode_duration,device_first_visit_age,user_age,video_start_hour,video_end_hour,platform_name,user_type,subscription_source,plan_platform,resolution,...,video_streaming_mode,cp_name,product_cat_name,product_lang_name,product_series_cms_id,next_sri_des,hist_sri_des,user_id,sequence_id,label
0,0.491445,0.15625,0.991258,0.536232,0.557971,"[3, 3, 3, 3, 3]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[31, 31, 31, 31, 31]","[0, 0, 0, 0, 0]",...,"[1, 1, 1, 1, 2]","[12, 12, 12, 12, 38, 36]","[16, 16, 16, 16, 17, 39]","[1, 1, 1, 1, 1, 1]","[508, 508, 508, 508, 1094, 924]",Ashes of Love: Four thousand years ago when Ji...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,1,0
1,0.491445,0.15625,0.991258,0.536232,0.557971,"[3, 3, 3, 3, 3]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[31, 31, 31, 31, 31]","[0, 0, 0, 0, 0]",...,"[1, 1, 1, 1, 2]","[12, 12, 12, 12, 38, 28]","[16, 16, 16, 16, 17, 29]","[1, 1, 1, 1, 1, 0]","[508, 508, 508, 508, 1094, 801]",Gonjiam Haunted Asylum: 在网路广播节目剧组的召集下，數名青年一同進入...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,4,0
2,0.491445,0.15625,0.991258,0.536232,0.557971,"[3, 3, 3, 3, 3]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[31, 31, 31, 31, 31]","[0, 0, 0, 0, 0]",...,"[1, 1, 1, 1, 2]","[12, 12, 12, 12, 38, 34]","[16, 16, 16, 16, 17, 38]","[1, 1, 1, 1, 1, 1]","[508, 508, 508, 508, 1094, 1424]",Look For A Star (SVOD): It’s hardly a coincide...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,5,0
3,0.459936,0.15625,0.991258,0.550725,0.572464,"[3, 3, 3, 3, 3]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[31, 31, 31, 31, 31]","[0, 0, 0, 0, 0]",...,"[1, 1, 1, 2, 2]","[12, 12, 12, 38, 38, 56]","[16, 16, 16, 17, 17, 17]","[1, 1, 1, 1, 1, 1]","[508, 508, 508, 1094, 169, 660]",The Penthouse Special: The Penthouse is a plac...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,8,0
4,0.430993,0.15625,0.991258,0.565217,0.57971,"[3, 3, 3, 3, 3]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[31, 31, 31, 31, 31]","[0, 0, 0, 0, 0]",...,"[1, 1, 2, 2, 2]","[12, 12, 38, 38, 16, 7]","[16, 16, 17, 17, 17, 35]","[1, 1, 1, 1, 1, 1]","[508, 508, 1094, 169, 984, 1407]","My Forever Sunshine: The story about Paeng, a ...",sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,13,0


In [10]:
sequence_5_features = ['platform_name', 'user_type',
       'subscription_source', 'plan_platform', 'resolution', 'subtitle',
       'screen_mode', 'device_network_mode', 'video_streaming_mode']

sequence_6_features = ['cp_name','product_cat_name', 'product_lang_name', 'product_series_cms_id']

In [11]:
for col in sequence_5_features:
    df_agg_dataset = df_agg_dataset[df_agg_dataset[col].apply(lambda x:len(x)==5)] 

for col in sequence_6_features:
    df_agg_dataset = df_agg_dataset[df_agg_dataset[col].apply(lambda x:len(x)==6)] 

In [12]:
df_train,df_test = train_test_split(df_agg_dataset,test_size=0.3,random_state=33,shuffle = True )

In [13]:
train_dataset = ViewDataSet(df_train)
test_dataset = ViewDataSet(df_test)

In [14]:
batch_size = 32

In [15]:
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=True)

In [16]:
for inputs in train_loader:
    break

### 3) Training

In [17]:
scores = model(inputs)

In [18]:
BCELoss = nn.BCELoss()

In [19]:
labels = inputs['label']

In [20]:
BCELoss(scores,labels.view(-1,1))

tensor(0.7262, grad_fn=<BinaryCrossEntropyBackward0>)

In [22]:
optimizer = AdamP(model.parameters(),lr=2e-4,
                  betas=(0.9, 0.999), weight_decay=1e-1)

In [23]:
class train_config:
    epoches = 5
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_save_dir = "./artifacts/models"
    train_batch_size = 12
    val_batch_size = int(train_batch_size*1.5)
    eval_steps = (len(train_dataset)//train_batch_size)//2

In [24]:
model = model.to(train_config.device)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [25]:
def evaluate_full_metrics(model,dataset_loader):

    model.eval()

    loss_list = []
    labels_list = []
    pred_list = []
    prob_list = []

    pbar = tqdm(total = len(dataset_loader),desc = "Model Evaluating",position=0, leave=True)


    for inputs in dataset_loader:

        with torch.no_grad():
            
            inputs = utils.to_device(inputs,train_config.device)
            labels = inputs['label'].view(-1,1)
            
            probs = model(inputs)
            
            loss = BCELoss(probs,labels).item()
            loss_list.append(loss)
            
            labels  = labels.detach().cpu().numpy()
            labels_list.extend(labels.flatten())

            probs = probs.detach().cpu().numpy()
            prob_list.extend(probs.flatten())
            pbar.update(1)

    pbar.close()
    
    auc = metrics.roc_auc_score(labels_list, prob_list)
    recall, precision, thres = metrics.precision_recall_curve(labels_list, prob_list)
    
    
    f1 = recall*precision*2 / (recall + precision)
    f1_temp = f1
    f1 = np.nan_to_num(f1,nan = -1)

    arg = f1.argmax()
    
    best_thres = thres[arg]
    best_f1 = f1[arg]
    best_recall = recall[arg]
    best_precision = precision[arg]
    
    pred_list = [1 if prob>=best_thres else 0 for prob in prob_list]
    accuracy = metrics.accuracy_score(labels_list,pred_list)
    
    avg_loss = np.mean(loss_list)
    
    result = {"threshold":best_thres,
              "accuracy":accuracy,
              "recall":best_recall,
              "precision":best_precision,
              "f1":best_f1,'auc':auc,
              'eval_loss':avg_loss} 

    return result,prob_list

In [26]:
train_config.device


'cuda'

In [27]:
result,prob_list = evaluate_full_metrics(model,test_loader)

Model Evaluating:   0%|          | 0/4870 [00:00<?, ?it/s]

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [28]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat Aug 20 12:02:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:21:00.0 Off |                  N/A |
| 93%   75C    P2   320W / 350W |  23603MiB / 24576MiB |     94%      Default |
|                               |            

In [27]:
result

{'threshold': 0.53379494,
 'accuracy': 0.732,
 'recall': 0.3225806451612903,
 'precision': 0.6329113924050633,
 'f1': 0.4273504273504274,
 'auc': 0.718361947142127,
 'eval_loss': 0.7405883215722584}

In [28]:
def save_model(model, model_save_dir,step,model_metrics):
    model_save_dir = os.path.join(model_save_dir,f"checkpoint-{step}")
    model_name = "pytorch_model.bin"
    train_state_name = "training_state.json"
    os.makedirs(model_save_dir,exist_ok=True)
    
    model_path = os.path.join(model_save_dir,model_name)
    train_state_path = os.path.join(model_save_dir,train_state_name)

    torch.save(model,model_path)
    
    if model_metrics is not None:
        with open(train_state_path,mode = 'w',encoding = 'utf-8-sig') as f:
            model_metrics = {str(k):str(v) for k,v in model_metrics.items()} 
            json.dump(model_metrics,f,indent=4)

In [29]:
total_pbar = tqdm(total = len(train_loader)*train_config.epoches,desc = "Model Training",position=0, leave=True)

total_batch = 0 
for epoch in range(train_config.epoches):
    print("*"*50 + f"epoch: {epoch + 1}" + "*"*50)
    
    train_losses = []
    
    for inputs in train_loader:
        model = model.train()
        inputs = utils.to_device(inputs,train_config.device)
        labels = inputs['label'].view(-1,1)
        
        
        optimizer.zero_grad()
        
        probs = model(inputs)
        
        loss = BCELoss(probs,labels)
        
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
        if (total_batch+1) % train_config.eval_steps ==0:
            model_metrics,_ = evaluate_full_metrics(model,test_loader)
            train_loss = np.mean(train_losses)
            model_metrics['train_loss'] = train_loss
            model_metrics["steps"] = total_batch+1
        
            save_model(model,train_config.model_save_dir,total_batch+1,model_metrics)
            df_metrics_temp = pd.DataFrame([model_metrics])
            display(df_metrics_temp)
            
            model = model.train()
            
        total_batch +=1
        total_pbar.update(1)
        
total_pbar.close()

Model Training:   0%|          | 0/490 [00:00<?, ?it/s]

**************************************************epoch: 1**************************************************


Model Evaluating: 100%|██████████| 42/42 [00:05<00:00,  7.83it/s]


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.189,0.32,0.177778,0.911392,0.297521,0.555263,0.44503,0.537698,32


Model Evaluating: 100%|██████████| 42/42 [00:05<00:00,  7.84it/s]


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.16287,0.334,0.179293,0.898734,0.298947,0.55585,0.435935,0.505945,64


Model Evaluating: 100%|██████████| 42/42 [00:05<00:00,  7.82it/s]
  f1 = recall*precision*2 / (recall + precision)


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.12533,0.346,0.180412,0.886076,0.299786,0.552948,0.438162,0.481268,96


Model Training:  20%|█▉        | 97/490 [00:48<09:38,  1.47s/it]

**************************************************epoch: 2**************************************************


Model Evaluating: 100%|██████████| 42/42 [00:05<00:00,  7.77it/s]
  f1 = recall*precision*2 / (recall + precision)


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.237433,0.346,0.180412,0.886076,0.299786,0.540666,0.457642,0.442531,128


Model Evaluating: 100%|██████████| 42/42 [00:05<00:00,  7.80it/s]
  f1 = recall*precision*2 / (recall + precision)


Unnamed: 0,threshold,accuracy,recall,precision,f1,auc,eval_loss,train_loss,steps
0,0.175448,0.304,0.174334,0.911392,0.292683,0.527376,0.435514,0.458487,160


Model Training:  37%|███▋      | 181/490 [01:27<01:45,  2.92it/s]

KeyboardInterrupt: 