## Function

In [1]:
import os
import sys
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [2]:
import datasets
import random
from contextlib import nullcontext
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from torch import nn
from transformers import default_data_collator, Trainer, TrainingArguments

import itertools
from tqdm.auto import tqdm

import torch


sys.path.append("../src")

from utils import number_split, create_mix
from process_SHAC import load_process_SHAC

In [3]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from peft import (
#     get_peft_model,
#     LoraConfig,
#     TaskType,
#     prepare_model_for_int8_training,
# )


In [4]:

##### Dataset Loader and Tokenizer
def preprocess_function(examples):
    # tokenize
    ret = tokenizer(examples['text'], return_tensors='pt', max_length=globalconfig.max_seq_length, padding='max_length', truncation=True).to(globalconfig.device)

    return  ret

def datasets_loader(df, txt_col):
    # from pandas df to Dataset & tokenize
    ret_datasets = datasets.Dataset.from_pandas(df[[txt_col,"label"]].reset_index(drop=True))
    ret_tokenized = ret_datasets.map(preprocess_function, batched=True)

    return ret_tokenized

# def create_peft_config(model):

#     peft_config = LoraConfig(
#         task_type=TaskType.SEQ_CLS,
#         inference_mode=False,
#         r=8,
#         bias="none",
#         lora_alpha=32,
#         lora_dropout=0.05,
#         target_modules = ["query", "value"],
#         modules_to_save=["classifier"],
#     )

#     # prepare int-8 model for training
#     if globalconfig.quantization:
#         model = prepare_model_for_int8_training(model)
#     model = get_peft_model(model, peft_config)
#     model.print_trainable_parameters()
#     return model, peft_config

## Define metric
def compute_metrics_twoLevels(eval_pred):
    # compute AUPRC, based on only two levels of Y
    predictions, labels = eval_pred
    probabilities = nn.functional.softmax(torch.FloatTensor(predictions), dim=-1)[:,1]

    auprc = average_precision_score(y_true=labels, y_score=probabilities)

    return {"auprc":auprc}


# Load Data & Split

## SHAC

### Load SHAC

In [8]:
df_shac = load_process_SHAC(replaceNA="all")

In [9]:
z_Categories = ["uw", "mimic"]  # the order here matters! Should match with df0, df1
label='Drug'
n_zCats = len(z_Categories)
txt_col="text"
domain_col = "location"

y_cat = [0, 1]

In [10]:
# Create binary version of "label"
assert "label" not in df_shac.columns

In [11]:
df_shac['label'] = df_shac[label].astype(int)

In [12]:
df_shac_uw = df_shac.query("location == 'uw'").reset_index(drop=True)
df_shac_mimic = df_shac.query("location == 'mimic'").reset_index(drop=True)


In [13]:
df0 = df_shac_uw
df1 = df_shac_mimic


In [14]:



label2id = {y:idx for idx,y in zip(range(len(y_cat)), y_cat)}
id2label = {idx:y for idx,y in zip(range(len(y_cat)), y_cat)}


In [15]:
label2id

{0: 0, 1: 1}

### Split - SHAC

In [17]:
##### Split
# SHAC-Drug - Balanced Alpha
n_test = 200
train_test_ratio = 4


p_pos_train_z0_ls = np.arange(0, 1, 0.1) # probability of training set examples drawn from site/domain z0 being positive
p_pos_train_z1_ls = np.arange(0, 1, 0.1) # probability of test set examples drawn from site/domain z1 being positive

p_mix_z1_ls     = np.arange(0, 1, 0.05) 

numvals = 1023
base = 1.1


alpha_test_ls = np.power(base, np.arange(numvals))/np.power(base,numvals//2)

valid_full_settings = []
for combination in itertools.product(p_pos_train_z0_ls, 
                                     p_pos_train_z1_ls, 
                                     p_mix_z1_ls,
                                     alpha_test_ls
                                    ):
    

    number_setting = number_split(p_pos_train_z0=combination[0], 
                           p_pos_train_z1 = combination[1], 
                           p_mix_z1 = combination[2], alpha_test = combination[3],
                           train_test_ratio = train_test_ratio, 
                           n_test=n_test,
                                  verbose=False
                                 )

    if (number_setting is not None):
        if np.all([number_setting[k] >= 10 for k in list(number_setting.keys())[:-1]]):
            valid_full_settings.append(number_setting)
    
    
    
    
# run for check valid settings

import warnings; warnings.simplefilter('ignore')

# Validate settings

df0 = df_shac_uw
df1 = df_shac_mimic


valid_n_full_settings = []

for c in tqdm(valid_full_settings):
    c = c.copy()
    # create train/test split according to stats
    dfs = create_mix(df0=df0, df1=df1, target=label, setting=c, sample=False, 
                     seed=222
                    )

    if dfs is None:
        continue
    
    valid_n_full_settings.append(c)


  alpha_train = p_pos_train_z1 / p_pos_train_z0
  alpha_train = p_pos_train_z1 / p_pos_train_z0


  0%|          | 0/26251 [00:00<?, ?it/s]

In [14]:
# ##### Split
# # SHAC-Drug - Balanced Alpha
# ## Only selecting C_y in [0.2, 0.48, 0.72]
# n_test = 200
# train_test_ratio = 4


# p_pos_train_z0_ls = np.arange(0, 1, 0.1) # probability of training set examples drawn from site/domain z0 being positive
# p_pos_train_z1_ls = np.arange(0, 1, 0.1) # probability of test set examples drawn from site/domain z1 being positive

# p_mix_z1_ls     = np.arange(0, 1, 0.05) 

# numvals = 129
# base = 1.01

# alpha_test_ls = np.power(base, np.arange(numvals))/np.power(base,numvals//2)

# valid_full_settings = []
# for combination in itertools.product(p_pos_train_z0_ls, 
#                                      p_pos_train_z1_ls, 
#                                      p_mix_z1_ls,
#                                      alpha_test_ls
#                                     ):
    

#     number_setting = number_split(p_pos_train_z0=combination[0], 
#                            p_pos_train_z1 = combination[1], 
#                            p_mix_z1 = combination[2], alpha_test = combination[3],
#                            train_test_ratio = train_test_ratio, 
#                            n_test=n_test,
#                                   verbose=False
#                                  )

#     if (number_setting is not None) and (number_setting['mix_param_dict']['C_y'] in [0.2, 0.48, 0.72]) and (number_setting['mix_param_dict']['alpha_train'] in [1., 3, 5, 1/3, 0.2]):
#         if np.all([number_setting[k] >= 10 for k in list(number_setting.keys())[:-1]]):
#             valid_full_settings.append(number_setting)
    
    
    
    
# # run for check valid settings

# import warnings; warnings.simplefilter('ignore')

# # Validate settings

# df0 = df_shac_uw
# df1 = df_shac_mimic


# valid_n_full_settings = []

# for c in tqdm(valid_full_settings):
#     c = c.copy()
#     # create train/test split according to stats
#     dfs = create_mix(df0=df0, df1=df1, target=label, setting=c, sample=False, 
#                      seed=222
#                     )

#     if dfs is None:
#         continue
    
#     valid_n_full_settings.append(c)


  alpha_train = p_pos_train_z1 / p_pos_train_z0
  alpha_train = p_pos_train_z1 / p_pos_train_z0


  0%|          | 0/1428 [00:00<?, ?it/s]

In [18]:
len(valid_n_full_settings)

21927

In [19]:
tmp_df = pd.DataFrame([st['mix_param_dict'] for st in valid_n_full_settings])

In [20]:
tmp_df['C_y'].unique()

array([0.1  , 0.1  , 0.1  , 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135,
       0.14 , 0.145, 0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 0.18 ,
       0.185, 0.19 , 0.2  , 0.21 , 0.22 , 0.23 , 0.24 , 0.25 , 0.26 ,
       0.27 , 0.19 , 0.205, 0.235, 0.25 , 0.265, 0.28 , 0.295, 0.31 ,
       0.325, 0.34 , 0.355, 0.24 , 0.26 , 0.3  , 0.32 , 0.34 , 0.36 ,
       0.38 , 0.4  , 0.175, 0.2  , 0.225, 0.275, 0.3  , 0.35 , 0.375,
       0.4  , 0.425, 0.19 , 0.31 , 0.37 , 0.43 , 0.46 , 0.17 , 0.345,
       0.38 , 0.415, 0.45 , 0.485, 0.42 , 0.46 , 0.5  , 0.18 , 0.17 ,
       0.165, 0.16 , 0.12 , 0.115, 0.2  , 0.215, 0.23 , 0.235, 0.24 ,
       0.245, 0.255, 0.27 , 0.285, 0.29 , 0.29 , 0.32 , 0.33 , 0.36 ,
       0.245, 0.305, 0.335, 0.35 , 0.365, 0.395, 0.41 , 0.425, 0.38 ,
       0.44 , 0.375, 0.45 , 0.475, 0.5  , 0.47 , 0.305, 0.445, 0.48 ,
       0.515, 0.21 , 0.18 , 0.255, 0.235, 0.3  , 0.315, 0.33 , 0.345,
       0.355, 0.37 , 0.385, 0.39 , 0.36 , 0.39 , 0.405, 0.435, 0.465,
       0.48 , 0.495,

In [21]:
tmp_df['alpha_train'].unique()

array([1.        , 2.        , 3.        , 4.        , 5.        ,
       6.        , 7.        , 8.        , 9.        , 0.5       ,
       1.5       , 2.5       , 3.5       , 4.5       , 0.33333333,
       0.66666667, 1.33333333, 1.66666667, 2.33333333, 2.66666667,
       3.        , 0.25      , 0.75      , 1.25      , 1.75      ,
       2.25      , 0.2       , 0.4       , 0.6       , 0.8       ,
       1.2       , 1.4       , 1.6       , 1.8       , 0.16666667,
       0.83333333, 1.16666667, 1.5       , 0.14285714, 0.28571429,
       0.42857143, 0.57142857, 0.71428571, 0.85714286, 1.14285714,
       1.28571429, 0.125     , 0.375     , 0.625     , 0.875     ,
       1.125     , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889])

In [36]:
##### Experiment - ONLY One Setting
pick_C = 12720

c = valid_n_full_settings[pick_C]
print("Balanced? Check setting....")
print(c)
dfs = create_mix(df0=df0, df1=df1, target=label, setting=c, sample=False, 
                 # seed=random.randint(0,1000),
                 seed=222
                )


Balanced? Check setting....
{'n_train': 800, 'n_test': 200, 'n_z0_pos_train': 200, 'n_z0_neg_train': 200, 'n_z0_pos_test': 50, 'n_z0_neg_test': 50, 'n_z1_pos_train': 200, 'n_z1_neg_train': 200, 'n_z1_pos_test': 50, 'n_z1_neg_test': 50, 'mix_param_dict': {'p_pos_train_z0': 0.5, 'p_pos_train_z1': 0.5, 'p_pos_train': 0.5, 'p_pos_test': 0.5, 'p_mix_z0': 0.5, 'p_mix_z1': 0.5, 'alpha_train': 1.0, 'alpha_test': 1.0, 'p_pos_test_z0': 0.5, 'p_pos_test_z1': 0.5, 'C_y': 0.5, 'C_z': 0.5}}


## HateSpeech

### Load HateSpeech

In [8]:
import pathlib

In [25]:
txt_col="text"

In [9]:
### Hate Speech
z_category = ['nothate', 'hate']

label2id = {z:idx for idx,z in zip(range(len(z_category)), z_category)}
id2label = {idx:z for idx,z in zip(range(len(z_category)), z_category)}

# (1) dynGen
df_dynGen = pd.read_csv("/bime-munin/xiruod/data/hateSpeech_Bulla2023/Dynamically-Generated-Hate-Speech-Dataset/Dynamically Generated Hate Dataset v0.2.3.csv",)

df_dynGen['label'] = df_dynGen['label'].map({"hate":"hate", "nothate":"nothate"})
df_dynGen["dfSource"] = "dynGen"
df_dynGen['label_binary'] = df_dynGen['label'].map({"hate":1,"nothate":0})

# (2)  wsf
ls_allFiles = pathlib.Path("/bime-munin/xiruod/data/hateSpeech_Bulla2023/hate-speech-dataset/all_files/").glob("*.txt")

ls_id = []
ls_text = []

for ifile in ls_allFiles:
    ls_id.append(ifile.name.split(".txt")[0])
    with open(ifile, "r") as f:
        ls_text.append(f.read())

df_wsf_raw = pd.DataFrame({"file_id":ls_id, "text":ls_text})

df_wsf_annotation = pd.read_csv("/bime-munin/xiruod/data/hateSpeech_Bulla2023/hate-speech-dataset/annotations_metadata.csv")

df_wsf = df_wsf_raw.merge(df_wsf_annotation, on="file_id", how="inner")

df_wsf = df_wsf[df_wsf['label'].isin(['hate','noHate'])].reset_index(drop=True)

# df_wsf['label_binary'] = df_wsf['label'].map({"hate":1,"noHate":0})
df_wsf['label'] = df_wsf['label'].map({"hate":"hate","noHate":"nothate"})
df_wsf["dfSource"] = "wsf"
df_wsf['label_binary'] = df_wsf['label'].map({"hate":1,"nothate":0})


In [34]:
df_dynGen['label'] = df_dynGen['label_binary']
df_wsf['label'] = df_wsf['label_binary']

### Split - HateSpeech

In [35]:
n_test = 1000
train_test_ratio = 4


p_pos_train_z0_ls = np.arange(0,1,0.05) # probability of training set examples drawn from site/domain z0 being positive
p_pos_train_z1_ls = np.arange(0,1,0.05) # probability of test set examples drawn from site/domain z1 being positive

p_mix_z1_ls     = [0.5] # = np.arange(0.1, 0.9, 0.05) 


# alpha_test_ls = np.arange(0, 10, 0.05)

numvals = 1023
base = 1.1
alpha_test_ls = [1]


valid_full_settings = []
for combination in itertools.product(p_pos_train_z0_ls, 
                                     p_pos_train_z1_ls, 
                                     p_mix_z1_ls,
                                     alpha_test_ls
                                    ):
    

    number_setting = number_split(p_pos_train_z0=combination[0], 
                           p_pos_train_z1 = combination[1], 
                           p_mix_z1 = combination[2], alpha_test = combination[3],
                           train_test_ratio = train_test_ratio, 
                           n_test=n_test,
                                  verbose=False
                                 )

    if (number_setting is not None):
        if np.all([number_setting[k] >= 10 for k in list(number_setting.keys())[:-1]]):
            valid_full_settings.append(number_setting)

import warnings; warnings.simplefilter('ignore')

# Validate settings
label='label_binary'
df0 = df_dynGen
df1 = df_wsf


valid_n_full_settings = []

for c in tqdm(valid_full_settings):
        c = c.copy()
        # create train/test split according to stats
        dfs = create_mix(df0=df0, df1=df1, target=label, setting=c, sample=False, 
                         seed=222
                        )

        if dfs is None:
            continue
        
        valid_n_full_settings.append(c)


  0%|          | 0/361 [00:00<?, ?it/s]

In [36]:

##### Experiment - ONLY One Setting
pick_C = 75

c = valid_n_full_settings[pick_C]
print("Balanced? Check setting....")
print(c)
dfs = create_mix(df0=df0, df1=df1, target=label, setting=c, sample=False, 
                 # seed=random.randint(0,1000),
                 seed=222
                )

Balanced? Check setting....
{'n_train': 4000, 'n_test': 1000, 'n_z0_pos_train': 800, 'n_z0_neg_train': 1200, 'n_z0_pos_test': 200, 'n_z0_neg_test': 300, 'n_z1_pos_train': 800, 'n_z1_neg_train': 1200, 'n_z1_pos_test': 200, 'n_z1_neg_test': 300, 'mix_param_dict': {'p_pos_train_z0': 0.4, 'p_pos_train_z1': 0.4, 'p_pos_train': 0.4, 'p_pos_test': 0.4, 'p_mix_z0': 0.5, 'p_mix_z1': 0.5, 'alpha_train': 1.0, 'alpha_test': 1, 'p_pos_test_z0': 0.4, 'p_pos_test_z1': 0.4, 'C_y': 0.4, 'C_z': 0.5}}


# Set Up For One Run on A Specific Dataset

In [12]:
class train_config:
    def __init__(self):
        self.quantization: bool = False

    

In [13]:
globalconfig = train_config()

In [14]:
globalconfig.model_id="distilbert-base-uncased"

In [23]:
# globalconfig.output_dir = "../output/tmpData/DistilBERT_shac_12720"
globalconfig.output_dir = "../output/tmpData/DistilBERT_HateSpeech_75"

In [16]:
globalconfig.quantization = False

In [17]:
globalconfig.device = "cuda:0"

In [18]:
globalconfig.max_seq_length=512

In [19]:
globalconfig.num_train_epochs=3

In [20]:
globalconfig.lr = 1e-4
globalconfig.warmup_ratio = 0.1

In [21]:
rand_seed_np = 24
rand_seed_torch = 187

In [62]:
itertools.product([1,2,3],['a','b'])

<itertools.product at 0x7f90701d13c0>

In [None]:

for m,lr,epoch in itertools.product(["distilbert-base-uncased","bert-base-uncased"], [1e-4, 7e-5, 5e-5, 3e-5, 1e-5], [3,5]):
    
    globalconfig.output_dir = f"../output/tmpData/DistilBERT_shac_12720_{m}_lr_{lr}_epoch_{epoch}"
    globalconfig.model_id=m
    globalconfig.lr = lr
    globalconfig.num_train_epochs=epoch
    
    random.seed(rand_seed_np)
    np.random.seed(rand_seed_np)
    torch.manual_seed(rand_seed_torch)
    torch.cuda.manual_seed(rand_seed_torch)

    # Init model
    tokenizer = AutoTokenizer.from_pretrained(globalconfig.model_id)
    model = AutoModelForSequenceClassification.from_pretrained(globalconfig.model_id, num_labels=len(id2label))

    config = {
                'learning_rate': globalconfig.lr,
                'num_train_epochs': globalconfig.num_train_epochs,
                'gradient_accumulation_steps': 2,
                'per_device_train_batch_size': 8,
                'per_device_eval_batch_size': 8,
                'gradient_checkpointing': False,
                'warmup_ratio':globalconfig.warmup_ratio,
            }

    # Set up profiler
    profiler = nullcontext()

    tokenized_train = datasets_loader(dfs['train'], txt_col=txt_col)
    tokenized_test = datasets_loader(dfs['test'], txt_col=txt_col)

    # Define training args
    training_args = TrainingArguments(
        output_dir=globalconfig.output_dir,
        overwrite_output_dir=True,
        bf16=globalconfig.quantization,  # Use BF16 if available
        # logging strategies
        logging_dir=f"{globalconfig.output_dir}/logs",
        logging_strategy="steps",
        logging_steps=5,
        save_strategy="no",
        optim="adamw_torch_fused" if globalconfig.quantization else "adamw_torch",
        max_steps= -1,

        **{k:v for k,v in config.items() if k != 'lora_config'}
    )

    with profiler:
        # Create Trainer instance
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test,
            data_collator=default_data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics_twoLevels,
            callbacks=[],
        )

        # Start training
        ret_train = trainer.train()
        ret_eval = trainer.evaluate()

    # save metrics
    ret = c
    ret.update(ret_eval)
    ret.update(ret_train.metrics)
    trainer.save_metrics(split="all", metrics=ret)

    ret_code = 1


    # model.save_pretrained(output_dir)

In [37]:

for m,lr,epoch in itertools.product(["distilbert-base-uncased"], [ 3e-5], [3]):
    
    # globalconfig.output_dir = f"../output/tmpData/DistilBERT_shac_12720_{m}_lr_{lr}_epoch_{epoch}"
    globalconfig.output_dir = f"../output/tmpData/DistilBERT_HateSpeech_75_{m}_lr_{lr}_epoch_{epoch}"

    globalconfig.model_id=m
    globalconfig.lr = lr
    globalconfig.num_train_epochs=epoch
    
    random.seed(rand_seed_np)
    np.random.seed(rand_seed_np)
    torch.manual_seed(rand_seed_torch)
    torch.cuda.manual_seed(rand_seed_torch)

    # Init model
    tokenizer = AutoTokenizer.from_pretrained(globalconfig.model_id)
    model = AutoModelForSequenceClassification.from_pretrained(globalconfig.model_id, num_labels=len(id2label))

    config = {
                'learning_rate': globalconfig.lr,
                'num_train_epochs': globalconfig.num_train_epochs,
                'gradient_accumulation_steps': 2,
                'per_device_train_batch_size': 8,
                'per_device_eval_batch_size': 8,
                'gradient_checkpointing': False,
                'warmup_ratio':globalconfig.warmup_ratio,
            }

    # Set up profiler
    profiler = nullcontext()

    tokenized_train = datasets_loader(dfs['train'], txt_col=txt_col)
    tokenized_test = datasets_loader(dfs['test'], txt_col=txt_col)

    # Define training args
    training_args = TrainingArguments(
        output_dir=globalconfig.output_dir,
        overwrite_output_dir=True,
        bf16=globalconfig.quantization,  # Use BF16 if available
        # logging strategies
        logging_dir=f"{globalconfig.output_dir}/logs",
        logging_strategy="steps",
        logging_steps=5,
        save_strategy="no",
        optim="adamw_torch_fused" if globalconfig.quantization else "adamw_torch",
        max_steps= -1,
        

        **{k:v for k,v in config.items() if k != 'lora_config'}
    )

    with profiler:
        # Create Trainer instance
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test,
            data_collator=default_data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics_twoLevels,
            callbacks=[],
        )

        # Start training
        ret_train = trainer.train()
        ret_eval = trainer.evaluate()

    # save metrics
    ret = c
    ret.update(ret_eval)
    ret.update(ret_train.metrics)
    trainer.save_metrics(split="all", metrics=ret)

    ret_code = 1


    model.save_pretrained(output_dir)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss
5,0.6432
10,0.6377
15,0.6697
20,0.7283
25,0.6762
30,0.6863
35,0.6622
40,0.6488
45,0.6669
50,0.6644


NameError: name 'output_dir' is not defined

# Predict on All

## SHAC

In [67]:
df_shac

Unnamed: 0,id,Drug,Alcohol,Tobacco,SubstanceAgg,set,location,text,label
0,3407,True,False,False,True,train,uw,Social History: Work: Unemployed Lives with m...,1
1,3313,False,True,False,True,train,uw,SOCIAL HISTORY: Living situation: Comfortabl...,0
2,3045,True,False,True,True,train,uw,HABITS: Tobacco Use: Approximately 20 pack-ye...,1
3,4399,False,True,True,True,train,uw,Social History: Immigrated from [LOCATION] in ...,0
4,3637,False,False,True,True,train,uw,HABITS: Tobacco Use: Formerly smoked 1-2 ciga...,0
...,...,...,...,...,...,...,...,...,...
4400,2884,True,True,False,True,test,mimic,Social History: lives w/ mother in [**Name2 (N...,1
4401,2728,False,False,True,True,test,mimic,Social History: lives with wife (recent nursin...,0
4402,2854,False,True,True,True,test,mimic,"Social History: quit smoking several yrs ago, ...",0
4403,2726,False,False,False,False,test,mimic,"Social History: Denies Tob, EtOH, or Illicit d...",0


In [105]:
softmax = nn.Softmax()
ret_ls = []
model.eval()
with torch.no_grad():
    for txt in df_shac['text']:
        t = tokenizer([txt], return_tensors='pt', max_length=globalconfig.max_seq_length, padding='max_length', truncation=True).to(globalconfig.device)

        tmp = model(**t)
        
        ret = softmax(tmp['logits'].cpu()).squeeze().tolist()
        ret_ls.append(ret)

In [111]:
df_shac['pred_0'] = np.array(ret_ls)[:, 0]
df_shac['pred_1'] = np.array(ret_ls)[:, 1]

In [112]:
ret_ls

[[0.025337515398859978, 0.9746624827384949],
 [0.9409753680229187, 0.05902468413114548],
 [0.019247403368353844, 0.9807525277137756],
 [0.9108449816703796, 0.08915499597787857],
 [0.9318321347236633, 0.06816788762807846],
 [0.020348506048321724, 0.9796515107154846],
 [0.9470767378807068, 0.052923258394002914],
 [0.6848385334014893, 0.31516146659851074],
 [0.9626388549804688, 0.037361208349466324],
 [0.8300272226333618, 0.16997279226779938],
 [0.961880624294281, 0.03811943158507347],
 [0.018697934225201607, 0.9813020825386047],
 [0.019205909222364426, 0.9807940721511841],
 [0.8733491897583008, 0.12665076553821564],
 [0.020410727709531784, 0.9795892834663391],
 [0.022444212809205055, 0.9775558114051819],
 [0.021024446934461594, 0.9789755940437317],
 [0.023078789934515953, 0.9769212603569031],
 [0.08561872690916061, 0.9143812656402588],
 [0.9593448042869568, 0.0406552217900753],
 [0.022279266268014908, 0.9777206778526306],
 [0.8931539058685303, 0.10684601962566376],
 [0.025607725605368614

In [114]:
globalconfig.output_dir

'../output/tmpData/DistilBERT_shac_12720_distilbert-base-uncased_lr_3e-05_epoch_3'

In [115]:
# df_shac.to_pickle(f"{globalconfig.output_dir}/SHAC_Pred.pkl")

In [120]:
df_shac.query("location == 'uw'").reset_index(drop=True).to_pickle(f"{globalconfig.output_dir}/SHAC_Pred_uw.pkl")
df_shac.query("location == 'mimic'").reset_index(drop=True).to_pickle(f"{globalconfig.output_dir}/SHAC_Pred_mimic.pkl")

## HateSpeech

In [None]:
df_dynGen

In [53]:
softmax = nn.Softmax()
model.eval()

for df in [df_dynGen, df_wsf]:
    
    ret_ls = []
    
    with torch.no_grad():
        for txt in tqdm(df['text']):
            t = tokenizer([txt], return_tensors='pt', max_length=globalconfig.max_seq_length, padding='max_length', truncation=True).to(globalconfig.device)

            tmp = model(**t)

            ret = softmax(tmp['logits'].cpu()).squeeze().tolist()
            ret_ls.append(ret)
    df['pred_0'] = np.array(ret_ls)[:, 0]
    df['pred_1'] = np.array(ret_ls)[:, 1]

  0%|          | 0/41144 [00:00<?, ?it/s]

  0%|          | 0/10703 [00:00<?, ?it/s]

In [54]:
globalconfig.output_dir

'../output/tmpData/DistilBERT_HateSpeech_75_distilbert-base-uncased_lr_3e-05_epoch_3'

In [115]:
# df_shac.to_pickle(f"{globalconfig.output_dir}/SHAC_Pred.pkl")

In [55]:
df_dynGen.to_pickle(f"{globalconfig.output_dir}/HateSpeech_Pred_dynGen.pkl")
df_wsf.to_pickle(f"{globalconfig.output_dir}/HateSpeech_Pred_wsf.pkl")