# Imports

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Dataset Class and Metrics

In [None]:
class FitmentDataset(Dataset):
    """Dataset that tokenizes one text per item, lazily, at access time.

    Args:
        texts: indexable collection of input texts.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` (called with the keyword
            arguments shown in ``__getitem__``).
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is driven by the texts, not the labels.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``labels`` tensor."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened so each item is 1-D.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item


In [None]:
from sklearn.metrics import classification_report

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 for a prediction object.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    acc = accuracy_score(y_true, y_pred)
    # The fourth element (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return dict(accuracy=acc, precision=prf[0], recall=prf[1], f1=prf[2])


# Preprocess Data

In [None]:
import pandas as pd

def prepare_data(fold_idx):
    """Label one fold's predictions against its targets and attach the prompt text.

    Reads three CSV files for the fold from ``bc_data/``:
    ``predictions_fold_{fold_idx}.csv``, ``targets_fold_{fold_idx}.csv`` and
    ``data_fold_{fold_idx}.csv``.

    A prediction row gets ``LABEL = 1`` when the targets file contains a row
    with the same INDEX, FTMNT_YEAR, FTMNT_MAKE, FTMNT_MODEL and UN_NORMALIZED;
    otherwise ``LABEL = 0``. Each labelled row is then joined with the first
    ``data_fold`` row whose ``index`` equals its ``INDEX``; prediction rows
    without such a row are dropped.

    The result is written to ``data/final_data_fold_{fold_idx}.csv`` (the
    ``data`` directory is created when missing) and returned.

    Args:
        fold_idx: fold number used to build the file names.

    Returns:
        pandas.DataFrame with columns INDEX, prompt, response, cat, FTMNT_YEAR,
        FTMNT_MAKE, FTMNT_MODEL, UN_NORMALIZED, LABEL, in prediction-file order.
    """
    import os  # local import so this cell does not depend on a later cell's import

    key_cols = ['INDEX', 'FTMNT_YEAR', 'FTMNT_MAKE', 'FTMNT_MODEL', 'UN_NORMALIZED']
    text_cols = ['prompt', 'response', 'cat']

    preds = pd.read_csv(f'bc_data/predictions_fold_{fold_idx}.csv')
    targets = pd.read_csv(f'bc_data/targets_fold_{fold_idx}.csv')
    data_fold = pd.read_csv(f'bc_data/data_fold_{fold_idx}.csv')

    new_preds = preds[key_cols].copy()

    # One set lookup per prediction row instead of scanning all targets per row.
    # Target rows with a missing key are dropped so that a NaN never counts as a
    # match, mirroring element-wise ``==`` where NaN equals nothing.
    target_keys = set(targets[key_cols].dropna().itertuples(index=False, name=None))
    new_preds['LABEL'] = [
        int(key in target_keys)
        for key in new_preds[key_cols].itertuples(index=False, name=None)
    ]

    # Keep only the first data_fold row per index (what a first-match lookup
    # would return); rows with a missing index can never be matched.
    first_rows = (
        data_fold[['index'] + text_cols]
        .dropna(subset=['index'])
        .drop_duplicates(subset='index', keep='first')
    )

    # An inner merge keeps the order of the left (prediction) rows and drops
    # predictions that have no matching data_fold row.
    merged = new_preds.merge(
        first_rows, left_on='INDEX', right_on='index', how='inner', validate='many_to_one'
    )
    final_data = merged[['INDEX'] + text_cols + key_cols[1:] + ['LABEL']]

    # The output directory may not exist yet on a fresh run.
    os.makedirs('data', exist_ok=True)
    final_data.to_csv(f'data/final_data_fold_{fold_idx}.csv', index=False)
    print(f"final_data_fold_{fold_idx}.csv file created successfully.")

    return final_data


In [None]:
# Example: prepare data for fold 4 (prepare_data writes data/final_data_fold_4.csv).
# NOTE(review): later cells read data/final_data_fold_0..4.csv; only fold 4 is
# generated here - confirm the other folds already exist on disk.
import csv

final_processed_Data = prepare_data(4)


final_data_fold_4.csv file created successfully.


In [None]:
import os

directory = "data"

# exist_ok makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(directory, exist_ok=True)

# Specify the full file path (directory + file name)
file_path = os.path.join(directory, 'output_file.csv')

# Save the processed DataFrame to the specified path. to_csv is a DataFrame
# method that takes the destination path; the previous call had the receiver
# and the argument swapped and raised AttributeError on the str path.
final_processed_Data.to_csv(file_path, index=False)

print(f"CSV file saved successfully to {file_path}")

In [None]:
data = pd.read_csv('data/final_data_fold_4.csv')
# LABEL == 0 marks predictions with no matching target row, i.e. false
# positives; the variable is named accordingly (it used to be called
# true_positives, which contradicted the filter).
false_positives = data[data['LABEL'] == 0]
print(false_positives)

      INDEX  ... LABEL
38       11  ...     0
39       11  ...     0
41       11  ...     0
84       20  ...     0
122      26  ...     0
...     ...  ...   ...
6975    993  ...     0
6976    993  ...     0
6977    993  ...     0
6978    993  ...     0
7027    999  ...     0

[1013 rows x 9 columns]


# Prepare Train and Test Datasets

In [None]:
def prepare_train_test_data(fold_idx, tokenizer, max_len, n_folds=5):
    """Build train/test datasets for one cross-validation split.

    The ``response`` column is the model input and ``LABEL`` the target. Fold
    ``fold_idx`` is the test set; every other fold in ``range(n_folds)`` is
    concatenated into the training set. All folds are read from
    ``data/final_data_fold_{i}.csv``.

    Args:
        fold_idx: index of the held-out (test) fold.
        tokenizer: tokenizer handed to ``FitmentDataset``.
        max_len: maximum sequence length handed to ``FitmentDataset``.
        n_folds: total number of folds on disk (defaults to 5, the value that
            used to be hard-coded).

    Returns:
        tuple ``(train_dataset, test_dataset)`` of ``FitmentDataset`` objects.
    """
    def _texts_and_labels(fold):
        # Pull the two columns directly instead of iterating row by row.
        frame = pd.read_csv(f'data/final_data_fold_{fold}.csv')
        return frame['response'].tolist(), frame['LABEL'].tolist()

    train_texts, train_labels = [], []
    for fold in range(n_folds):
        if fold == fold_idx:
            continue  # the held-out fold never enters the training set
        texts, labels = _texts_and_labels(fold)
        train_texts.extend(texts)
        train_labels.extend(labels)

    test_texts, test_labels = _texts_and_labels(fold_idx)

    # Tokenization itself happens lazily inside FitmentDataset.__getitem__.
    train_dataset = FitmentDataset(train_texts, train_labels, tokenizer, max_len)
    test_dataset = FitmentDataset(test_texts, test_labels, tokenizer, max_len)

    return train_dataset, test_dataset

# Train and Evaluate

In [None]:
def train_and_evaluate(fold_idx, tokenizer, model, max_len, batch_size, epochs):
    """Fine-tune ``model`` on all folds except ``fold_idx`` and evaluate on that fold.

    Args:
        fold_idx: held-out fold; also used in the output directory name.
        tokenizer: tokenizer forwarded to ``prepare_train_test_data``.
        model: model instance handed to ``Trainer`` (trained in place).
        max_len: maximum sequence length forwarded to ``prepare_train_test_data``.
        batch_size: per-device batch size for both training and evaluation.
        epochs: number of training epochs.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.

    NOTE(review): the held-out fold is also the ``eval_dataset``, so the
    checkpoint selected by ``load_best_model_at_end`` / ``metric_for_best_model``
    is chosen on the same data that is reported as the test result.
    """
    train_dataset, test_dataset = prepare_train_test_data(fold_idx, tokenizer, max_len)

    # Training configuration, grouped by concern.
    optimisation = dict(
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
    )
    schedule = dict(
        evaluation_strategy="steps",
        eval_steps=10,
        logging_steps=10,
    )
    checkpointing = dict(
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
    )
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold_idx}',
        logging_dir='./logs',
        **optimisation,
        **schedule,
        **checkpointing,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Final evaluation on the held-out fold.
    eval_results = trainer.evaluate()
    print(f"Fold {fold_idx} results:", eval_results)
    return eval_results


In [None]:
from transformers import DebertaTokenizerFast, DebertaForSequenceClassification

# Checkpoint used for both the tokenizer and every per-fold model.
model_name = 'microsoft/deberta-base'

# The tokenizer has no trainable state, so one instance is shared by all folds.
tokenizer = DebertaTokenizerFast.from_pretrained(model_name)

max_len = 128
batch_size = 16
epochs = 3

# Train and evaluate for each fold
all_results = []
for fold_idx in range(5):  # 5 folds in total
    print(f"Training on fold {fold_idx}...")

    # Start every fold from the pretrained checkpoint. Reusing one model object
    # across folds would carry over weights already fine-tuned on earlier
    # folds' training data - which includes the current fold's test rows - and
    # would inflate the reported scores.
    model = DebertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model = model.to(device)

    # train_and_evaluate already prints the fold's results, so they are only
    # collected here.
    results = train_and_evaluate(fold_idx, tokenizer, model, max_len, batch_size, epochs)
    all_results.append(results)

print("All fold results:", all_results)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on fold 0...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.55,0.348691,0.886334,0.886334,1.0,0.939742
20,0.4395,0.36492,0.886334,0.886334,1.0,0.939742
30,0.3992,0.37595,0.886334,0.886334,1.0,0.939742
40,0.433,0.339222,0.886334,0.886334,1.0,0.939742
50,0.3606,0.353628,0.883876,0.891387,0.989558,0.937911
60,0.4197,0.349529,0.87679,0.886367,0.9876,0.934249
70,0.4123,0.331394,0.886334,0.886334,1.0,0.939742
80,0.3389,0.331507,0.883731,0.88671,0.996084,0.93822
90,0.4376,0.331494,0.883442,0.89588,0.982705,0.937286
100,0.3743,0.341898,0.881562,0.8931,0.984174,0.936428




Fold 0 results: {'eval_loss': 0.3766857087612152, 'eval_accuracy': 0.8822848879248012, 'eval_precision': 0.8974128906834156, 'eval_recall': 0.9791156795562082, 'eval_f1': 0.9364856429463171, 'eval_runtime': 16.0082, 'eval_samples_per_second': 431.965, 'eval_steps_per_second': 6.809, 'epoch': 3.0}
Fold 0 results: {'eval_loss': 0.3766857087612152, 'eval_accuracy': 0.8822848879248012, 'eval_precision': 0.8974128906834156, 'eval_recall': 0.9791156795562082, 'eval_f1': 0.9364856429463171, 'eval_runtime': 16.0082, 'eval_samples_per_second': 431.965, 'eval_steps_per_second': 6.809, 'epoch': 3.0}
Training on fold 1...




Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.3209,0.274378,0.891683,0.895574,0.992229,0.941427
20,0.3107,0.309171,0.886835,0.893816,0.98843,0.938745
30,0.3086,0.289133,0.881988,0.903413,0.96909,0.9351
40,0.3279,0.279445,0.88835,0.891783,0.993265,0.939793
50,0.333,0.280418,0.886078,0.904609,0.972716,0.937427
60,0.3237,0.292696,0.886532,0.898388,0.981696,0.938196
70,0.2882,0.289932,0.889562,0.895716,0.989294,0.940182
80,0.337,0.282612,0.883654,0.898967,0.977206,0.936455
90,0.3269,0.28399,0.881836,0.900048,0.973407,0.935291
100,0.3058,0.2852,0.883805,0.89659,0.98066,0.936742




Fold 1 results: {'eval_loss': 0.31604668498039246, 'eval_accuracy': 0.8807756400545372, 'eval_precision': 0.8900841908325537, 'eval_recall': 0.9858400967017786, 'eval_f1': 0.9355182302335109, 'eval_runtime': 15.1301, 'eval_samples_per_second': 436.281, 'eval_steps_per_second': 6.874, 'epoch': 3.0}
Fold 1 results: {'eval_loss': 0.31604668498039246, 'eval_accuracy': 0.8807756400545372, 'eval_precision': 0.8900841908325537, 'eval_recall': 0.9858400967017786, 'eval_f1': 0.9355182302335109, 'eval_runtime': 15.1301, 'eval_samples_per_second': 436.281, 'eval_steps_per_second': 6.874, 'epoch': 3.0}
Training on fold 2...




Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.3109,0.324574,0.852545,0.869577,0.962741,0.913791
20,0.2868,0.333022,0.847808,0.850232,0.986234,0.913197
30,0.2796,0.327028,0.854974,0.877044,0.95526,0.914482
40,0.3419,0.32809,0.852423,0.889459,0.934311,0.911333
50,0.2667,0.355915,0.850601,0.852489,0.986683,0.91469
60,0.2563,0.325248,0.854974,0.871531,0.96334,0.915139
70,0.2271,0.336762,0.851694,0.856155,0.982343,0.914919
80,0.2868,0.325416,0.857525,0.871795,0.966632,0.916767
90,0.3028,0.326495,0.855095,0.877579,0.954661,0.914499
100,0.2414,0.361683,0.849387,0.854963,0.980847,0.913589




Fold 2 results: {'eval_loss': 0.35242268443107605, 'eval_accuracy': 0.8569172841005709, 'eval_precision': 0.885990744636096, 'eval_recall': 0.9453838096663175, 'eval_f1': 0.9147241928478356, 'eval_runtime': 20.5292, 'eval_samples_per_second': 401.039, 'eval_steps_per_second': 6.284, 'epoch': 3.0}
Fold 2 results: {'eval_loss': 0.35242268443107605, 'eval_accuracy': 0.8569172841005709, 'eval_precision': 0.885990744636096, 'eval_recall': 0.9453838096663175, 'eval_f1': 0.9147241928478356, 'eval_runtime': 20.5292, 'eval_samples_per_second': 401.039, 'eval_steps_per_second': 6.284, 'epoch': 3.0}
Training on fold 3...




Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.3041,0.269398,0.879255,0.882655,0.980541,0.929026
20,0.3088,0.259524,0.877598,0.894714,0.96124,0.926785
30,0.2582,0.265112,0.878235,0.880244,0.982598,0.928609
40,0.2898,0.256336,0.88053,0.917364,0.936086,0.926631




KeyboardInterrupt: 

In [None]:
# Show the per-fold evaluation dicts collected by the cross-validation loop.
print(all_results)

# Write Train Data to Text Files

In [None]:
import pandas as pd

def prepare_textFILE(fold_idx, n_folds=5):
    """Write the training responses for one split to two line-per-sample text files.

    All folds in ``range(n_folds)`` except ``fold_idx`` are read from
    ``data/final_data_fold_{i}.csv`` and concatenated. Responses with
    ``LABEL == 1`` go to ``bc_data/processed_data_true_positive_train_{fold_idx}.txt``
    and responses with ``LABEL == 0`` go to
    ``bc_data/processed_data_false_positive_train_{fold_idx}.txt``.

    Each response occupies exactly one line: line breaks inside a response are
    replaced by single spaces. Without this, a multi-line response is split
    into several unrelated samples by any reader that treats one line as one
    sample.

    Args:
        fold_idx: fold left out of the training set; also used in file names.
        n_folds: total number of folds on disk (defaults to 5).
    """
    train_frames = [
        pd.read_csv(f"data/final_data_fold_{i}.csv")
        for i in range(n_folds)
        if i != fold_idx
    ]
    merged_df = pd.concat(train_frames, ignore_index=True)

    # Output path -> responses to write (1 = true positive, 0 = false positive).
    outputs = {
        f'bc_data/processed_data_true_positive_train_{fold_idx}.txt':
            merged_df[merged_df['LABEL'] == 1]['response'],
        f'bc_data/processed_data_false_positive_train_{fold_idx}.txt':
            merged_df[merged_df['LABEL'] == 0]['response'],
    }

    for path, responses in outputs.items():
        with open(path, 'w') as handle:
            for response in responses:
                # Collapse embedded line breaks so one response == one line.
                single_line = ' '.join(str(response).splitlines())
                handle.write(f"{single_line}\n")

    print("Files created successfully!")


In [None]:
# Write the train-split text files for each of the five folds (0-4).
for i in range(5):
  prepare_textFILE(i)

Files created successfully!
Files created successfully!
Files created successfully!
Files created successfully!
Files created successfully!


# Write Test Data to Text Files

In [None]:
import pandas as pd

def prepare_textFILE_Test(fold_idx):
    """Write the test responses of one fold to two line-per-sample text files.

    Reads ``data/final_data_fold_{fold_idx}.csv``. Responses with ``LABEL == 1``
    go to ``bc_data/processed_data_true_positive_test_{fold_idx}.txt`` and
    responses with ``LABEL == 0`` go to
    ``bc_data/processed_data_false_positive_test_{fold_idx}.txt``.

    Each response occupies exactly one line: line breaks inside a response are
    replaced by single spaces. Without this, a multi-line response is split
    into several unrelated samples by any reader that treats one line as one
    sample.

    Args:
        fold_idx: fold whose rows are written; also used in the file names.
    """
    df = pd.read_csv(f'data/final_data_fold_{fold_idx}.csv')

    # Output path -> responses to write (1 = true positive, 0 = false positive).
    outputs = {
        f'bc_data/processed_data_true_positive_test_{fold_idx}.txt':
            df[df['LABEL'] == 1]['response'],
        f'bc_data/processed_data_false_positive_test_{fold_idx}.txt':
            df[df['LABEL'] == 0]['response'],
    }

    for path, responses in outputs.items():
        with open(path, 'w') as handle:
            for response in responses:
                # Collapse embedded line breaks so one response == one line.
                single_line = ' '.join(str(response).splitlines())
                handle.write(f"{single_line}\n")

    print("Files created successfully!")


In [None]:
# Write the test-split text files for each of the five folds (0-4).
for i in range(5):
  prepare_textFILE_Test(i)

Files created successfully!
Files created successfully!
Files created successfully!
Files created successfully!
Files created successfully!


# Binary Classifier - DistilBERT

In [None]:
# The following code is adapted from:
# 1. HuggingFace tutorial on using DistillBert https://huggingface.co/distilbert/distilbert-base-uncased
# 2. Huggingface tutorial on training transformers for sequence classification here: https://huggingface.co/docs/transformers/tasks/sequence_classification

### Importing libraries
import argparse
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler, SequentialSampler

# specify the available devices
# torch device type strings are lowercase: "CPU" is rejected by torch.device,
# so the fallback must be spelled "cpu" for CPU-only machines to work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Read a text file into a one-column DataFrame, one row per line
def read_text(filename):
  """Return a DataFrame whose single column holds every line of ``filename``.

  Leading and trailing whitespace (including the newline) is stripped from
  each line; blank lines are kept as empty strings.
  """
  with open(filename, "r") as handle:
    stripped = [raw.strip() for raw in handle]
  return pd.DataFrame(stripped)

# Set seed (passed as random_state to train_test_split further down)
seed = 912

## Parser for setting input values
# The option names (safe/harmful) come from the script this cell was adapted
# from. In this notebook the "safe" files are the true-positive responses and
# the "harmful" files are the false-positive responses, as the explicit
# argument list passed to parse_args below shows.
parser = argparse.ArgumentParser(description='Adversarial masks for the safety classifier.')
parser.add_argument('--safe_train', type=str, default='data/safe_prompts_train_insertion_erased.txt', help='File containing safe prompts for training')
parser.add_argument('--harmful_train', type=str, default='data/harmful_prompts_train.txt', help='File containing harmful prompts for training')
parser.add_argument('--safe_test', type=str, default='data/safe_prompts_test_insertion_erased.txt', help='File containing safe prompts for testing')
parser.add_argument('--harmful_test', type=str, default='data/harmful_prompts_test.txt', help='File containing harmful prompts for testing')
parser.add_argument('--save_path', type=str, default='models/distilbert_insertion.pt', help='Path to save the model')

# An explicit argument list is passed so the notebook kernel's own argv is not
# parsed; every default above is overridden with the fold-0 files.
args = parser.parse_args(['--safe_train', 'bc_data/processed_data_true_positive_train_0.txt',
                          '--harmful_train', 'bc_data/processed_data_false_positive_train_0.txt',
                          '--safe_test', 'bc_data/processed_data_true_positive_test_0.txt',
                          '--harmful_test', 'bc_data/processed_data_false_positive_test_0.txt',
                          '--save_path', 'bc_data/distilbert_insertion.pt'
                          ])

# Load the two training text files and create the dataset for training the classifier
# Class 1: lines of args.safe_train (true positives here), Class 0: lines of args.harmful_train (false positives here)
safe_prompt_train = read_text(args.safe_train)
harm_prompt_train = read_text(args.harmful_train)
prompt_data_train = pd.concat([safe_prompt_train, harm_prompt_train], ignore_index=True)
# Labels follow the concatenation order: ones for the first file, zeros for the second.
prompt_data_train['Y'] = pd.Series(np.concatenate([np.ones(safe_prompt_train.shape[0]), np.zeros(harm_prompt_train.shape[0])])).astype(int)

# Split train dataset into train and validation sets (80/20, stratified on the label)
train_text, val_text, train_labels, val_labels = train_test_split(prompt_data_train[0],
								prompt_data_train['Y'],
								random_state=seed,
								test_size=0.2,
								stratify=prompt_data_train['Y'])

# Count number of samples in each class in the training set (label -> count);
# used below to build the per-sample sampling weights
count = train_labels.value_counts().to_dict()

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the pre-trained DistilBert with a sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
# print(model)

# tokenize and encode sequences in the training set
# padding='max_length' replaces the deprecated pad_to_max_length=True and has
# the same effect: every sequence is padded (or truncated) to max_length tokens.
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

## Convert lists to tensors for train split
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())
# One weight per training sample: the inverse of its class count, so both
# classes carry the same total sampling weight.
sample_weights = torch.tensor([1/count[i] for i in train_labels])

## Convert lists to tensors for validation split
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# define the batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
# train_sampler = RandomSampler(train_data)
# Draws len(train_data) samples per epoch, with replacement, using the class-balancing weights above.
train_sampler = WeightedRandomSampler(sample_weights, len(train_data), replacement=True)

# dataLoader for the train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for the validation data (fixed order, no resampling)
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# push the model to the selected device (GPU when available)
model = model.to(device)

# define the optimizer
# NOTE(review): AdamW is imported from transformers at the top of this cell;
# that class is deprecated in favour of torch.optim.AdamW - confirm against the
# installed transformers version.
optimizer = AdamW(model.parameters(), lr = 1e-5)          # learning rate

# from sklearn.utils.class_weight import compute_class_weight

# #compute the class weights
# class_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(train_labels), y = train_labels.to_numpy())

# print("Class Weights:",class_weights)

# # converting list of class weights to a tensor
# weights= torch.tensor(class_weights,dtype=torch.float)

# # push to GPU
# weights = weights.to(device)

# define the loss function
# The loss is unweighted; class balance is handled by the weighted sampler instead.
# loss_fn  = nn.NLLLoss(weight=weights)
# loss_fn  = nn.CrossEntropyLoss(weight=weights)
loss_fn = nn.CrossEntropyLoss()
# loss_fn = nn.NLLLoss()

# number of training epochs
epochs = 3

# run one training epoch
def train():
  """Train ``model`` for one pass over ``train_dataloader``.

  Uses the notebook-level ``model``, ``train_dataloader``, ``device``,
  ``loss_fn`` and ``optimizer``.

  Returns:
    tuple ``(avg_loss, total_preds)``: the mean batch loss and the model
    outputs of every batch concatenated along axis 0.
  """
  model.train()

  running_loss = 0
  epoch_outputs = []
  n_batches = len(train_dataloader)

  for step, batch in enumerate(train_dataloader):

    # progress line every 50 batches and on the final batch
    if (step + 1) % 50 == 0 or step == n_batches - 1:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step + 1, n_batches))

    # move the batch tensors to the selected device
    sent_id, mask, labels = (tensor.to(device) for tensor in batch)

    # forward pass on a clean gradient state
    model.zero_grad()
    outputs = model(sent_id, mask)[0]
    loss = loss_fn(outputs, labels)
    running_loss += loss.item()

    # backward pass, with the gradient norm clipped to 1.0 before the update
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # keep a detached CPU copy of the outputs
    epoch_outputs.append(outputs.detach().cpu().numpy())

  # mean loss over batches; outputs stacked to (number of samples, ...)
  return running_loss / n_batches, np.concatenate(epoch_outputs, axis=0)

# run one validation pass
def evaluate():
  """Evaluate ``model`` on ``val_dataloader`` without updating its weights.

  Uses the notebook-level ``model``, ``val_dataloader``, ``device`` and
  ``loss_fn``.

  Returns:
    tuple ``(avg_loss, total_preds)``: the mean batch loss and the model
    outputs of every batch concatenated along axis 0.
  """
  print("\nEvaluating...")

  # evaluation mode (deactivates dropout layers)
  model.eval()

  running_loss = 0
  collected = []
  n_batches = len(val_dataloader)

  for step, batch in enumerate(val_dataloader):

    # progress line every 50 batches and on the final batch
    if (step + 1) % 50 == 0 or step == n_batches - 1:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step + 1, n_batches))

    # move the batch tensors to the selected device
    sent_id, mask, labels = (tensor.to(device) for tensor in batch)

    # no gradients are needed for validation
    with torch.no_grad():
      outputs = model(sent_id, mask)[0]
      running_loss += loss_fn(outputs, labels).item()
      collected.append(outputs.detach().cpu().numpy())

  # mean loss over batches; outputs stacked to (number of samples, ...)
  return running_loss / n_batches, np.concatenate(collected, axis=0)

# set initial loss to infinite so the first epoch always counts as an improvement
best_validation_loss = float('inf')

# empty lists to store training and validation loss of each epoch
training_losses=[]
validation_losses=[]
# Set to False to skip training; the rest of the cell still runs.
train_flag = True

if train_flag == True:
    # for each epoch
    for epoch in range(epochs):
        # Copilot Code Reference: Similar code with 2 license types [MIT, unknown]
        # https://github.com/github-copilot/code_referencing?cursor=ca31ec3ebd8e24ea9127b39656a9ec6b&editor=vscode
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        #train model (the returned predictions are not used)
        training_loss, _ = train()

        #evaluate model on the validation split
        validation_loss, _ = evaluate()

        #save the best model: the state dict is written only when validation loss improves
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            torch.save(model.state_dict(), args.save_path)
            # torch.save(model.state_dict(), 'new_distillbert_saved_weights.pt')

        # append training and validation loss
        training_losses.append(training_loss)
        validation_losses.append(validation_loss)

        print(f'\nTraining Loss: {training_loss:.3f}')
        print(f'Validation Loss: {validation_loss:.3f}')


# Build the test set: Class 1 = lines of args.safe_test, Class 0 = lines of args.harmful_test
safe_prompt_test = read_text(args.safe_test)
harm_prompt_test = read_text(args.harmful_test)
prompt_data_test = pd.concat([safe_prompt_test, harm_prompt_test], ignore_index=True)
# Labels follow the concatenation order: ones for the first file, zeros for the second.
prompt_data_test['Y'] = pd.Series(np.concatenate([np.ones(safe_prompt_test.shape[0]), np.zeros(harm_prompt_test.shape[0])])).astype(int)

test_text = prompt_data_test[0]
test_labels = prompt_data_test['Y']

# tokenize and encode sequences in the test set
# padding='max_length' replaces the deprecated pad_to_max_length=True and has
# the same effect: every sequence is padded (or truncated) to max_length tokens.
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

#load weights of best model
# path = args.save_path
# # path = 'new_distillbert_saved_weights.pt'
# model.load_state_dict(torch.load(path))
# model.eval()

# # get predictions for test data
# with torch.no_grad():
#   preds = model(test_seq.to(device), test_mask.to(device))[0]
#   preds = preds.detach().cpu().numpy()

# preds = np.argmax(preds, axis = 1)
# print(f'Testing Accuracy = {100*torch.sum(torch.tensor(preds) == test_y)/test_y.shape[0]}%')
# print(classification_report(test_y, preds))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Epoch 1 / 3
  Batch    50  of  6,362.
  Batch   100  of  6,362.
  Batch   150  of  6,362.
  Batch   200  of  6,362.
  Batch   250  of  6,362.
  Batch   300  of  6,362.
  Batch   350  of  6,362.
  Batch   400  of  6,362.
  Batch   450  of  6,362.
  Batch   500  of  6,362.
  Batch   550  of  6,362.
  Batch   600  of  6,362.
  Batch   650  of  6,362.
  Batch   700  of  6,362.
  Batch   750  of  6,362.
  Batch   800  of  6,362.
  Batch   850  of  6,362.
  Batch   900  of  6,362.
  Batch   950  of  6,362.
  Batch 1,000  of  6,362.
  Batch 1,050  of  6,362.
  Batch 1,100  of  6,362.
  Batch 1,150  of  6,362.
  Batch 1,200  of  6,362.
  Batch 1,250  of  6,362.
  Batch 1,300  of  6,362.
  Batch 1,350  of  6,362.
  Batch 1,400  of  6,362.
  Batch 1,450  of  6,362.
  Batch 1,500  of  6,362.
  Batch 1,550  of  6,362.
  Batch 1,600  of  6,362.
  Batch 1,650  of  6,362.
  Batch 1,700  of  6,362.
  Batch 1,750  of  6,362.
  Batch 1,800  of  6,362.
  Batch 1,850  of  6,362.
  Batch 1,900  of  6,362

# Get Results

In [None]:
# Test the classifier in batches to avoid CUDA out of memory error
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)  # Use the same batch size as training

# load weights of best model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
path = args.save_path
model.load_state_dict(torch.load(path, map_location=device))
model.eval()

all_preds = []
# get predictions for test data
with torch.no_grad():
    for batch in test_dataloader:
        sent_id, mask, labels = [t.to(device) for t in batch]
        preds = model(sent_id, mask)[0]
        preds = preds.detach().cpu().numpy()
        all_preds.append(preds)

all_preds = np.concatenate(all_preds, axis=0)  # Combine predictions from all batches
preds = np.argmax(all_preds, axis=1)  # predicted class = highest-scoring output
print(f'Testing Accuracy = {100*torch.sum(torch.tensor(preds) == test_y)/test_y.shape[0]}%')
print(classification_report(test_y, preds))

Testing Accuracy = 73.4814224243164%
              precision    recall  f1-score   support

           0       0.28      0.43      0.34      7916
           1       0.88      0.79      0.83     41423

    accuracy                           0.73     49339
   macro avg       0.58      0.61      0.59     49339
weighted avg       0.78      0.73      0.76     49339

