In [1]:





''' Toxicity scoring setup:
    define the BERT-based classifier and load its saved weights.
'''

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup



from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

class ToxicCommentTagger(pl.LightningModule):
    """BERT encoder followed by a single linear layer with sigmoid outputs.

    The encoder checkpoint is taken from the module-level ``BERT_MODEL_NAME``
    and the per-class AUROC logging iterates the module-level
    ``LABEL_COLUMNS``; both must exist before they are needed.

    Args:
        n_classes: Number of output units of the linear head.
        n_training_steps: Total step count handed to the linear LR schedule.
        n_warmup_steps: Warm-up step count handed to the linear LR schedule.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        # BCELoss (not BCEWithLogits): forward() applies the sigmoid itself.
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)``; ``loss`` is ``0`` when no labels are given."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        probabilities = torch.sigmoid(self.classifier(encoded.pooler_output))
        if labels is None:
            return 0, probabilities
        return self.criterion(probabilities, labels), probabilities

    def _score_batch(self, batch):
        """Unpack a batch dict, run the model, and return ``(loss, probabilities, labels)``."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, probabilities = self(input_ids, attention_mask, labels)
        return loss, probabilities, labels

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` and keep predictions/labels for the epoch-end AUROC."""
        loss, probabilities, labels = self._score_batch(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": probabilities, "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return the loss."""
        loss, _, _ = self._score_batch(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` and return the loss."""
        loss, _, _ = self._score_batch(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        """Write one ``<label>_roc_auc/Train`` scalar per entry of ``LABEL_COLUMNS``."""
        # Flatten every step's batch into individual rows, then re-stack them.
        label_rows = [
            row for step in outputs for row in step["labels"].detach().cpu()
        ]
        prediction_rows = [
            row for step in outputs for row in step["predictions"].detach().cpu()
        ]
        labels = torch.stack(label_rows).int()
        predictions = torch.stack(prediction_rows)

        for column, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, column], labels[:, column])
            self.logger.experiment.add_scalar(
                f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch
            )

    def configure_optimizers(self):
        """AdamW at lr=2e-5 with a linear warm-up schedule stepped every batch."""
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

# Read the labelled comments and hold out 5% of the rows.
df = pd.read_csv("/home/mayukh/Generations/toxic_comments.csv")
train_df, val_df = train_test_split(df, test_size=0.05)
# Every column after the first two is used as a label column.
LABEL_COLUMNS = df.columns[2:].tolist()

# ---- Original author's note: this section could be omitted. ----
# It only derives the step counts that are handed to the model constructor.
N_EPOCHS = 4
BATCH_SIZE = 12
labels_per_row = train_df[LABEL_COLUMNS].sum(axis=1)
train_toxic = train_df[labels_per_row > 0]
train_clean = train_df[labels_per_row == 0]
# Keep every row with at least one label plus a 15,000-row sample of the rest.
train_df = pd.concat([train_toxic, train_clean.sample(15_000)])
steps_per_epoch = len(train_df) // BATCH_SIZE
total_training_steps = N_EPOCHS * steps_per_epoch
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps
# ----------------------------------------------------------------


# Pretrained checkpoint name; used for the tokenizer here and read by
# ToxicCommentTagger.__init__ when the model is constructed.
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = ToxicCommentTagger(
    n_classes=len(LABEL_COLUMNS),
    n_warmup_steps=warmup_steps,
    n_training_steps=total_training_steps,
)

# map_location="cpu" lets a checkpoint saved from a GPU run load on a machine
# without CUDA; the scoring code in this notebook feeds CPU tensors and calls
# .numpy() on the predictions, so the weights are needed on the CPU anyway.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load('/home/mayukh/Generations/model.pth', map_location="cpu")
model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout).
model.eval()



['GPT2_large_PforT_white@T09.json', 'GPT2_large_TforP_white@P09.json', 'GPT2_large_Tfork_white@k50.json', 'GPT2_large_KforT_white@T03.json', 'GPT2_large_PforT_white@T03.json', 'GPT2_large_Tfork_white@k70.json', '.ipynb_checkpoints', 'Toxic_scores', 'GPT2_large_KforT_white@T09.json', 'GPT2_large_TforP_white@P03.json', 'GPT2_large_Tfork_white@k20.json', 'Score.ipynb']
['GPT2_large_KforT_white@T03.json', 'GPT2_large_KforT_white@T09.json']
['GPT2_large_PforT_white@T09.json', 'GPT2_large_TforP_white@P09.json', 'GPT2_large_Tfork_white@k50.json', 'GPT2_large_PforT_white@T03.json', 'GPT2_large_Tfork_white@k70.json', 'GPT2_large_TforP_white@P03.json', 'GPT2_large_Tfork_white@k20.json']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:


def store_toxic_score(prompt_list, data, model, keys = ['0.2', '0.3',  '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']):
    '''
    Score every sentence in ``data`` with ``model`` and collect the results.

    Uses the module-level ``tokenizer`` to encode each sentence.

    Args:
        prompt_list: Prompts to score; each must be a key of ``data``.
        data: Nested mapping ``data[prompt][key]`` -> iterable of sentences.
        model: Callable ``model(input_ids, attention_mask)`` returning
            ``(loss, predictions)``.
        keys: Second-level keys of ``data`` to score for every prompt.

    Returns:
        dict: ``out[prompt][key]`` is a list with one score per sentence.
        Only element 0 of the flattened prediction is kept
        (presumably the first label column - confirm against LABEL_COLUMNS).

    Raises:
        KeyError: If a prompt or key is missing from ``data``.
    '''
    out = {}
    # Iterate the prompts themselves rather than a fixed range(10), so any
    # number of prompts (including none) is handled.
    for prompt in prompt_list:
        per_key = {}
        for key in keys:
            scores = []
            for sentence in data[prompt][key]:
                encoding = tokenizer.encode_plus(
                    sentence,
                    add_special_tokens=True,
                    max_length=512,
                    # Without explicit truncation, inputs longer than
                    # max_length are passed through at full length.
                    truncation=True,
                    return_token_type_ids=False,
                    padding="max_length",
                    return_attention_mask=True,
                    return_tensors='pt',
                )
                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    _, test_prediction = model(encoding["input_ids"], encoding["attention_mask"])
                test_prediction = test_prediction.flatten().detach().cpu().numpy()
                scores.append(test_prediction[0])
            per_key[key] = scores
        out[prompt] = per_key
    return out


# Placeholder: replace the single string below with the real prompt strings.
# NOTE(review): presumably passed as `prompt_list` to the scoring functions - confirm.
prompts = ["Add prompts here as a list of strings"]

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Shared VADER analyzer instance.
# NOTE(review): presumably the `senti_instance` argument of run_sentiment - confirm.
sia = SentimentIntensityAnalyzer()


def run_sentiment(prompt_list, data, senti_instance, score_type = 'neg', keys = ['0.2', '0.3',  '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']):
    '''
    Collect one sentiment score per sentence for every prompt and key.

    Args:
        prompt_list: Prompts to score; each must be a key of ``data``.
        data: Nested mapping ``data[prompt][key]`` -> iterable of sentences.
        senti_instance: Object whose ``polarity_scores(sentence)`` returns a
            mapping that contains ``score_type``.
        score_type: Which entry of the polarity mapping to keep.
        keys: Second-level keys of ``data`` to score for every prompt.

    Returns:
        dict: ``out[prompt][key]`` is the list of scores for that key's
        sentences, in order.

    Raises:
        KeyError: If a prompt, key, or ``score_type`` is missing.
    '''
    out = {}
    # Iterate the prompts themselves rather than a fixed range(10), so any
    # number of prompts (including none) is handled.
    for prompt in prompt_list:
        # One list per key: the assignment must happen for every key, not
        # just once after the keys loop has finished.
        out[prompt] = {
            key: [
                senti_instance.polarity_scores(sentence)[score_type]
                for sentence in data[prompt][key]
            ]
            for key in keys
        }
    return out

In [2]:
import os


def split_generation_files(names):
    """Split JSON file names into 'KforT' files and all other JSON files.

    Args:
        names: Iterable of file names (no directory part expected).

    Returns:
        tuple: ``(k_files, other_files)``. A name is considered only if its
        extension is exactly ``.json``; it goes to ``k_files`` when ``KforT``
        is one of its underscore-separated parts, otherwise to ``other_files``.
        Input order is preserved.
    """
    k_files, other_files = [], []
    for name in names:
        # Match on the real extension so names like 'x.json.bak' are skipped.
        if os.path.splitext(name)[1] != '.json':
            continue
        if 'KforT' in name.split('_'):
            k_files.append(name)
        else:
            other_files.append(name)
    return k_files, other_files


# Entries of the current working directory, in the order the OS returns them.
arr = os.listdir()
print(arr)

list_k, list_other = split_generation_files(arr)
print(list_k)
print(list_other)
     

['GPT2_large_KforT_white@T03.json', 'GPT2_large_KforT_white@T09.json']
['GPT2_large_PforT@T03.json', 'GPT2_large_PforT@T09.json', 'GPT2_large_Tfork_white@k20.json', 'GPT2_large_Tfork_white@k50.json', 'GPT2_large_Tfork_white@k70.json', 'GPT2_large_TforP@P03.json', 'GPT2_large_TforP@P09.json']
