In [None]:
!pip install -q -r requirements.txt

In [2]:
# Directory (under the mounted Drive) that the model and tokenizer are loaded from.
OUTPUT_DIR = '/content/drive/MyDrive/debertatftmdl'

# Sample sentences, a mix of benign and abusive text, used to exercise the predictor.
test_cases = [
    "beautiful movie.",
    "This is a beautiful movie",
    "This is such a hateful and horrible movie",
    "You are an asshole",
    "wonderful person",
    "You are a wonderful person!",
    "I hate you and wish you would die",
    "The weather is nice today",
    "You're completely stupid and useless",
    "You idiot you will pay for this",
    "You are rotten bloody person",
    "You should be ashamed of yourself",
    "You should die or hang yourself why are your still living",
]

In [1]:
# Mount Google Drive at /content/drive; OUTPUT_DIR points below this mount point,
# so this cell has to run before anything is loaded from OUTPUT_DIR.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from copy import deepcopy
import torch, torch.nn as nn
import pandas as pd
from transformers import  AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DebertaForSequenceClassification

class ToxicityPredictor:
    """Score text for toxicity with a sequence-classification checkpoint.

    The tokenizer, config and model are all loaded from ``model_path``. The
    model is placed on the GPU when one is available (CPU otherwise), put in
    eval mode, and a deep copy of its weights is kept so ``reset_model`` can
    restore them later.
    """

    def __init__(self, model_path=OUTPUT_DIR):
        """Load tokenizer and model from ``model_path`` and prepare for inference.

        Args:
            model_path: directory (or model id) passed to ``from_pretrained``.
        """
        print(model_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        config = AutoConfig.from_pretrained(model_path)
        # NOTE(review): ignore_mismatched_sizes=True lets loading succeed even when
        # checkpoint tensors do not fit this architecture; the affected weights are
        # then freshly initialized. Confirm the checkpoint really matches, otherwise
        # predictions come from untrained weights.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, config=config, ignore_mismatched_sizes=True
        )

        # Keep the embedding matrix in step with the tokenizer's vocabulary size.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Actually use the selected device (previously it was computed but the
        # model stayed on CPU because the .to(...) call was commented out).
        self.model.to(self.device)

        # Ensure model is in evaluation mode (disables dropout).
        self.model.eval()

        # Snapshot of the weights as loaded, used by reset_model().
        self.original_state = deepcopy(self.model.state_dict())

    def predict(self, text, threshold=0.5, verbose=True):
        """Classify ``text`` as 'Toxic' or 'Non-toxic'.

        Args:
            text: the string to score.
            threshold: minimum probability of class index 1 for a 'Toxic' verdict.
            verbose: when True (default, matching the previous behaviour) print
                the raw logits, probabilities and toxic probability.

        Returns:
            dict with keys ``text``, ``prediction`` ('Toxic' / 'Non-toxic'),
            ``toxic_probability`` and ``non_toxic_probability`` (strings with
            three decimals) and ``raw_probabilities`` (numpy array for the
            first input row).
        """
        # Ensure model is in eval mode before each prediction
        self.model.eval()

        with torch.no_grad():
            inputs = self.tokenizer(
                text,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).to(self.device)  # inputs must live on the same device as the model

            outputs = self.model(**inputs)

            probabilities = torch.softmax(outputs.logits, dim=1)
            # Index 1 is treated as the toxic class — assumes a two-label head; TODO confirm.
            toxic_prob = probabilities[0][1].item()

            if verbose:
                print(outputs[0])

                print("### logits####")
                print(outputs.logits)
                print("##################")

                print("####### probabilities ######")
                print(probabilities)

                print("### toxic_prob ####")
                print(toxic_prob)

            prediction = 'Toxic' if toxic_prob >= threshold else 'Non-toxic'

            return {
                'text': text,
                'prediction': prediction,
                'toxic_probability': f"{toxic_prob:.3f}",
                'non_toxic_probability': f"{1-toxic_prob:.3f}",
                'raw_probabilities': probabilities[0].cpu().numpy()
            }

    def reset_model(self):
        """Reset model to the weights captured at construction time."""
        self.model.load_state_dict(self.original_state)

In [None]:
def test_model_consistency(model_path, test_cases):
    """Score ``test_cases`` twice with one predictor, resetting weights in between.

    Builds a ``ToxicityPredictor`` from ``model_path``, prints a table of the
    toxic probability and verdict for every text, calls ``reset_model`` and
    prints a second table produced the same way. Nothing is returned.
    """
    predictor = ToxicityPredictor(model_path)

    def _score_all():
        # Single pass over the inputs, keeping each text next to its result.
        scored = [(text, predictor.predict(text)) for text in test_cases]
        return pd.DataFrame({
            'contextstr': [text for text, _ in scored],
            'ToxicProbability': [res['toxic_probability'] for _, res in scored],
            'predictionresult': [res['prediction'] for _, res in scored],
        })

    # First pass
    print("First run:")
    print(_score_all().head(15))

    print("################################################")

    # Restore the weights captured when the predictor was built
    predictor.reset_model()

    # Second pass
    print("\nSecond run:")
    print(_score_all().head(15))

In [None]:
# Run the two-pass consistency check over the full sentence list.
# NOTE(review): the recorded output reports the classifier head and many encoder
# weights as "newly initialized", and every ToxicProbability sits around 0.44-0.47
# regardless of the text. Presumably the checkpoint does not match the architecture
# being loaded; confirm before trusting these scores.
test_model_consistency(model_path=OUTPUT_DIR, test_cases=test_cases)

/content/drive/MyDrive/debertatftmdl


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/debertatftmdl and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.v_bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.weight', 'deberta.encoder.layer.10.attention.output.dense.bias', 'deberta.encoder.layer.10.attention.output.dense.weight', 'deberta.encoder.layer.10.attention.self.in_proj.weight', 'deberta.encoder.layer.10.attention.self.q_bias', 'deberta.encoder.layer.10.attention.self.v_bias', 'deberta.encoder.layer.10.intermediate.dens

First run:
                                           contextstr ToxicProbability  \
0                                    beautiful movie.            0.455   
1                           This is a beautiful movie            0.458   
2           This is such a hateful and horrible movie            0.468   
3                                  You are an asshole            0.465   
4                                    wonderful person            0.437   
5                         You are a wonderful person!            0.447   
6                   I hate you and wish you would die            0.448   
7                           The weather is nice today            0.457   
8                You're completely stupid and useless            0.447   
9                     You idiot you will pay for this            0.460   
10                       You are rotten bloody person            0.440   
11                  You should be ashamed of yourself            0.452   
12  You should die or hang 

In [None]:
# Same check on a single sentence; in the recorded output both runs print
# identical logits, alongside the same "newly initialized" weights warning.
test_model_consistency(model_path=OUTPUT_DIR, test_cases=["You should die or hang yourself"])

/content/drive/MyDrive/debertatftmdl


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/debertatftmdl and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.v_bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.weight', 'deberta.encoder.layer.10.attention.output.dense.bias', 'deberta.encoder.layer.10.attention.output.dense.weight', 'deberta.encoder.layer.10.attention.self.in_proj.weight', 'deberta.encoder.layer.10.attention.self.q_bias', 'deberta.encoder.layer.10.attention.self.v_bias', 'deberta.encoder.layer.10.intermediate.dens

First run:
tensor([[-0.0389, -0.0259]])
### logits####
tensor([[-0.0389, -0.0259]])
##################
####### probabilities ######
tensor([[0.4968, 0.5032]])
### toxic_prob ####
0.5032382607460022
                        contextstr ToxicProbability predictionresult
0  You should die or hang yourself            0.503            Toxic
################################################

Second run:
tensor([[-0.0389, -0.0259]])
### logits####
tensor([[-0.0389, -0.0259]])
##################
####### probabilities ######
tensor([[0.4968, 0.5032]])
### toxic_prob ####
0.5032382607460022
                        contextstr ToxicProbability predictionresult
0  You should die or hang yourself            0.503            Toxic


In [3]:
# Sentences fed one by one to infer_toxiccnfvalidator: a mix of benign and abusive text.
testlist = [
    "beautiful movie.",
    "This is a beautiful movie",
    "This is such a hateful and horrible movie",
    "You are an asshole",
    "wonderful person",
    "You are a wonderful person!",
    "I hate you and wish you would die",
    "The weather is nice today",
    "You're completely stupid and useless",
    "You idiot you will pay for this",
    "You are rotten bloody person",
    "You should be ashamed of yourself",
    "You should die or hang yourself why are your still living",
    "You are a scumbag why dont you die nasty fellow",
    "You are a asshole",
]

In [21]:
import warnings, os
import torch
from debertaclassifier import CustomDebertaClassifier
from transformers import AutoTokenizer, AutoModel
warnings.filterwarnings("ignore")

def infer_toxiccnfvalidator(contextstr, mdlpath=OUTPUT_DIR):
    """Classify one string as toxic / non-toxic with ``CustomDebertaClassifier``.

    Args:
        contextstr: the text to classify.
        mdlpath: directory containing ``pytorch_model.bin`` and the tokenizer
            files. Both the weights and the tokenizer are read from here.

    Returns:
        On success, a dict with keys ``contextstr`` (the input), ``cnf_intr``
        (string, 4 decimals), ``evallabel`` (int index of the largest output)
        and ``prediction`` ('toxic' when ``evallabel`` is 1, else 'non-toxic').
        On any failure, including a missing checkpoint file, a string
        describing the error; nothing is raised.
    """
    try:
        binpath = os.path.join(mdlpath, 'pytorch_model.bin')
        if not os.path.exists(binpath):
            # Route a missing checkpoint through the error path below instead of
            # silently falling off the end of the function and returning None.
            raise FileNotFoundError(f"model checkpoint not found: {binpath}")

        ft_model = CustomDebertaClassifier()
        # map_location='cpu' keeps loading working on CPU-only runtimes even when
        # the checkpoint was saved from a GPU.
        ft_model.load_state_dict(torch.load(binpath, map_location='cpu'))
        ft_model.eval()

        # Load the tokenizer from the same directory as the weights (previously
        # the global OUTPUT_DIR was used here, ignoring the mdlpath argument).
        tokenizer = AutoTokenizer.from_pretrained(mdlpath)

        result = {}
        result['contextstr'] = contextstr
        with torch.no_grad():
            # Tokenize inputs
            inputs = tokenizer(contextstr, return_tensors="pt")
            output = ft_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

            # presumably output[0] holds the logits with shape (1, 2) — TODO confirm
            # against CustomDebertaClassifier.
            logits = output[0][0]
            print(logits[0], logits[1])

            # NOTE(review): this is | |l1| - |l0| |, not |l1 - l0|. For logits of
            # opposite sign it is close to zero however far apart they are.
            # Left unchanged because the intended metric is unclear; confirm.
            cnf_intr = torch.abs(abs(logits[1]) - abs(logits[0]))
            print(f"Absolute difference: {cnf_intr.item():.4f}")

            # Index of the largest output value is used as the predicted label.
            evallabel = output[0].argmax().item()
            result['cnf_intr'] = f"{cnf_intr.item():.4f}"
            result['evallabel'] = evallabel
            result['prediction'] = 'toxic' if evallabel == 1 else 'non-toxic'
        return result

    except Exception as e:
        # Failures are reported as a message string rather than raised.
        return (f"Encountered error while performing inference: {e}")

In [22]:
# Spot-check one sentence through the CustomDebertaClassifier inference path.
infer_toxiccnfvalidator(contextstr="You are a asshole", mdlpath=OUTPUT_DIR)

tensor(-5.1721) tensor(5.0374)
Absolute difference: 0.1347


{'contextstr': 'You are a asshole',
 'cnf_intr': '0.1347',
 'evallabel': 1,
 'prediction': 'toxic'}

In [24]:
import pandas as pd

# Run the validator over every sentence in testlist and tabulate the results,
# one column per field of the returned dict.
resultdict = {'contextstr': [],'cnf_intr': [],'evallabel': [], 'predictionresult': []}

for s in testlist:
    t = infer_toxiccnfvalidator(contextstr=s)
    # infer_toxiccnfvalidator does not raise on failure; it hands back something
    # other than a result dict (e.g. an error-message string). Indexing that with
    # 'cnf_intr' would raise an unrelated TypeError, so surface the real problem.
    if not isinstance(t, dict):
        raise RuntimeError(f"Inference failed for {s!r}: {t}")
    resultdict['contextstr'].append(s)
    resultdict['cnf_intr'].append(t['cnf_intr'])
    resultdict['evallabel'].append(t['evallabel'])
    resultdict['predictionresult'].append(t['prediction'])
resultdf = pd.DataFrame(resultdict)

tensor(2.5843) tensor(-2.7499)
Absolute difference: 0.1656
tensor(3.3614) tensor(-3.4493)
Absolute difference: 0.0879
tensor(-4.8856) tensor(4.7549)
Absolute difference: 0.1307
tensor(-5.1726) tensor(5.0368)
Absolute difference: 0.1358
tensor(3.0788) tensor(-3.1365)
Absolute difference: 0.0577
tensor(2.9462) tensor(-2.9178)
Absolute difference: 0.0284
tensor(-4.5914) tensor(4.5007)
Absolute difference: 0.0906
tensor(4.2531) tensor(-4.5266)
Absolute difference: 0.2735
tensor(-5.1585) tensor(5.0246)
Absolute difference: 0.1339
tensor(-5.0664) tensor(4.9276)
Absolute difference: 0.1388
tensor(-5.1367) tensor(4.9869)
Absolute difference: 0.1499
tensor(-4.9174) tensor(4.7916)
Absolute difference: 0.1259
tensor(-4.9788) tensor(4.8555)
Absolute difference: 0.1233
tensor(-5.1555) tensor(5.0284)
Absolute difference: 0.1271
tensor(-5.1721) tensor(5.0374)
Absolute difference: 0.1347


In [25]:
# Display the per-sentence results table built in the previous cell.
resultdf

Unnamed: 0,contextstr,cnf_intr,evallabel,predictionresult
0,beautiful movie.,0.1656,0,non-toxic
1,This is a beautiful movie,0.0879,0,non-toxic
2,This is such a hateful and horrible movie,0.1307,1,toxic
3,You are an asshole,0.1358,1,toxic
4,wonderful person,0.0577,0,non-toxic
5,You are a wonderful person!,0.0284,0,non-toxic
6,I hate you and wish you would die,0.0906,1,toxic
7,The weather is nice today,0.2735,0,non-toxic
8,You're completely stupid and useless,0.1339,1,toxic
9,You idiot you will pay for this,0.1388,1,toxic
