In [1]:
# Checkpoint directory (on the mounted Google Drive) that ToxicityPredictor
# loads its tokenizer and model from by default.
OUTPUT_DIR = '/content/drive/MyDrive/debertatftmdl'

# Hand-written probe sentences for the classifier: a mix of clearly benign,
# neutral and clearly abusive inputs. The strings are model inputs, so they are
# kept verbatim.
test_cases = [
    "beautiful movie.",
    "This is a beautiful movie",
    "This is such a hateful and horrible movie",
    "You are an asshole",
    "wonderful person",
    "You are a wonderful person!",
    "I hate you and wish you would die",
    "The weather is nice today",
    "You're completely stupid and useless",
    "You idiot you will pay for this",
    "You are rotten bloody person",
    "You should be ashamed of yourself",
    "You should die or hang yourself why are your still living",
]

In [9]:
# Mount Google Drive at /content/drive. OUTPUT_DIR points below this mount
# point, so this cell has to run before the model is loaded from OUTPUT_DIR.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
from copy import deepcopy
import torch, torch.nn as nn
import pandas as pd
from transformers import  AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DebertaForSequenceClassification

class ToxicityPredictor:
    """Inference wrapper around a sequence-classification checkpoint.

    Loads a tokenizer and an ``AutoModelForSequenceClassification`` from
    ``model_path``, places the model on the GPU when one is available, and
    exposes :meth:`predict` for scoring a single piece of text.

    Attributes:
        device: ``torch.device`` the model and the tokenised inputs live on.
        tokenizer: Tokenizer loaded from ``model_path``.
        model: Classification model, kept in eval mode.
        original_state: Deep copy of the model's ``state_dict`` taken right
            after loading; :meth:`reset_model` restores it.
    """

    def __init__(self, model_path=OUTPUT_DIR):
        """Load tokenizer and model from ``model_path`` and prepare for inference.

        Args:
            model_path: Directory (or hub id) accepted by ``from_pretrained``.
                Defaults to the module-level ``OUTPUT_DIR``.
        """
        print(model_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        config = AutoConfig.from_pretrained(model_path)
        # NOTE(review): ignore_mismatched_sizes=True lets loading succeed even
        # when checkpoint tensors do not fit the architecture; such weights are
        # then freshly (randomly) initialised. If transformers reports
        # "Some weights ... are newly initialized" for the classifier/encoder,
        # the predictions below are not meaningful -- verify the checkpoint.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, config=config, ignore_mismatched_sizes=True
        )

        # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Actually use the selected device (previously computed but never applied,
        # so inference silently stayed on CPU).
        self.model.to(self.device)

        # Inference only: disable dropout and other train-time behaviour.
        self.model.eval()

        # Snapshot of the freshly loaded weights, used by reset_model().
        self.original_state = deepcopy(self.model.state_dict())

    def predict(self, text, threshold=0.5, verbose=True):
        """Score ``text`` and classify it as toxic or non-toxic.

        The text is tokenised to a fixed length of 128 tokens (padded or
        truncated). Only the first sequence of the batch is reported, so pass
        one string per call. Index 1 of the softmax output is treated as the
        "toxic" class -- presumably matching the label order used during
        fine-tuning; verify against the training code.

        Args:
            text: Input text to classify.
            threshold: Minimum toxic probability for a ``'Toxic'`` verdict.
            verbose: When true (default, matching the previous behaviour),
                print the raw logits, probabilities and toxic probability.

        Returns:
            dict with keys ``'text'``, ``'prediction'`` (``'Toxic'`` or
            ``'Non-toxic'``), ``'toxic_probability'`` and
            ``'non_toxic_probability'`` (strings formatted to 3 decimals) and
            ``'raw_probabilities'`` (NumPy array for the first sequence).
        """
        # Ensure model is in eval mode before each prediction
        self.model.eval()

        with torch.no_grad():
            encoded = self.tokenizer(
                text,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Inputs must live on the same device as the model.
            inputs = {name: tensor.to(self.device) for name, tensor in encoded.items()}

            outputs = self.model(**inputs)

            if verbose:
                print(outputs[0])

                print("### logits####")
                print(outputs.logits)
                print("##################")

            probabilities = torch.softmax(outputs.logits, dim=1)
            if verbose:
                print("####### probabilities ######")
                print(probabilities)

            toxic_prob = probabilities[0][1].item()
            if verbose:
                print("### toxic_prob ####")
                print(toxic_prob)

            prediction = 'Toxic' if toxic_prob >= threshold else 'Non-toxic'

            return {
                'text': text,
                'prediction': prediction,
                'toxic_probability': f"{toxic_prob:.3f}",
                'non_toxic_probability': f"{1-toxic_prob:.3f}",
                # .cpu() is required before .numpy() when running on CUDA.
                'raw_probabilities': probabilities[0].cpu().numpy()
            }

    def reset_model(self):
        """Reset model to original state"""
        self.model.load_state_dict(self.original_state)

In [36]:
def test_model_consistency(model_path, test_cases):
    predictor = ToxicityPredictor(model_path)
    intialrundict = {'contextstr': [], 'ToxicProbability': [], 'predictionresult': []}
    # First run
    print("First run:")
    for text in test_cases:
        t = predictor.predict(text)
        intialrundict['contextstr'].append(text)
        intialrundict['ToxicProbability'].append(t['toxic_probability'])
        intialrundict['predictionresult'].append(t['prediction'])
    intialresultdf = pd.DataFrame(intialrundict)
    print(intialresultdf.head(15))

    print("################################################")

    # Reset model
    predictor.reset_model()

    # Second run
    print("\nSecond run:")
    secndrundict = {'contextstr': [], 'ToxicProbability': [], 'predictionresult': []}
    for text in test_cases:
        test = predictor.predict(text)
        secndrundict['contextstr'].append(text)
        secndrundict['ToxicProbability'].append(test['toxic_probability'])
        secndrundict['predictionresult'].append(test['prediction'])

    secondresultdf = pd.DataFrame(secndrundict)
    print(secondresultdf.head(15))

In [24]:
# Score the full probe list twice against the checkpoint in OUTPUT_DIR.
test_model_consistency(
    model_path=OUTPUT_DIR,
    test_cases=test_cases,
)

/content/drive/MyDrive/debertatftmdl


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/debertatftmdl and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.v_bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.weight', 'deberta.encoder.layer.10.attention.output.dense.bias', 'deberta.encoder.layer.10.attention.output.dense.weight', 'deberta.encoder.layer.10.attention.self.in_proj.weight', 'deberta.encoder.layer.10.attention.self.q_bias', 'deberta.encoder.layer.10.attention.self.v_bias', 'deberta.encoder.layer.10.intermediate.dens

First run:
                                           contextstr ToxicProbability  \
0                                    beautiful movie.            0.455   
1                           This is a beautiful movie            0.458   
2           This is such a hateful and horrible movie            0.468   
3                                  You are an asshole            0.465   
4                                    wonderful person            0.437   
5                         You are a wonderful person!            0.447   
6                   I hate you and wish you would die            0.448   
7                           The weather is nice today            0.457   
8                You're completely stupid and useless            0.447   
9                     You idiot you will pay for this            0.460   
10                       You are rotten bloody person            0.440   
11                  You should be ashamed of yourself            0.452   
12  You should die or hang 

In [39]:
# Single-sentence run, used to inspect the raw logits and probabilities.
test_model_consistency(
    model_path=OUTPUT_DIR,
    test_cases=["You should die or hang yourself"],
)

/content/drive/MyDrive/debertatftmdl


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/debertatftmdl and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.v_bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.bias', 'deberta.encoder.layer.10.attention.output.LayerNorm.weight', 'deberta.encoder.layer.10.attention.output.dense.bias', 'deberta.encoder.layer.10.attention.output.dense.weight', 'deberta.encoder.layer.10.attention.self.in_proj.weight', 'deberta.encoder.layer.10.attention.self.q_bias', 'deberta.encoder.layer.10.attention.self.v_bias', 'deberta.encoder.layer.10.intermediate.dens

First run:
tensor([[-0.0389, -0.0259]])
### logits####
tensor([[-0.0389, -0.0259]])
##################
####### probabilities ######
tensor([[0.4968, 0.5032]])
### toxic_prob ####
0.5032382607460022
                        contextstr ToxicProbability predictionresult
0  You should die or hang yourself            0.503            Toxic
################################################

Second run:
tensor([[-0.0389, -0.0259]])
### logits####
tensor([[-0.0389, -0.0259]])
##################
####### probabilities ######
tensor([[0.4968, 0.5032]])
### toxic_prob ####
0.5032382607460022
                        contextstr ToxicProbability predictionresult
0  You should die or hang yourself            0.503            Toxic
