# Insalling dependencies

In [None]:
!pip install datasets transformers rouge-score sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


# Reading dictionary

In [None]:
import pandas as pd
df = pd.read_csv("/content/bad-words.csv")
df

Unnamed: 0,jigaboo
0,mound of venus
1,asslover
2,s&m
3,queaf
4,whitetrash
...,...
1611,cocky
1612,transsexual
1613,unfuckable
1614,bestiality


# Load dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="/content/detoxification_dataset.csv", split = "train[400000:500000]")

In [None]:
data = dataset.train_test_split(test_size=0.01, seed=42)

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['toxic', 'detoxified'],
        num_rows: 99000
    })
    test: Dataset({
        features: ['toxic', 'detoxified'],
        num_rows: 1000
    })
})

In [None]:
import pandas as pd
test_df = pd.DataFrame()
test_df["toxic_texts"] = data["test"]["toxic"]
test_df["detoxified"] = data["test"]["detoxified"]
test_df

Unnamed: 0,toxic_texts,detoxified
0,I don't give a shit.,I don't care.
1,I've never seen anything more disgusting in my...,"See, isn't that the most disgusting thing you'..."
2,my biggest client is a crook.,My biggest client is a fraud.
3,"When that o'malley kid cheated on you, - I was...","when that O'Malley boy cheated on you, I was a..."
4,"Those morons. if escaping was easy, I would ha...","if the escape was so simple, I'd be gone."
...,...,...
995,Let's get the fuck out ofhere!,let's get out of here.
996,"shut up, Bordin. Or I'll punish you.","Quiet Bordin, or you'll get a reprimand."
997,We are not goanna kill a goatin this man's bac...,we will not kill the goat in this man's yard.
998,They wanted to see what screwing a dead man fe...,they just wanted to see what it was like with ...


# Detoxification using regex, and dictionary

In [None]:
import pandas as pd
baseline = []
import re

def text_detox(text, offensive_words):
    text = text.lower()  # Convert text to lowercase for case insensitivity
    for word in offensive_words:
        text = re.sub(re.escape(word)," " ,text, flags=re.IGNORECASE)
    return text

offensive_words = list(df["jigaboo"])

for input_text in list(test_df["toxic_texts"]):
  baseline.append(text_detox(input_text, offensive_words))

In [None]:
test_df["baseline"] = baseline
test_df

Unnamed: 0,toxic_texts,detoxified,baseline
0,I don't give a shit.,I don't care.,i don't give a .
1,I've never seen anything more disgusting in my...,"See, isn't that the most disgusting thing you'...",i've never seen anything more disgusting in my...
2,my biggest client is a crook.,My biggest client is a fraud.,my ggest client is a crook.
3,"When that o'malley kid cheated on you, - I was...","when that O'Malley boy cheated on you, I was a...","when that o'malley cheated on you, - i was r..."
4,"Those morons. if escaping was easy, I would ha...","if the escape was so simple, I'd be gone.","t se s. if escaping was easy, i would have be..."
...,...,...,...
995,Let's get the fuck out ofhere!,let's get out of here.,let's get the ck out ofhere!
996,"shut up, Bordin. Or I'll punish you.","Quiet Bordin, or you'll get a reprimand.","shut up, bordin. or i'll punish you."
997,We are not goanna kill a goatin this man's bac...,we will not kill the goat in this man's yard.,we are not goanna a goatin this man's bac ard.
998,They wanted to see what screwing a dead man fe...,they just wanted to see what it was like with ...,they wanted to see what ing a man felt like.


# Sample example

In [None]:
test_df["toxic_texts"].iloc[0], baseline[0]

("I don't give a shit.", "i don't give a  .")

# Evaluation

## ROUGE scores

In [None]:
from rouge_score import rouge_scorer
import pandas as pd

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []

# Iterate through rows and calculate ROUGE scores
for index, row in test_df.iterrows():
    reference = row['detoxified']  # Ground truth reference
    prediction = row['baseline']  # Model-generated prediction

    # Calculate ROUGE scores
    scores = scorer.score(reference, prediction)

    rouge_scores.append(scores)

# Create a new DataFrame to store ROUGE scores
rouge_df = pd.DataFrame(rouge_scores)

In [None]:
rouge1 = list(rouge_df["rouge1"])
rouge2 = list(rouge_df["rouge2"])
rougeL = list(rouge_df["rougeL"])

def mean_rouge(ls : list):
    sum = 0
    count = 0
    for scores in ls:
        count+=1
        sum+=scores[-1]
    return sum/count
avg_rouge1 = mean_rouge(rouge1)
avg_rouge2 = mean_rouge(rouge2)
avg_rougeL = mean_rouge(rougeL)

In [None]:
avg_rouge1, avg_rouge2, avg_rougeL

(0.5419490403667457, 0.2887050028434485, 0.5233145446304337)

## BERT embeddings similarity

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Initialize the BERT model and tokenizer outside the loop
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to("cuda")

def bert_text_similarity(reference, candidate, tokenizer, model):
    tokens_ref = tokenizer.encode(reference, add_special_tokens=True, return_tensors="pt").to("cuda")
    tokens_cand = tokenizer.encode(candidate, add_special_tokens=True, return_tensors="pt").to("cuda")

    with torch.no_grad():
        embeddings_ref = model(tokens_ref)[0][:, 0, :]
        embeddings_cand = model(tokens_cand)[0][:, 0, :]

    similarity_score = cosine_similarity(embeddings_ref.cpu(), embeddings_cand.cpu())[0][0]

    return similarity_score

def calculate_mean_similarity(df, tokenizer, model):
    total_similarity = 0.0
    num_pairs = len(df)

    for index, row in df.iterrows():
        human_response = row["detoxified"]
        model_response = row["baseline"]

        similarity_score = bert_text_similarity(human_response, model_response, tokenizer, model)
        total_similarity += similarity_score

    mean_similarity = total_similarity / num_pairs
    return mean_similarity

# Calculate and print the mean similarity
mean_similarity = calculate_mean_similarity(test_df, tokenizer, model)
print(f"Mean BERT Text Similarity Score: {mean_similarity:.2f}")

Mean BERT Text Similarity Score: 0.92


# Toxicity Classification model's true toxic labels count

## Baseline toxicity

In [None]:
from transformers import XLMRobertaForSequenceClassification, AutoTokenizer
import torch

labels = []
model_name = "Jayveersinh-Raj/PolyGuard"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name)

for text in list(test_df["baseline"]):
  inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
  outputs = model(inputs)[0]
  probabilities = torch.softmax(outputs, dim=1)
  predicted_class = torch.argmax(probabilities).item()
  if predicted_class == 1:
    labels.append(1)
  else:
    labels.append(0)

Downloading (…)lve/main/config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
labels.count(1)

355

## Actual ground truth toxicity

In [None]:
from transformers import XLMRobertaForSequenceClassification, AutoTokenizer
import torch

labels_actual = []
model_name = "Jayveersinh-Raj/PolyGuard"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name)

for text in list(test_df["detoxified"]):
  inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
  outputs = model(inputs)[0]
  probabilities = torch.softmax(outputs, dim=1)
  predicted_class = torch.argmax(probabilities).item()
  if predicted_class == 1:
    labels_actual.append(1)
  else:
    labels_actual.append(0)

In [None]:
labels.count(1)

355