In [1]:
import pandas as pd
from google.colab import drive
from IPython.display import display

# Mount Google Drive so files under MyDrive are reachable from the Colab runtime.
drive.mount('/content/drive')

# Read the source CSV into a DataFrame.
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/data/random_selected_summary.csv'
df = pd.read_csv(csv_file_path)

Mounted at /content/drive


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub checkpoint used for toxicity scoring; the tokenizer and the
# classification model are both loaded from this same identifier.
model_name = "JungleLee/bert-toxic-comment-classification"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

def predict_toxicity(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Text to score. Input longer than 512 tokens is truncated.

    Returns:
        tuple: ``(predicted_class, probabilities)`` where ``predicted_class`` is
        the integer index of the most probable class and ``probabilities`` is a
        NumPy array holding the softmax probability of each class.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Put the inputs on the model's device so the function also works after
    # the model has been moved to a GPU; on CPU this changes nothing.
    inputs = inputs.to(model.device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    # .cpu() is a no-op for CPU tensors and is required before .numpy() when
    # the tensor lives on a GPU.
    return predicted_class, probabilities.squeeze().cpu().numpy()

# Number of leading rows to score; every row triggers two model calls
# (one for the article text, one for its summary).
N_ARTICLES = 10

# Display names indexed by predicted class id.
# NOTE(review): assumes class 0 = non-toxic and class 1 = toxic — confirm
# against model.config.id2label.
toxicity_labels = ['Non-toxic', 'Toxic']

# Collect one record per article, then build the DataFrame once at the end.
results = []
for _, article in df.head(N_ARTICLES).iterrows():
    # Score the cleaned article text and its GPT-4 summary with the same classifier.
    article_predicted_class, article_class_probabilities = predict_toxicity(article['cleaned_article'])
    summary_predicted_class, summary_class_probabilities = predict_toxicity(article['gpt4_summary'])

    results.append({
        "Article Title": article['title'],
        "Article Predicted Class": toxicity_labels[article_predicted_class],
        "Article Class Probabilities": article_class_probabilities,
        "GPT Predicted Class": toxicity_labels[summary_predicted_class],
        "GPT Class Probabilities": summary_class_probabilities
    })

# Bind the results to their own name instead of overwriting `df`: the loop above
# reads its input from `df`, so rebinding it would make this cell fail on re-run.
toxicity_df = pd.DataFrame(results)

# Display the results in tabular format
display(toxicity_df)

tokenizer_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Unnamed: 0,Article Title,Article Predicted Class,Article Class Probabilities,GPT Predicted Class,GPT Class Probabilities
0,Poland abortion: Polish women look for help in...,Toxic,"[0.4129452, 0.5870547]",Non-toxic,"[0.99609876, 0.003901225]"
1,Hong Kong Cardinal advocates for promoting a c...,Non-toxic,"[0.9947699, 0.0052301236]",Non-toxic,"[0.99953675, 0.00046323016]"
2,Sex Education in the Philippines - The Borgen ...,Non-toxic,"[0.99851114, 0.0014888949]",Non-toxic,"[0.9980215, 0.001978482]"
3,"When a fetal scan showed problems, she fled Id...",Non-toxic,"[0.9523109, 0.047689006]",Non-toxic,"[0.9976618, 0.0023382176]"
4,Prioritizing the Neglected Areas of SRHR in Ca...,Non-toxic,"[0.9605853, 0.03941465]",Non-toxic,"[0.9985434, 0.0014566102]"
5,Abortion policy activism heats up for Roe v. W...,Non-toxic,"[0.9981713, 0.0018287124]",Non-toxic,"[0.9995215, 0.00047850635]"
6,Barbara Kruger’s 'Your Body is a Battleground'...,Non-toxic,"[0.8496231, 0.15037692]",Non-toxic,"[0.9977417, 0.0022583308]"
7,Abortion 'practically banned in Turkey',Non-toxic,"[0.99058366, 0.009416392]",Non-toxic,"[0.99775416, 0.0022458246]"
8,"Mitch McConnell, Senate Republicans Vote to St...",Non-toxic,"[0.9971726, 0.0028273745]",Non-toxic,"[0.9990159, 0.0009840126]"
9,DeSantis signs 15-week abortion ban into law d...,Non-toxic,"[0.99682814, 0.003171805]",Non-toxic,"[0.9983967, 0.0016033273]"
