# **INTRO TO AI PROJECT**


# **Package installs, import statements, and misc**

You will also have to add the HuggingFace API Key in the Secrets section of this colab. It is the key icon to the left. 

In [1]:
# !pip install -q kaggle
# !pip install -q huggingface_hub
# !pip install googletrans


In [None]:
import torch
# Report the CUDA version PyTorch was built against and whether a CUDA device is usable.
print("PyTorch built with CUDA version:", torch.version.cuda)
print(torch.cuda.is_available())

In [None]:
import unsloth
from datasets import load_dataset
import os


In [None]:
# List datasets through the Kaggle CLI (presumably a quick check that the CLI and credentials work; TODO confirm).
!kaggle datasets list 

## **Dataset Downloads and Setup**

### **Downloading from Kaggle**

To download a dataset from Kaggle, you first need to find the dataset's identifier. This usually looks like `username/dataset-name`. You can find this on the dataset's page on Kaggle. Once you have it, you use the `!kaggle datasets download` command, and then `unzip` the downloaded file if it's a zip archive.

In [5]:
# Load Twitter Data
# !kaggle datasets download mrmorj/hate-speech-and-offensive-language-dataset
# !unzip hate-speech-and-offensive-language-dataset.zip
# !mv labeled_data.csv twitter_hate_speech_data.csv

In [6]:
# # Load Twitter Data
# !kaggle datasets download mrmorj/hate-speech-and-offensive-language-dataset
# !unzip hate-speech-and-offensive-language-dataset.zip
# !mv labeled_data.csv twitter_hate_speech_data.csv

In [7]:
# # Load Wikipedia Data
# # TO-DO: This data is copied from the competition; we need to check that it's actually the same as the competition data
# !kaggle datasets download julian3833/jigsaw-toxic-comment-classification-challenge
# !unzip jigsaw-toxic-comment-classification-challenge.zip

# target_path = os.path.expanduser('~/wikipedia')

# if not os.path.exists(target_path):
#   !mkdir wikipedia_data
# else:
#   print(f"Directory already found at {target_path}. Replacing files")

# !mv sample_submission.csv wikipedia_data
# !mv test.csv wikipedia_data
# !mv test_labels.csv wikipedia_data
# !mv train.csv wikipedia_data

### **Downloading from Hugging Face**

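For Hugging Face, this notebook uses the `datasets` library: `load_dataset(repo_id)` downloads and caches the dataset repository (e.g. `load_dataset("Anthropic/hh-rlhf")`). If you only need a single file from a repository, you can use the `huggingface_hub` library instead. The primary function for downloading files is `hf_hub_download`. You'll need the `repo_id` (e.g., 'HuggingFaceH4/ultrachat_200k') and the `filename` you want to download from that repository.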

In [8]:
# Anthropic HH-RLHF: the processing functions in this notebook read the 'chosen' and 'rejected' fields of each example.
anthropic_data = load_dataset("Anthropic/hh-rlhf")

In [9]:
# prosocial_dataset = load_dataset("allenai/prosocial-dialog")

In [10]:
# ToxiGen: process_toxigen_data reads 'text', 'toxicity_human', 'toxicity_ai', 'actual_method' and 'target_group'.
toxigen_dataset = load_dataset("toxigen/toxigen-data")

In [11]:
# RealToxicityPrompts: process_real_toxicity_data reads the 'text' and 'toxicity' entries nested under 'prompt'.
real_toxicity_dataset = load_dataset("allenai/real-toxicity-prompts")


In [None]:
# Inspect the datasets that were loaded above (the commented-out ones are currently not printed).
# print(anthropic_data)
# print(prosocial_dataset)
print(toxigen_dataset)
print(real_toxicity_dataset)

## **Data Processing**

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

print(" Processing libraries imported")

In [None]:
def clean_text(text):
    """Return ``text`` as a single-spaced string with URL-like tokens removed.

    Missing values (``None`` or NaN) yield an empty string. Any other input is
    converted with ``str``, stripped of tokens starting with ``http``/``https``
    or ``www``, and has every run of whitespace collapsed to one space.
    """
    if text is None or pd.isna(text):
        return ""
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', str(text), flags=re.MULTILINE)
    # split()/join collapses whitespace runs and leaves no leading/trailing space.
    return ' '.join(without_urls.split())

def extract_metadata(text):
    """Compute simple surface statistics for an already-cleaned string.

    Returns a dict with the character length, whitespace-separated word count,
    flags for the presence of '@', '#' and any uppercase character, and the
    number of '!' and '?' characters.
    """
    tokens = text.split()
    contains_upper = any(ch.isupper() for ch in text)
    stats = {}
    stats['length'] = len(text)
    stats['word_count'] = len(tokens)
    stats['has_mentions'] = '@' in text
    stats['has_hashtags'] = '#' in text
    stats['has_caps'] = contains_upper
    stats['exclamation_count'] = text.count('!')
    stats['question_count'] = text.count('?')
    return stats

print("Help functions defined")

In [None]:
def process_twitter_data():
    """Load the Twitter hate-speech CSV and normalise it into the shared schema.

    Reads ``twitter_hate_speech_data.csv`` from the working directory, cleans
    the ``tweet`` column into ``text``, derives labels from ``class``, tags the
    provenance columns, adds one ``meta_*`` column per ``extract_metadata`` key
    and drops rows whose cleaned text is empty.

    Returns:
        pandas.DataFrame: the processed frame with a fresh 0..n-1 index.
    """
    print("Processing Twitter dataset (Human-Generated, Social Media)...")
    tweets = pd.read_csv('twitter_hate_speech_data.csv')

    tweets['text'] = tweets['tweet'].apply(clean_text)
    # Classes 0 (hate speech) and 1 (offensive) both count as toxic; everything else is safe.
    tweets['binary_label'] = tweets['class'].apply(lambda cls: 1 if cls in [0, 1] else 0)
    tweets['three_class_label'] = tweets['class']
    for column, value in (('source', 'twitter'), ('origin', 'human'), ('domain', 'social_media')):
        tweets[column] = value

    # One dict of surface statistics per row, fanned out into meta_* columns.
    meta_records = tweets['text'].apply(extract_metadata)
    for field in meta_records.iloc[0].keys():
        tweets[f'meta_{field}'] = [record[field] for record in meta_records]

    non_empty = tweets['text'].str.len() > 0
    tweets = tweets[non_empty].reset_index(drop=True)

    class_column = tweets['three_class_label']
    print(f"  Twitter: {len(tweets)} examples")
    print(f"  Hate speech: {(class_column == 0).sum()}")
    print(f"  Offensive: {(class_column == 1).sum()}")
    print(f"  None: {(class_column == 2).sum()}")
    return tweets

print(" Twitter processing function defined")

In [None]:
def process_wikipedia_data():
    """Load the Wikipedia toxic-comment training CSV into the shared schema.

    Reads ``wikipedia_data/train.csv``, cleans ``comment_text`` into ``text``,
    sets ``binary_label`` to the row-wise maximum of the six toxicity columns,
    mirrors each of them as ``label_<name>``, stores their row-wise sum as
    ``toxicity_severity``, tags provenance, adds ``meta_*`` columns and drops
    rows whose cleaned text is empty.

    Returns:
        pandas.DataFrame: the processed frame with a fresh 0..n-1 index.
    """
    print("\nProcessing Wikipedia dataset (Human-Generated, Non-Social Media)...")
    comments = pd.read_csv('wikipedia_data/train.csv')
    comments['text'] = comments['comment_text'].apply(clean_text)

    toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    # A comment is toxic as soon as any one of the fine-grained flags is set.
    comments['binary_label'] = comments[toxic_cols].max(axis=1)
    for flag in toxic_cols:
        comments[f'label_{flag}'] = comments[flag]
    comments['toxicity_severity'] = comments[toxic_cols].sum(axis=1)

    for column, value in (('source', 'wikipedia'), ('origin', 'human'), ('domain', 'non_social_media')):
        comments[column] = value

    meta_records = comments['text'].apply(extract_metadata)
    for field in meta_records.iloc[0].keys():
        comments[f'meta_{field}'] = [record[field] for record in meta_records]

    non_empty = comments['text'].str.len() > 0
    comments = comments[non_empty].reset_index(drop=True)

    toxic_total = comments['binary_label'].sum()
    print(f" Wikipedia: {len(comments)} examples")
    print(f"   Toxic: {toxic_total}")
    print(f"   Non-Toxic: {len(comments) - toxic_total}")
    return comments

print(" Wikipedia processing function defined")

In [None]:
def process_anthropic_data(anthropic_data):
    """Flatten HH-RLHF preference pairs into one labelled row per transcript.

    For every example in the 'train' and 'test' splits, the cleaned 'chosen'
    transcript is emitted with label 0 and the cleaned 'rejected' transcript
    with label 1; transcripts that are empty after cleaning are skipped.

    Args:
        anthropic_data: mapping of split name to an iterable of examples that
            expose 'chosen' and 'rejected' strings.

    Returns:
        pandas.DataFrame: text, binary_label, response_type, provenance
        columns and one ``meta_*`` column per ``extract_metadata`` key.
    """
    print("\nProcessing Anthropic HH-RLHF (Human Feedback on AI)...")
    texts, labels, response_types = [], [], []
    # (field name, label) in the order rows are emitted for each example.
    fields = (('chosen', 0), ('rejected', 1))

    for split_name in ['train', 'test']:
        for example in anthropic_data[split_name]:
            for field, label in fields:
                cleaned = clean_text(example[field])
                if not cleaned:
                    continue
                texts.append(cleaned)
                labels.append(label)
                response_types.append(field)

    anthropic_df = pd.DataFrame({
        'text': texts,
        'binary_label': labels,
        'response_type': response_types,
        'source': 'anthropic',
        'origin': 'machine_generated',
        'domain': 'mixed'
    })

    meta_records = anthropic_df['text'].apply(extract_metadata)
    for key in meta_records.iloc[0].keys():
        anthropic_df[f'meta_{key}'] = [record[key] for record in meta_records]

    rejected_total = anthropic_df['binary_label'].sum()
    print(f"  Anthropic: {len(anthropic_df)} examples")
    print(f"    Rejected (harmful): {rejected_total}")
    print(f"    Chosen (liked): {len(anthropic_df) - rejected_total}")
    return anthropic_df

print("Anthropic processing function defined")

In [None]:
def process_anthropic_data_modified(anthropic_data):
    """Extract the final assistant turn of each 'chosen' HH-RLHF transcript.

    Each transcript is split on blank lines and regrouped into speaker turns
    (a segment that does not start with "Human:" or "Assistant:" is treated as
    a continuation of the previous turn). The last turn is kept, with label 0,
    only when it is an assistant turn longer than 150 characters (measured
    after cleaning, prefix included).

    Fixes over the previous version:
      * transcripts without any speaker-tagged segment are skipped instead of
        raising IndexError;
      * the whitespace left behind after removing the "Assistant:" prefix is
        stripped, so kept texts no longer all begin with a space;
      * an empty result no longer crashes while building the meta_* columns.

    Args:
        anthropic_data: mapping of split name to an iterable of examples that
            expose a 'chosen' string.

    Returns:
        pandas.DataFrame: text, binary_label (always 0), response_type
        (always 'chosen'), provenance columns and ``meta_*`` columns.
    """
    print("\nProcessing Anthropic HH-RLHF (Human Feedback on AI)...")
    assistant_prefix = "Assistant:"
    texts = []

    for split_name in ['train', 'test']:
        for example in anthropic_data[split_name]:
            # The first split element is dropped, exactly as before.
            segments = example['chosen'].split("\n\n")[1:]
            turns = []
            for segment in segments:
                if segment.startswith("Human:") or segment.startswith(assistant_prefix):
                    turns.append(segment)
                elif turns:
                    # Paragraph break inside a turn: glue it back onto that turn.
                    turns[-1] = turns[-1] + '\n\n' + segment
                # else: stray text before the first speaker tag; nothing to attach it to.

            if not turns:
                continue

            chosen = clean_text(turns[-1])
            if chosen.startswith(assistant_prefix) and len(chosen) > 150:
                texts.append(chosen[len(assistant_prefix):].strip())

    anthropic_df = pd.DataFrame({
        'text': texts,
        'binary_label': [0] * len(texts),
        'response_type': ['chosen'] * len(texts),
        'source': 'anthropic',
        'origin': 'machine_generated',
        'domain': 'mixed'
    })

    # Take the key set from a dummy call so the meta_* columns exist even for an empty frame.
    meta_records = [extract_metadata(text) for text in anthropic_df['text']]
    for key in extract_metadata("").keys():
        anthropic_df[f'meta_{key}'] = [record[key] for record in meta_records]

    print(f"  Anthropic: {len(anthropic_df)} examples")
    print(f"    Rejected (harmful): {anthropic_df['binary_label'].sum()}")
    print(f"    Chosen (liked): {len(anthropic_df) - anthropic_df['binary_label'].sum()}")
    return anthropic_df

print("Anthropic processing function defined")

In [None]:
def process_prosocial_data(prosocial_dataset):
    """Turn ProsocialDialog context/response pairs into labelled rows.

    For each example in the 'train' and 'validation' splits the context and
    response are joined with a space and cleaned. A row is labelled 0 when its
    ``safety_label`` equals '__casual__' and 1 otherwise; rows that are empty
    after cleaning are skipped.

    Args:
        prosocial_dataset: mapping of split name to an iterable of examples
            exposing 'context', 'response' and 'safety_label'.

    Returns:
        pandas.DataFrame: text, binary_label, safety_annotation, provenance
        columns and one ``meta_*`` column per ``extract_metadata`` key.
    """
    print("\nProcessing ProsocialDialog (Conversational Safety)...")
    texts, labels, safety_labels = [], [], []

    for split_name in ['train', 'validation']:
        for example in prosocial_dataset[split_name]:
            joined = f"{example['context']} {example['response']}".strip()
            cleaned = clean_text(joined)
            safety = example['safety_label']
            if len(cleaned) == 0:
                continue
            texts.append(cleaned)
            # Only the '__casual__' annotation counts as safe.
            labels.append(0 if safety == '__casual__' else 1)
            safety_labels.append(safety)

    prosocial_df = pd.DataFrame({
        'text': texts,
        'binary_label': labels,
        'safety_annotation': safety_labels,
        'source': 'prosocial',
        'origin': 'mixed',
        'domain': 'conversational'
    })

    meta_records = prosocial_df['text'].apply(extract_metadata)
    for key in meta_records.iloc[0].keys():
        prosocial_df[f'meta_{key}'] = [record[key] for record in meta_records]

    unsafe_total = prosocial_df['binary_label'].sum()
    print(f"  ProsocialDialog: {len(prosocial_df)} examples")
    print(f"    Unsafe: {unsafe_total}")
    print(f"    Safe: {len(prosocial_df) - unsafe_total}")
    return prosocial_df

print(" Prosocial processing function defined")

In [None]:
def process_toxigen_data(toxigen_dataset):
    """Convert the ToxiGen 'train' split into the shared labelled schema.

    The final toxicity score is the mean of the human and AI scores when both
    are present (truthy), otherwise whichever one is present; rows with
    neither are skipped. A row is toxic (label 1) when the final score is
    above 3.0. Missing individual scores are stored as -1.

    The ``origin`` column is derived from ``actual_method``: 'topk' maps to
    'machine_generated', 'cbs' to 'human', and any other value to 'unknown'.
    Previously an unexpected value raised NameError on the first row or
    silently reused the origin of the preceding row.

    Args:
        toxigen_dataset: mapping with a 'train' entry that yields dict-like
            examples.

    Returns:
        pandas.DataFrame: text, binary_label, the three toxicity score
        columns, origin, target_group, source, domain and ``meta_*`` columns.
    """
    print("\nProcessing ToxiGen ...")

    # NOTE(review): 'cbs' is mapped to origin 'human'; confirm this is intended,
    # since `actual_method` looks like a generation-method field.
    origin_by_method = {'topk': 'machine_generated', 'cbs': 'human'}

    texts = []
    labels = []
    target_groups = []
    human_scores = []
    ai_scores = []
    final_scores = []
    source_used = []

    for example in toxigen_dataset['train']:
        text = clean_text(example['text'])

        human_tox = example.get('toxicity_human')
        ai_tox = example.get('toxicity_ai')

        if human_tox and ai_tox:
            final_tox = (human_tox + ai_tox) / 2.0
        elif human_tox:
            final_tox = human_tox
        elif ai_tox:
            final_tox = ai_tox
        else:
            continue

        # Resolve the origin for every row so a value can never leak from the previous iteration.
        source = origin_by_method.get(example.get("actual_method"), 'unknown')

        if len(text) > 0:
            label = 1 if final_tox > 3.0 else 0

            texts.append(text)
            labels.append(label)
            target_groups.append(example.get('target_group', 'unknown'))

            human_scores.append(human_tox if human_tox is not None else -1)
            ai_scores.append(ai_tox if ai_tox is not None else -1)
            final_scores.append(final_tox)
            source_used.append(source)

    toxigen_df = pd.DataFrame({
        'text': texts,
        'binary_label': labels,
        'toxicity_final': final_scores,
        'toxicity_human': human_scores,
        'toxicity_ai': ai_scores,
        'origin': source_used,
        'target_group': target_groups,
        'source': 'toxigen',
        'domain': 'synthetic'
    })
    metadata = toxigen_df['text'].apply(extract_metadata)
    for key in metadata.iloc[0].keys():
        toxigen_df[f'meta_{key}'] = metadata.apply(lambda x: x[key])

    print(f"  ToxiGen processed: {len(toxigen_df)} examples")
    print(f"    Toxic: {toxigen_df['binary_label'].sum()}")
    print(f"    Safe: {len(toxigen_df) - toxigen_df['binary_label'].sum()}")
    print(f"    Human-labeled: {sum(toxigen_df['origin']=='human')}")
    print(f"    AI-labeled: {sum(toxigen_df['origin']=='machine_generated')}")

    return toxigen_df

print("  Updated ToxiGen processing function defined.")


In [None]:
def process_real_toxicity_data(real_toxicity_dataset):
    """Convert RealToxicityPrompts prompts into the shared labelled schema.

    Each example's 'prompt' is expected to be a dict with 'text' and
    'toxicity'; any other value is stringified and treated as unscored.
    Prompts that are empty after cleaning are skipped. A scored prompt is
    toxic (label 1) when its score is above 0.5; an unscored prompt is stored
    with score 0.0 and label 0.

    Args:
        real_toxicity_dataset: mapping with a 'train' entry of examples.

    Returns:
        pandas.DataFrame: text, binary_label, toxicity_score, provenance
        columns and one ``meta_*`` column per ``extract_metadata`` key.
    """
    print("\nProcessing RealToxicityPrompts (API-Scored Prompts)...")
    texts, labels, toxicity_scores = [], [], []

    for example in real_toxicity_dataset['train']:
        prompt = example['prompt']
        if isinstance(prompt, dict):
            raw_text, raw_score = prompt.get('text', ''), prompt.get('toxicity')
        else:
            raw_text, raw_score = str(prompt), None

        cleaned = clean_text(raw_text)
        if len(cleaned) == 0:
            continue

        texts.append(cleaned)
        # None and NaN both mean "no score available".
        if raw_score is None or pd.isna(raw_score):
            toxicity_scores.append(0.0)
            labels.append(0)
            continue
        score = float(raw_score)
        toxicity_scores.append(score)
        labels.append(1 if score > 0.5 else 0)

    real_tox_df = pd.DataFrame({
        'text': texts,
        'binary_label': labels,
        'toxicity_score': toxicity_scores,
        'source': 'real_toxicity_prompts',
        'origin': 'human',
        'domain': 'web_text'
    })

    meta_records = real_tox_df['text'].apply(extract_metadata)
    for key in meta_records.iloc[0].keys():
        real_tox_df[f'meta_{key}'] = [record[key] for record in meta_records]

    toxic_total = real_tox_df['binary_label'].sum()
    print(f"  RealToxicityPrompts: {len(real_tox_df)} examples")
    print(f"    Toxic (>0.5): {toxic_total}")
    print(f"    Non-toxic: {len(real_tox_df) - toxic_total}")
    return real_tox_df

print("  RealToxicity processing function defined")

In [None]:
def create_simple_splits(twitter_df, wiki_df, anthropic_df, prosocial_df, toxigen_df, real_tox_df):
    """Merge the processed sources, shuffle them and write a 70/15/15 split.

    The frames are concatenated in a fixed order (Twitter, Wikipedia,
    Anthropic, ToxiGen, RealToxicityPrompts). ``prosocial_df`` is accepted for
    interface compatibility but is not merged. The merged frame is shuffled
    with ``random_state=42`` and written, together with the three splits, to
    CSV files in the working directory.

    Returns:
        tuple: ``(train_df, val_df, test_df, all_data)``.
    """
    print("\n" + "="*70)
    print("CREATING TRAIN/VAL/TEST SPLITS")
    print("="*70)

    # Twitter + Wikipedia keep only their shared columns. The Anthropic frame is
    # appended with pandas' default (outer) join; the two later inner joins then
    # keep only columns that ToxiGen and RealToxicityPrompts also provide.
    all_data = pd.concat([twitter_df, wiki_df], ignore_index=True, join='inner')
    merge_plan = (
        (anthropic_df, 'outer'),   # not as clear-cut in toxicity vs non-toxicity
        (toxigen_df, 'inner'),
        (real_tox_df, 'inner'),
    )
    for extra_df, join_mode in merge_plan:
        all_data = pd.concat([all_data, extra_df], ignore_index=True, join=join_mode)

    # Deterministic shuffle.
    all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # 70% train / 15% validation / 15% test by position.
    total_rows = len(all_data)
    train_end, val_end = int(0.7 * total_rows), int(0.85 * total_rows)
    train_df = all_data[:train_end]
    val_df = all_data[train_end:val_end]
    test_df = all_data[val_end:]

    print(f"\nTrain: {len(train_df):,} examples")
    print(f"Val: {len(val_df):,} examples")
    print(f"Test: {len(test_df):,} examples")

    print("\n" + "="*70)
    print("SAVING FILES")
    print("="*70)

    outputs = (
        ('train.csv', train_df),
        ('val.csv', val_df),
        ('test.csv', test_df),
        ('all_data.csv', all_data),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index=False)

    print("  Files saved:")
    print("  - train.csv")
    print("  - val.csv")
    print("  - test.csv")
    print("  - all_data.csv")

    return train_df, val_df, test_df, all_data

print("Split function made")

In [None]:
# Run every dataset processor, rebalance the two largest sources, then build and save the splits.
print("="*70)
print("STARTING DATA PROCESSING PIPELINE")
print("="*70)
# prosocial_df = process_prosocial_data(prosocial_dataset)
prosocial_df = pd.DataFrame() # ProsocialDialog is excluded (labels not clear-cut); an empty placeholder is passed instead

# Anthropic: keep only the final chosen assistant responses, capped at 36k rows
anthropic_df = process_anthropic_data_modified(anthropic_data) # Only taking chosen responses
if len(anthropic_df) > 36000:
    anthropic_df = anthropic_df.sample(n=36000, random_state=42)
    
twitter_df = process_twitter_data()

# Wikipedia: downsample the safe (label 0) majority class and keep every toxic row
wiki_df = process_wikipedia_data()
wiki_label_0 = wiki_df[wiki_df['binary_label'] == 0]
wiki_label_1 = wiki_df[wiki_df['binary_label'] == 1]

# Take max 48k from the safe (label 0) portion
if len(wiki_label_0) > 48000:
    # random_state ensures you get the same 48k every time you run it
    wiki_label_0 = wiki_label_0.sample(n=48000, random_state=42) 

# Combine: 48k Safe + ALL Toxic
wiki_df = pd.concat([wiki_label_0, wiki_label_1])

toxigen_df = process_toxigen_data(toxigen_dataset)
real_tox_df = process_real_toxicity_data(real_toxicity_dataset)

# Call the function to create splits and assign to global dataframes
train_df, val_df, test_df, all_data = create_simple_splits(twitter_df, wiki_df, anthropic_df, prosocial_df, toxigen_df, real_tox_df)

In [None]:
# Preview the first rows of the processed ToxiGen frame.
toxigen_df.head()

In [None]:
# Summarise the combined dataset: size, provenance breakdowns and label balance.
print("\n" + "="*70)
print("Final dataset summary")
print("="*70)
print(f"\nTotal examples: {len(all_data):,}")
# Row counts per value of each provenance column.
print(f"\nBy Origin:")
print(all_data['origin'].value_counts())
print(f"\nBy Domain:")
print(all_data['domain'].value_counts())
print(f"\nBy Source:")
print(all_data['source'].value_counts())
print(f"\nLabel Distribution:")
# binary_label is 0/1, so its sum is the toxic count and its mean is the toxic share.
toxic_count = all_data['binary_label'].sum()
print(f"  Toxic: {toxic_count:,} ({all_data['binary_label'].mean()*100:.1f}%)")
print(f"  Safe: {(len(all_data)-toxic_count):,} ({(1-all_data['binary_label'].mean())*100:.1f}%)")
print("\n  data processing done!")

## **Machine Learning Attempts**


### Other Model Approaches

Since our main focus is on LLMs and toxicity, we will put most of our effort towards that. However, for some comparisons, we will provide some other models and how they perform.

We will be providing Naive Bayes, Logistic Regression, and SVM approaches. For this portion, we will use scikit-learn.

SVM (Support Vector Machines) & Logistic Regression typically use TF-IDF (Term Frequency-Inverse Document Frequency) or N-grams.

In [None]:
# Naive Bayes Classifier Implementation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np

print("\n" + "="*70)
print("NAIVE BAYES CLASSIFIER")
print("="*70)

# Seed for the example sampler below, so the printed examples are the same on every run.
NB_EXAMPLE_SEED = 42
# Explicit label order: keeps the confusion matrix 2x2 and the report names valid
# even when a split (or the predictions) happens to contain a single class.
NB_LABELS = [0, 1]

# Prepare the data
X_train = train_df['text'].values
y_train = train_df['binary_label'].values

X_val = val_df['text'].values
y_val = val_df['binary_label'].values

X_test = test_df['text'].values
y_test = test_df['binary_label'].values

# Create TF-IDF vectorizer
print("\nVectorizing text data...")
vectorizer = TfidfVectorizer(
    max_features=10000,  # Limit to top 10k features
    ngram_range=(1, 2),  # Use unigrams and bigrams
    min_df=2,            # Ignore terms that appear in fewer than 2 documents
    max_df=0.95,         # Ignore terms that appear in more than 95% of documents
    strip_accents='unicode',
    lowercase=True
)

# Fit on the training data only; validation and test reuse the fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

print(f"  Training set shape: {X_train_tfidf.shape}")
print(f"  Validation set shape: {X_val_tfidf.shape}")
print(f"  Test set shape: {X_test_tfidf.shape}")

# Train Naive Bayes classifier
print("\nTraining Naive Bayes classifier...")
nb_classifier = MultinomialNB(alpha=1.0)  # alpha is the smoothing parameter
nb_classifier.fit(X_train_tfidf, y_train)
print("  Training complete!")

# Make predictions
print("\nMaking predictions...")
y_train_pred = nb_classifier.predict(X_train_tfidf)
y_val_pred = nb_classifier.predict(X_val_tfidf)
y_test_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate on all splits
print("\n" + "="*70)
print("PERFORMANCE METRICS")
print("="*70)

for split_name, y_true, y_pred in [
    ("TRAIN", y_train, y_train_pred),
    ("VALIDATION", y_val, y_val_pred),
    ("TEST", y_test, y_test_pred)
]:
    print(f"\n{split_name} SET:")
    print(f"  Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"  Precision: {precision_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"  Recall:    {recall_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"  F1 Score:  {f1_score(y_true, y_pred, zero_division=0):.4f}")

    # Confusion matrix (rows = true label, columns = predicted label, in NB_LABELS order)
    cm = confusion_matrix(y_true, y_pred, labels=NB_LABELS)
    print(f"\n  Confusion Matrix:")
    print(f"    TN: {cm[0,0]:,}  FP: {cm[0,1]:,}")
    print(f"    FN: {cm[1,0]:,}  TP: {cm[1,1]:,}")

# Detailed classification report for test set
print("\n" + "="*70)
print("DETAILED TEST SET CLASSIFICATION REPORT")
print("="*70)
print(classification_report(
    y_test, y_test_pred, labels=NB_LABELS, target_names=['Safe', 'Toxic'], zero_division=0
))

# Get prediction probabilities (column 1 = probability of the toxic class)
print("\n" + "="*70)
print("PREDICTION PROBABILITIES")
print("="*70)
train_probs = nb_classifier.predict_proba(X_train_tfidf)[:, 1]
val_probs = nb_classifier.predict_proba(X_val_tfidf)[:, 1]
test_probs = nb_classifier.predict_proba(X_test_tfidf)[:, 1]

# Add predictions to copies of the split frames; the originals stay untouched.
train_df_filtered = train_df.assign(nb_prediction=y_train_pred, nb_prob_toxic=train_probs)
val_df_filtered = val_df.assign(nb_prediction=y_val_pred, nb_prob_toxic=val_probs)
test_df_filtered = test_df.assign(nb_prediction=y_test_pred, nb_prob_toxic=test_probs)

print("Prediction probabilities added to dataframes")

# Show some example predictions
print("\n" + "="*70)
print("EXAMPLE PREDICTIONS (Test Set)")
print("="*70)
# Seeded sampler; the sample size is capped so a tiny test set cannot raise.
n_examples = min(5, len(test_df_filtered))
sample_indices = np.random.default_rng(NB_EXAMPLE_SEED).choice(
    len(test_df_filtered), size=n_examples, replace=False
)
for idx in sample_indices:
    row = test_df_filtered.iloc[idx]
    print(f"\nText: {row['text'][:100]}...")
    print(f"  True Label: {'Toxic' if row['binary_label'] == 1 else 'Safe'}")
    print(f"  Predicted: {'Toxic' if row['nb_prediction'] == 1 else 'Safe'}")
    print(f"  Probability (Toxic): {row['nb_prob_toxic']:.4f}")

# Save the split frames with the Naive Bayes predictions attached
print("\n" + "="*70)
print("SAVING FILTERED DATASETS")
print("="*70)
train_df_filtered.to_csv('train_nb_filtered.csv', index=False)
val_df_filtered.to_csv('val_nb_filtered.csv', index=False)
test_df_filtered.to_csv('test_nb_filtered.csv', index=False)

print("  Files saved:")
print("  - train_nb_filtered.csv")
print("  - val_nb_filtered.csv")
print("  - test_nb_filtered.csv")

print("\n" + "="*70)
print("NAIVE BAYES PIPELINE COMPLETE!")
print("="*70)

### Finetuning Approaches for toxicity identification

We will finetune a small transformer for toxicity identification.

In [27]:
# https://huggingface.co/docs/transformers/en/training#fine-tune-a-pretrained-model

In [None]:
import torch

# Confirm that the GPU is detected
print(torch.cuda.is_available())

# Fail fast when no CUDA device is available; the device queries below need one.
assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

In [None]:
!pip install transformers
!pip install evaluate
!pip install datasets



print('Success!')

import os


# Create a directory to save pretrained models
pretrained_models_dir = './pretrained_models_dir'
if not os.path.isdir(pretrained_models_dir):
  os.mkdir(pretrained_models_dir)
print('Model directory created')

In [30]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, # Switched to SequenceClassification
    TrainingArguments, 
    Trainer
)

# Configuration
# Fall back to a default model directory when the setup cell that defines it was not run.
pretrained_models_dir = globals().get('pretrained_models_dir', "./models")

model_name_or_path = "answerdotai/ModernBERT-base"
cache_dir = os.path.join(pretrained_models_dir, "ModernBERT-base")

# Load the tokenizer that matches the checkpoint, caching downloads under cache_dir
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
)


In [31]:
def create_new_model():
    """Load a fresh two-label sequence-classification model and place it on the device.

    The checkpoint named by the module-level ``model_name_or_path`` is loaded
    (cached under ``cache_dir``) and moved to CUDA when available, else CPU.

    Returns:
        The loaded model, already moved to the selected device.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
        num_labels=2,          # binary classification head
        torch_dtype="auto",
    )
    return classifier.to(torch.device(device_name))

def tokenize_function(examples):
    """Tokenize the 'text' column of a batch, padded/truncated to 512 tokens."""
    batch_text = examples["text"]  # our text column is named 'text', not 'sentence'
    return tokenizer(
        batch_text,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

def prepare_datasets(df, max_samples=20000, seed=97520349):
    """Convert a DataFrame into a shuffled, tokenized dataset in torch format.

    Args:
        df: frame with at least a 'text' column; 'binary_label' is renamed to
            'label' when present.
        max_samples: maximum number of rows kept after shuffling. The default
            (20000) matches the previous hard-coded cap; pass ``None`` to keep
            every row (e.g. to evaluate on a full test set).
        seed: shuffle seed; the default matches the previous hard-coded value.

    Returns:
        The tokenized dataset restricted to the text, tokenizer outputs and
        label columns (plus any column whose name starts with '__').
    """
    dataset_raw = Dataset.from_pandas(df)

    total_rows = len(dataset_raw)
    n_keep = total_rows if max_samples is None else min(max_samples, total_rows)

    tokenized_dataset = dataset_raw.shuffle(seed=seed).select(
        range(n_keep)
    ).map(tokenize_function, batched=True)

    keep_columns = ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'binary_label']

    # Columns starting with '__' are left in place, exactly as before.
    columns_to_remove = [
        col for col in tokenized_dataset.column_names
        if col not in keep_columns and not col.startswith('__')
    ]

    tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

    if "binary_label" in tokenized_dataset.column_names:
        tokenized_dataset = tokenized_dataset.rename_column("binary_label", "label")

    tokenized_dataset.set_format("torch")
    return tokenized_dataset




In [32]:
import numpy as np
import evaluate

# Accuracy metric shared by every evaluation call.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest logit.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [33]:
import timeit
import gc
from peft import LoraConfig, get_peft_model, TaskType
# Captured when this cell runs, i.e. when train_model is defined, not when training starts.
start_time = timeit.default_timer()

def train_model(model, train_dataset, val_dataset):
    """Fine-tune ``model`` with LoRA adapters and report validation accuracy.

    The model is wrapped with a LoRA adapter (rank 32 on all linear layers),
    trained for 4 epochs with evaluation and checkpointing every 625 steps,
    and the best checkpoint is restored at the end. The final evaluation is
    run once and reused for both the printed metrics and the returned
    accuracy (it previously ran twice).

    Args:
        model: sequence-classification model to adapt.
        train_dataset: tokenized training dataset.
        val_dataset: tokenized validation dataset.

    Returns:
        tuple: ``(peft_model, trainer, results, mean, std)`` where ``results``
        is a one-element array holding the validation accuracy, so ``mean`` is
        that accuracy and ``std`` is 0.
    """
    run_seed = 123456

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=32,
        lora_alpha=32,
        lora_dropout=0.2,
        target_modules="all-linear",
    )
    peft_model = get_peft_model(model, peft_config)

    # output_dir and seed are passed directly instead of being patched onto the
    # arguments afterwards; values derived from output_dir therefore now follow
    # the real checkpoint directory rather than the unused "./results" default.
    training_args = TrainingArguments(
        output_dir=f"test_trainer_{run_seed}",
        seed=run_seed,
        per_device_train_batch_size=16,      # original note: crucial for 8GB VRAM
        gradient_accumulation_steps=1,       # no accumulation
        bf16=True,                           # original note: saves ~40% memory
        dataloader_num_workers=4,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=625,
        save_steps=625,
        learning_rate=1e-4,
        num_train_epochs=4,
        load_best_model_at_end=True,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,  # tokenized training data
        eval_dataset=val_dataset,     # tokenized validation data
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Evaluate once and reuse the metrics.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

    results = np.array([eval_metrics['eval_accuracy']])
    mean = np.mean(results)
    std = np.std(results)
    return peft_model, trainer, results, mean, std



In [None]:
# Number of training rows per origin value.
print(train_df['origin'].value_counts())

In [None]:
# Label balance of the human-origin training rows. The filtered frame is what
# gets reported; previously the unfiltered train_df counts were printed.
filtered_train_df = train_df[train_df['origin'] == 'human']
print(filtered_train_df['binary_label'].value_counts())

In [None]:
# Time this cell only; the global `start_time` is set when train_model is
# defined, so using it would include everything run since then.
cell_start_time = timeit.default_timer()

base_model = create_new_model()
base_train_dataset = prepare_datasets(train_df)
base_val_dataset = prepare_datasets(val_df)
print("Tokenized datasets prepared.")
peft_model, base_trainer, results, mean, std = train_model(base_model, base_train_dataset, base_val_dataset)
print(f"Accuracy on toxicity dev set: {mean} +/- {std}")
elapsed_time = timeit.default_timer() - cell_start_time
print(f"Time elapsed: {elapsed_time} seconds")

# Merge the LoRA adapter into the underlying weights so a standalone model can be saved.
merged_model = peft_model.merge_and_unload()

# NOTE(review): machine-specific absolute path. It must stay in sync with the
# path the evaluation cells load from; consider deriving both from
# `pretrained_models_dir`.
save_path = r"C:\Users\jaydo\OneDrive\Documents\CS\Fall25CS4824\pretrained_models_dir\base_model"

# Save the full merged model and the tokenizer
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Full merged model saved to: {save_path}")

# Drop every reference to the model, including the merged one, so the memory can be reclaimed.
del base_model
del peft_model
del base_trainer
del merged_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Time this cell only; the global `start_time` is set when train_model is
# defined, so using it would include everything run since then.
cell_start_time = timeit.default_timer()

human_toxicity_model = create_new_model()
# Restrict both splits to human-origin rows
human_train_df = train_df[train_df['origin'] == 'human']
human_val_df = val_df[val_df['origin'] == 'human']

human_train_dataset = prepare_datasets(human_train_df)
human_val_dataset = prepare_datasets(human_val_df)
print("Tokenized datasets prepared.")
human_peft_model, base_trainer, results, mean, std = train_model(human_toxicity_model, human_train_dataset, human_val_dataset)
print(f"Accuracy on toxicity dev set: {mean} +/- {std}")
elapsed_time = timeit.default_timer() - cell_start_time
print(f"Time elapsed: {elapsed_time} seconds")

# Merge the LoRA adapter into the underlying weights so a standalone model can be saved.
merged_model = human_peft_model.merge_and_unload()

# NOTE(review): machine-specific absolute path. It must stay in sync with the
# path the evaluation cells load from; consider deriving both from
# `pretrained_models_dir`.
save_path = r"C:\Users\jaydo\OneDrive\Documents\CS\Fall25CS4824\pretrained_models_dir\human_model"

# Save the full merged model and the tokenizer
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Full merged model saved to: {save_path}")

# Drop every reference to the model, including the merged one, so the memory can be reclaimed.
del human_toxicity_model
del human_peft_model
del base_trainer
del merged_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Time this cell only; the global `start_time` is set when train_model is
# defined, so using it would include everything run since then.
cell_start_time = timeit.default_timer()

machine_toxicity_model = create_new_model()
# Restrict both splits to machine-generated rows
machine_train_df = train_df[train_df['origin'] == 'machine_generated']
machine_val_df = val_df[val_df['origin'] == 'machine_generated']

machine_train_dataset = prepare_datasets(machine_train_df)
machine_val_dataset = prepare_datasets(machine_val_df)
print("Tokenized datasets prepared.")
machine_peft_model, base_trainer, results, mean, std = train_model(machine_toxicity_model, machine_train_dataset, machine_val_dataset)
print(f"Accuracy on toxicity dev set: {mean} +/- {std}")
elapsed_time = timeit.default_timer() - cell_start_time
print(f"Time elapsed: {elapsed_time} seconds")


# Merge the LoRA adapter into the underlying weights so a standalone model can be saved.
merged_model = machine_peft_model.merge_and_unload()

# NOTE(review): machine-specific absolute path. It must stay in sync with the
# path the evaluation cells load from; consider deriving both from
# `pretrained_models_dir`.
save_path = r"C:\Users\jaydo\OneDrive\Documents\CS\Fall25CS4824\pretrained_models_dir\machine_model"

# Save the full merged model and the tokenizer
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Full merged model saved to: {save_path}")

# Drop every reference to the model, including the merged one, so the memory can be reclaimed.
del machine_toxicity_model
del machine_peft_model
del base_trainer
del merged_model
gc.collect()
torch.cuda.empty_cache()


In [39]:
import numpy as np
def evaluate_performance(preds, dataset, dataset_name):
    """Print accuracy and a classification report for one evaluation subset.

    Args:
        preds: predicted class ids, one per dataset row.
        dataset: tokenized dataset whose 'label' column holds the true classes.
        dataset_name: heading used in the printed output.
    """
    # Imported locally so the function also works on a fresh kernel where the
    # Naive Bayes cell (the only other place these names are imported) was skipped.
    from sklearn.metrics import accuracy_score, classification_report

    y_true = dataset['label']

    acc = accuracy_score(y_true, preds)

    print(f"--- Performance on {dataset_name} ---")
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, preds))
    print("-" * 30 + "\n")


def run_evals(model_path, test_df):
    """Evaluate a saved classifier on the full test set and on each origin subset.

    Loads the sequence-classification model stored at ``model_path``. Then, for
    all of ``test_df``, for the rows with ``origin == 'human'`` and for the rows
    with ``origin == 'machine_generated'``, it tokenizes the rows with
    ``prepare_datasets``, predicts with a default ``Trainer``, takes the argmax
    over axis 1 of the predictions and prints the metrics via
    ``evaluate_performance``.

    Args:
        model_path: Location accepted by
            ``AutoModelForSequenceClassification.from_pretrained``.
        test_df: DataFrame with an ``origin`` column, plus whatever
            ``prepare_datasets`` requires.

    Returns:
        None. Results are printed.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    eval_trainer = None
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()

        eval_trainer = Trainer(model=model)

        # (report title, rows to evaluate) -- same three views as before,
        # handled by one loop instead of three copies of the same code.
        subsets = (
            ("Full Test Set", test_df),
            ("Human Only", test_df[test_df['origin'] == 'human']),
            ("Machine Only", test_df[test_df['origin'] == 'machine_generated']),
        )
        for subset_name, subset_df in subsets:
            # An origin that is absent from test_df gives an empty frame;
            # report that instead of failing inside predict/argmax.
            if len(subset_df) == 0:
                print(f"--- Skipping {subset_name}: no rows ---\n")
                continue
            tokenized_dataset = prepare_datasets(subset_df)
            raw_preds = eval_trainer.predict(tokenized_dataset)
            final_preds = np.argmax(raw_preds.predictions, axis=1)
            evaluate_performance(final_preds, tokenized_dataset, subset_name)
    finally:
        # Always drop the local references and release cached GPU memory,
        # including when loading onto the device or predicting raised.
        del model
        del eval_trainer
        gc.collect()
        torch.cuda.empty_cache()

In [40]:
# The three checkpoints sit side by side in one directory; only the final
# folder name differs.
# NOTE(review): absolute, machine-specific location.
MODELS_DIR = r"C:\Users\jaydo\OneDrive\Documents\CS\Fall25CS4824\pretrained_models_dir"

base_model = rf"{MODELS_DIR}\base_model"
human_path = rf"{MODELS_DIR}\human_model"
machine_path = rf"{MODELS_DIR}\machine_model"


In [None]:
# Report metrics for the checkpoint at `base_model` on the full, human-only and machine-only test rows.
run_evals(base_model, test_df)

In [None]:
# Report metrics for the checkpoint at `human_path` on the full, human-only and machine-only test rows.
run_evals(human_path, test_df)

In [None]:
# Report metrics for the checkpoint at `machine_path` on the full, human-only and machine-only test rows.
run_evals(machine_path, test_df)

### Unsloth Fine Tuning

This section is an experimental attempt to get fine-tuning with Unsloth working for the toxicity task.


In [None]:
# !pip install "unsloth[local]"
# !pip install trl peft

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import torch
print("Unsloth ready!")

In [None]:
def format_text_for_unsloth(text, label):
    """Render one labelled example as an instruction-style prompt.

    Args:
        text: The comment to classify; placed under ``### Input:``.
        label: Binary label. A value equal to 1 produces ``"toxic"``;
            anything else produces ``"non-toxic"``.

    Returns:
        A string with ``### Instruction:``, ``### Input:`` and
        ``### Response:`` sections, ending with the answer word.
    """
    if label == 1:
        verdict = "toxic"
    else:
        verdict = "non-toxic"

    # Adjacent literals are concatenated; the layout is one section per group
    # with a blank line between sections and no trailing newline.
    return (
        "### Instruction:\n"
        "Classify this text as toxic or non-toxic.\n"
        "\n"
        "### Input:\n"
        f"{text}\n"
        "\n"
        "### Response:\n"
        f"{verdict}"
    )
# Upper bounds on the rows drawn for the quick Unsloth experiment.
UNSLOTH_TRAIN_SAMPLE_SIZE = 2000  # Start small
UNSLOTH_VAL_SAMPLE_SIZE = 500

print("Formatting your data for Unsloth...")
# Adds a 'formatted' prompt column to both frames (in place).
train_df.loc[:, 'formatted'] = train_df.apply(lambda r: format_text_for_unsloth(r['text'], r['binary_label']), axis=1)
val_df.loc[:, 'formatted'] = val_df.apply(lambda r: format_text_for_unsloth(r['text'], r['binary_label']), axis=1)

# Cap the request at the frame size: sampling without replacement raises
# ValueError when n is larger than the number of rows.
train_sample = train_df.sample(n=min(UNSLOTH_TRAIN_SAMPLE_SIZE, len(train_df)), random_state=42)
val_sample = val_df.sample(n=min(UNSLOTH_VAL_SAMPLE_SIZE, len(val_df)), random_state=42)

print(f"Ready to train with {len(train_sample)} examples")


In [None]:
# Load the 4-bit TinyLlama checkpoint through Unsloth.
# NOTE(review): this rebinds the globals `model` and `tokenizer`. The cells
# that save the merged classifiers also call `tokenizer.save_pretrained`, so
# re-running them after this cell would pick up this tokenizer instead.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-bnb-4bit",  # Small model for Colab
    max_seq_length=512,
    load_in_4bit=True,
)

# Add LoRA adapters (makes training faster)
# Adapters are attached to the q_proj / k_proj / v_proj / o_proj modules only.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Reuse the end-of-sequence token for padding.
# NOTE(review): the later training cell uses DataCollatorForLanguageModeling;
# presumably it masks pad positions in the labels, which would then include
# real EOS tokens too -- confirm whether that matters here.
tokenizer.pad_token = tokenizer.eos_token
print("Model loaded!")


In [None]:
# Maximum number of characters of the raw comment that go into a prompt.
MAX_INPUT_CHARS = 400

def preprocess_dataset(dataset, max_input_chars=MAX_INPUT_CHARS):
    """Rewrite each example's ``text`` field as an instruction-style prompt.

    The raw comment is cut to ``max_input_chars`` characters before it is
    placed in the prompt. ``MAX_INPUT_CHARS`` used to be defined next to this
    function but never applied, unlike in ``create_tokenized_dataset``, so long
    comments went into the prompt untruncated. With a token limit applied
    downstream, that presumably risks cutting off the ``### Response:``
    section that carries the label.

    Args:
        dataset: Object with a ``map(fn, num_proc=...)`` method whose examples
            have ``text`` and ``binary_label`` fields.
        max_input_chars: Character cap applied to the raw text. Defaults to
            ``MAX_INPUT_CHARS``.

    Returns:
        Whatever ``dataset.map`` returns, with ``text`` replaced by the prompt.
    """
    def format_example(example):

        instruction = "Classify this text as toxic or non-toxic."
        text = example["text"][:max_input_chars]
        answer = "toxic" if example["binary_label"] == 1 else "non-toxic"

        example["text"] = f"""### Instruction:
{instruction}

### Input:
{text}

### Response:
{answer}"""
        return example

    return dataset.map(format_example, num_proc=1)



# Convert each sample to a Dataset holding only the two needed columns, then
# turn its `text` field into the instruction prompt.
train_dataset = preprocess_dataset(
    Dataset.from_pandas(train_sample[['text', 'binary_label']])
)
val_dataset = preprocess_dataset(
    Dataset.from_pandas(val_sample[['text', 'binary_label']])
)


In [None]:
# Hyper-parameters for the SFT run, built first so the trainer call stays short.
sft_training_args = TrainingArguments(
    output_dir="toxicity_unsloth",
    num_train_epochs=1,
    per_device_train_batch_size=2,  # Reduced
    gradient_accumulation_steps=2,   # Reduced
    warmup_steps=50,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    logging_steps=25,
    optim="adamw_8bit",
    save_strategy="epoch",
    eval_strategy="steps",
    eval_steps=100,
    report_to="none",
    dataloader_num_workers=0,  # Disable dataloader workers
    dataloader_pin_memory=False,  # Disable pin memory
)

# The trainer reads the prompt from the `text` field of each example.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    dataset_num_proc=1,  # Disable during tokenization
    args=sft_training_args,
)


In [None]:
# Build the prompt and tokenize every example up front, so the trainer
# receives ready-made token ids.
def create_tokenized_dataset(dataset, tokenizer):
    """Turn raw examples into tokenized instruction prompts.

    For every example the ``text`` field is cut to 400 characters, wrapped in
    the Instruction / Input / Response prompt with the answer derived from
    ``binary_label`` (1 -> "toxic", otherwise "non-toxic"), and passed to
    ``tokenizer`` with truncation at 512 tokens and no padding.

    Args:
        dataset: Object exposing ``column_names`` and ``map``; examples need
            ``text`` and ``binary_label`` fields.
        tokenizer: Callable applied to the prompt string.

    Returns:
        The result of ``dataset.map`` with the original columns removed.
    """
    char_limit = 400
    prompt_template = (
        "### Instruction:\n"
        "Classify this text as toxic or non-toxic.\n"
        "\n"
        "### Input:\n"
        "{text}\n"
        "\n"
        "### Response:\n"
        "{answer}"
    )

    def _encode(row):
        snippet = row["text"][:char_limit]
        if row["binary_label"] == 1:
            verdict = "toxic"
        else:
            verdict = "non-toxic"
        prompt = prompt_template.format(text=snippet, answer=verdict)
        return tokenizer(prompt, truncation=True, max_length=512, padding=False)

    # Single process on purpose: avoids multiprocessing issues during map.
    return dataset.map(_encode, remove_columns=dataset.column_names, num_proc=1)

# Build Datasets from the two samples, keeping only the prompt inputs.
prompt_columns = ['text', 'binary_label']
raw_train_dataset = Dataset.from_pandas(train_sample[prompt_columns])
raw_val_dataset = Dataset.from_pandas(val_sample[prompt_columns])

print("Tokenizing datasets manually...")
# Train first, then validation -- same order as the two separate calls.
train_dataset, val_dataset = (
    create_tokenized_dataset(raw_dataset, tokenizer)
    for raw_dataset in (raw_train_dataset, raw_val_dataset)
)
print("Done!")

# Now use regular Trainer, not SFTTrainer
from transformers import Trainer, DataCollatorForLanguageModeling

# Hyper-parameters for the run on the pre-tokenized datasets.
manual_training_args = TrainingArguments(
    output_dir="toxicity_unsloth",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    logging_steps=25,
    optim="adamw_8bit",
    save_strategy="epoch",
    eval_strategy="steps",
    eval_steps=100,
    report_to="none",
    dataloader_num_workers=0,
)

# mlm=False selects the causal (non-masked) language-modelling collator mode.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=causal_lm_collator,
    args=manual_training_args,
)

# Start training
trainer.train()

In [None]:
# fast func for testing
FastLanguageModel.for_inference(model)

def check_toxicity(text, max_input_chars=400):
    """Classify one piece of text with the fine-tuned model.

    Builds the same Instruction / Input / Response prompt used for training,
    generates up to 10 new tokens and maps the completion to a label.

    Args:
        text: The text to classify.
        max_input_chars: Character cap applied to ``text`` before it is placed
            in the prompt. The default of 400 matches the cap used in
            ``create_tokenized_dataset``, so inference sees inputs shaped like
            the training ones and the ``### Response:`` suffix is less likely
            to be lost to the 512-token truncation.

    Returns:
        ``"toxic"`` if the completion contains "toxic" but not "non-toxic",
        otherwise ``"non-toxic"``.
    """
    snippet = str(text)[:max_input_chars]
    prompt = f"""### Instruction:
Classify this text as toxic or non-toxic.

### Input:
{snippet}

### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Explicit greedy decoding: a temperature only has an effect when sampling,
    # and a classifier-style check should be deterministic.
    outputs = model.generate(**inputs, max_new_tokens=10, do_sample=False)

    # Decode only the tokens after the prompt, so markers inside the input
    # text cannot be mistaken for the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    response = completion.strip().lower()
    if "toxic" in response and "non-toxic" not in response:
        return "toxic"
    return "non-toxic"


# Smoke test: a few hand-written sentences, including one ("I hate that i'm
# so good at this") where the word "hate" appears in a non-hostile sentence.
test_texts = [
    "You did a great job!",
    "I hate you so much",
    "This is interesting",
    "You're an idiot",
    "I hate that i'm so good at this",
]

print("\nTesting the model:")
for text in test_texts:
    result = check_toxicity(text)
    # The arrow was stored as mis-decoded bytes ("â†’"); restored to "→".
    print(f"Text: '{text}' → Prediction: {result}")


In [None]:
from sklearn.metrics import accuracy_score

# Score the generative classifier on a fixed random draw of 100 test rows.
test_sample = test_df.sample(n=100, random_state=42)
true_labels = test_sample['binary_label'].tolist()

print("\nEvaluating on test set...")
predictions = []
for text in test_sample['text']:
    # Encode the string verdict as 1 (toxic) / 0 (non-toxic).
    predictions.append(int(check_toxicity(text) == "toxic"))

accuracy = accuracy_score(true_labels, predictions)
print(f"\nUnsloth Model Accuracy: {accuracy:.4f}")

# Compare with BERT results
# NOTE(review): the BERT figure below is a hard-coded literal, not a value
# computed in this cell.
print(f"Your BERT Accuracy: ~0.85 (from your code)")
print(f"Unsloth Accuracy: {accuracy:.4f}")

In [None]:

# Persist the Unsloth model and tokenizer to Google Drive.
# NOTE(review): `google.colab` is only importable inside Colab, while other
# cells in this notebook write to a local Windows path -- confirm which
# environment this cell is meant for.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Define the path in Google Drive
gdrive_path = '/content/drive/MyDrive/Colab Notebooks/my_toxicity_detector'

# Create the directory if it doesn't exist
import os
os.makedirs(gdrive_path, exist_ok=True)

# Save the model and tokenizer into the Drive folder.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this
# presumably writes only the adapter weights, not a merged model -- confirm.
model.save_pretrained(gdrive_path)
tokenizer.save_pretrained(gdrive_path)
print(f"Model saved to '{gdrive_path}'!")

### Multi-shot and prompting approaches for toxicity identification

This section uses prompting alone to identify toxicity.


In [None]:
# !pip install cerebras-cloud-sdk
# https://inference-docs.cerebras.ai/models/overview
# SECURITY: a literal Cerebras API key used to be pasted in this comment. It
# has been removed; treat that key as leaked and rotate it. Provide the key
# only through the CEREBRAS_API_KEY environment variable read below.
import os
from cerebras.cloud.sdk import Cerebras

# The key comes from the environment; os.environ.get yields None when unset.
client = Cerebras(
    api_key=os.environ.get("CEREBRAS_API_KEY")
)

# Connectivity check: one non-streaming chat completion with a fixed question.
completion = client.chat.completions.create(
    messages=[{"role":"user","content":"Why is fast inference important?"}],
    model="llama-3.3-70b",
    max_completion_tokens=1024,
    temperature=0.2,
    top_p=1,
    stream=False
)

print(completion.choices[0].message.content)