In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import Dataset as HFDataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import os 


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

# Step 1: Load and Concatenate Data
df_facebook = pd.read_csv('usable/balanced_facebook.csv')
df_reddit = pd.read_csv('usable/balanced_reddit.csv')
df_twitter = pd.read_csv('usable/balanced_twitter.csv')
df_youtube = pd.read_csv('usable/balanced_youtube.csv')

df_combined = pd.concat([df_facebook, df_reddit, df_twitter, df_youtube])

# Ensure labels are integers
df_combined['label'] = df_combined['label'].astype(int)

# Save combined DataFrame to a CSV file for loading with the Hugging Face Dataset
df_combined.to_csv('cleaned_combined.csv', index=False)

# Load the combined dataset using Hugging Face Dataset
dataset = load_dataset('csv', data_files={'train': 'cleaned_combined.csv'})

# Step 2: Tokenization using the datasets library
model_name = "Hate-speech-CNERG/dehatebert-mono-english"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Convert pandas DataFrame to Hugging Face Dataset
hf_dataset = HFDataset.from_pandas(df_combined)

# Define the tokenization function with added checks and replacing None values
def tokenize_function(examples):
    if 'text' not in examples:
        raise ValueError("Expected 'text' field in examples but not found.")
    if not isinstance(examples['text'], list):
        raise ValueError(f"Expected 'text' field to be a list but got {type(examples['text'])}")
    processed_texts = [text if isinstance(text, str) else "" for text in examples['text']]
    return tokenizer(processed_texts, padding="max_length", truncation=True)
# Apply the tokenization function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset into train and test sets
train_test_split = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Load the pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
)




Map:   0%|          | 0/48362 [00:00<?, ? examples/s]



In [3]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the Model
trainer.train()

# Predict on the Test Set
predictions_test = trainer.predict(test_dataset)
preds_test = predictions_test.predictions.argmax(-1)

# Calculate Test Accuracy and Classification Report
accuracy_test = accuracy_score(test_dataset['label'], preds_test)
report_test = classification_report(test_dataset['label'], preds_test, output_dict=True)

# Predict on the Train Set
predictions_train = trainer.predict(train_dataset)
preds_train = predictions_train.predictions.argmax(-1)

# Calculate Train Accuracy and Classification Report
accuracy_train = accuracy_score(train_dataset['label'], preds_train)
report_train = classification_report(train_dataset['label'], preds_train, output_dict=True)

# Display the Results
results = {
    'train': {
        'accuracy': accuracy_train,
        'precision': report_train['weighted avg']['precision'],
        'recall': report_train['weighted avg']['recall'],
        'f1-score': report_train['weighted avg']['f1-score']
    },
    'test': {
        'accuracy': accuracy_test,
        'precision': report_test['weighted avg']['precision'],
        'recall': report_test['weighted avg']['recall'],
        'f1-score': report_test['weighted avg']['f1-score']
    }
}
results_df = pd.DataFrame(results) 
print(results_df)

  0%|          | 0/7257 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Tokenize the text data
model_name = "Hate-speech-CNERG/dehatebert-mono-english"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Convert pandas DataFrame to Hugging Face Dataset
hf_dataset = HFDataset.from_pandas(df_combined)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, return_tensors="pt")

tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)

# Ensure all elements in 'text' are strings
df_combined['text'] = df_combined['text'].astype(str)

# Progress bar for tokenization
tqdm.pandas(desc="Tokenizing")
df_combined['input_ids'] = df_combined['text'].progress_apply(lambda x: tokenize_function({'text': x})['input_ids'])

# Split data into training and test sets
X = df_combined['text'].tolist()
y = df_combined['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Tokenizing: 100%|██████████| 48362/48362 [00:42<00:00, 1146.72it/s]


In [15]:
# Create a custom Dataset class
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.texts[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create DataLoader for training and test sets
train_dataset = HateSpeechDataset(X_train, y_train)
test_dataset = HateSpeechDataset(X_test, y_test)

# Load the pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [16]:
# Train the model
trainer.train()
# Predict on the test set (this will show the progress bar)
predictions_test = trainer.predict(test_dataset)
preds_test = predictions_test.predictions.argmax(-1)

# Calculate test accuracy and classification report
accuracy_test = accuracy_score(y_test, preds_test)
report_test = classification_report(y_test, preds_test, output_dict=True)

# Predict on the train set (this will show the progress bar)
predictions_train = trainer.predict(train_dataset)
preds_train = predictions_train.predictions.argmax(-1)

# Calculate train accuracy and classification report
accuracy_train = accuracy_score(y_train, preds_train)
report_train = classification_report(y_train, preds_train, output_dict=True)

# Display the results
results = {
    'train': {
        'accuracy': accuracy_train,
        'precision': report_train['weighted avg']['precision'],
        'recall': report_train['weighted avg']['recall'],
        'f1-score': report_train['weighted avg']['f1-score']
    },
    'test': {
        'accuracy': accuracy_test,
        'precision': report_test['weighted avg']['precision'],
        'recall': report_test['weighted avg']['recall'],
        'f1-score': report_test['weighted avg']['f1-score']
    }
}
results_df = pd.DataFrame(results)
print(results_df)

  0%|          | 0/7257 [00:00<?, ?it/s]

TypeError: new(): invalid data type 'str'