### Data Preprocessing

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

# Load the dataset
raw_df = pd.read_csv('racism_xenophobia_dataset.csv')

# Rows with a missing text or label are dropped up front: a NaN label would
# otherwise be encoded as class 1 by the comparison below, and a missing text
# cannot be tokenized. The index is reset so that the converted dataset does
# not carry the old pandas index along as an extra column.
df = raw_df.dropna(subset=['text', 'label']).reset_index(drop=True)

# Encode the label column as integers: 'racism' -> 0, any other value -> 1.
df['label'] = (df['label'] != 'racism').astype(int)

# Convert the dataset to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize the ``text`` entries of a batch of examples.

    Uses the notebook-level ``tokenizer``, which therefore has to be defined
    before this function is called. Every sequence is truncated and padded to
    exactly 128 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw texts.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples['text']
    tokenized = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return tokenized

# The tokenizer has to exist before `map` runs, because preprocess_function
# looks it up as a notebook-level global.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset = encoded_dataset.rename_column("label", "labels")
# Expose only the model inputs and the target as torch tensors.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split the dataset into train and test. The fixed seed makes the 80/20 split
# identical on every Restart & Run All, so evaluation results are comparable
# between runs.
train_test_split = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


### Model Training

In [None]:
# Start from the pre-trained DistilBERT checkpoint with a two-class head.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Hyper-parameters for fine-tuning; evaluation runs once per epoch.
# NOTE(review): recent transformers releases appear to spell the
# `evaluation_strategy` argument `eval_strategy` - confirm against the
# installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Wire the model, the arguments and both dataset splits into a Trainer,
# then run the fine-tuning loop.
trainer = Trainer(
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so that the
# directory can be reloaded on its own later.
save_dir = './racism_xenophobia_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

### Model Loading

In [None]:
import torch

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the classifier from the saved model directory and
# move the classifier to the selected device.
model_dir = './racism_xenophobia_model'
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
model = DistilBertForSequenceClassification.from_pretrained(model_dir).to(device)

# Switch the model to evaluation mode for inference.
model.eval()

### Inference

In [6]:
import time
import torch.nn.functional as F

def predict_with_time(text, model, tokenizer, device):
    """Classify a single text and time the model's forward pass.

    Args:
        text: The sentence to classify.
        model: Callable that accepts ``input_ids`` and ``attention_mask``
            keyword arguments and returns an object with a ``logits``
            attribute.
        tokenizer: Callable tokenizer that returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: ``torch.device`` (or a device string such as ``"cpu"``) on
            which the input tensors are placed.

    Returns:
        A ``(probs, inference_time)`` tuple: ``probs`` is the softmax of the
        logits along dimension 1, and ``inference_time`` is the duration of
        the forward pass in seconds. Tokenization and the softmax are not
        included in the measurement.
    """
    device = torch.device(device)

    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # CUDA work is queued asynchronously, so without synchronizing before and
    # after the forward pass the clock would mostly measure the kernel launch
    # rather than the computation itself.
    on_cuda = device.type == 'cuda'
    if on_cuda:
        torch.cuda.synchronize(device)

    # perf_counter is monotonic and higher resolution than time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
    if on_cuda:
        torch.cuda.synchronize(device)
    inference_time = time.perf_counter() - start_time

    logits = outputs.logits
    probs = F.softmax(logits, dim=1)
    return probs, inference_time

In [None]:
# Sample inputs used to exercise the classifier.
sentences = [
    "Immigrants are taking our jobs.",
    "People of that race are inferior.",
    "I love all cultures and people.",
    "Black people look like monsters"
]

# Run each sample through the model and report the predicted class, both
# class probabilities and how long the forward pass took.
for sentence in sentences:
    probs, inference_time = predict_with_time(sentence, model, tokenizer, device)

    # Column 0 of the single output row is read as the racism probability,
    # column 1 as the xenophobia probability.
    racism_prob, xenophobia_prob = (probs[0][idx].item() for idx in (0, 1))

    if racism_prob > xenophobia_prob:
        label = 'racism'
    else:
        label = 'xenophobia'

    report_lines = [
        f'Sentence: "{sentence}"',
        f'Prediction: {label}',
        f'Racism Probability: {racism_prob:.4f}',
        f'Xenophobia Probability: {xenophobia_prob:.4f}',
        f'Inference Time: {inference_time:.4f} seconds',
    ]
    print(*report_lines, sep='\n')
    print()
