In [18]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import re
from huggingface_hub import login
from transformers import TrainingArguments, Trainer

In [19]:
# Load datasets
train_df = pd.read_csv('/content/Ur_train.csv')
test_df = pd.read_csv('/content/Ur_test_without_labels.csv')

In [20]:
# Ensure correct column names
train_df.columns = ['text', 'binary', 'multiclass']
test_df.columns = ['text']

In [21]:
# Drop unnecessary columns
train_df = train_df.drop(columns=['multiclass'])

In [22]:
# Text Preprocessing
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# Convert labels to numeric format
# Replace 'Not Hope' and other non-numeric values with 0 or 1
train_df['binary'] = train_df['binary'].replace({'Not Hope': 0, 'Hope':1}).astype(int) # Assuming 'Hope' is the other label


# Split training data into 80% train, 20% validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].tolist(), train_df['binary'].tolist(), test_size=0.2, random_state=42
)

  train_df['binary'] = train_df['binary'].replace({'Not Hope': 0, 'Hope':1}).astype(int) # Assuming 'Hope' is the other label


In [23]:
# Load tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenization
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=512)

In [24]:
# Create Dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

In [25]:
# Create dataset objects
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings)

In [26]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer

# Define local directory for saving the model
local_model_path = "xlm_roberta_local"

# Download and save model locally
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

print(f"Model downloaded and saved to '{local_model_path}'")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model downloaded and saved to 'xlm_roberta_local'


In [27]:
from transformers import Trainer, TrainingArguments
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
#from sklearn.metrics import f1_score # Removed import

# Removed compute_metrics function
#def compute_metrics(pred):
#    labels = pred.label_ids
#    preds = pred.predictions.argmax(-1)
#    f1 = f1_score(labels, preds, average='macro')  # Calculate macro F1 score
#    return {"macro_f1": f1}

# Update training arguments to include compute_metrics
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    # Removed metric_for_best_model and greater_is_better
    #metric_for_best_model="macro_f1",
    #greater_is_better=True,
    fp16=True,
    report_to="none",
    # Removed compute_metrics
    #compute_metrics=compute_metrics
)



In [28]:
# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Compute class weights for handling imbalance
labels = train_dataset.labels
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [29]:
# Define a custom loss function with class weights
from torch.nn import CrossEntropyLoss

def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fn = CrossEntropyLoss(weight=class_weights)  # Apply class weights here
    loss = loss_fn(logits, labels)
    return (loss, outputs) if return_outputs else loss

In [30]:
# Define a custom training step function
def training_step(model, inputs, num_items_in_batch=None): # Adding num_items_in_batch argument
    model.train()
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to device
    # Scale the loss before backpropagation
    with trainer.accelerator.autocast(): # Use autocast for automatic mixed precision
        loss = compute_loss(model, inputs)

    trainer.accelerator.backward(loss)  # Use accelerator for backward pass
    # Return loss as a Tensor instead of a float
    return loss.detach() # Detach to avoid retaining computational graph

In [31]:
# Initialize Trainer without custom loss function (it's now in training_step)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Override the default training step with your custom function
trainer.training_step = training_step

In [32]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2449,0.214017
2,0.114,0.202471
3,0.1317,0.215461


TrainOutput(global_step=693, training_loss=0.2179564946743661, metrics={'train_runtime': 678.3097, 'train_samples_per_second': 16.32, 'train_steps_per_second': 1.022, 'total_flos': 2912639382835200.0, 'train_loss': 0.2179564946743661, 'epoch': 3.0})

In [33]:
from sklearn.metrics import classification_report

# Make predictions on the validation set
predictions = trainer.predict(val_dataset)

# Extract predicted labels
predicted_labels = predictions.predictions.argmax(-1)

# Generate and print the classification report
print(classification_report(val_labels, predicted_labels))


              precision    recall  f1-score   support

           0       0.97      0.91      0.94       482
           1       0.91      0.97      0.94       441

    accuracy                           0.94       923
   macro avg       0.94      0.94      0.94       923
weighted avg       0.94      0.94      0.94       923



In [34]:
# Predict on the test set
test_predictions = trainer.predict(test_dataset)
all_predictions = test_predictions.predictions.argmax(-1)

# Convert numeric labels to "Hope" or "Not Hope"
label_mapping = {0: "Not Hope", 1: "Hope"}
test_df["Tag"] = [label_mapping[pred] for pred in all_predictions]

# Create submission file with "Text" and "Tag" columns
test_df["Text"] = ["text" + str(i + 1) for i in range(len(test_df))]  # Generate "text1, text2, ..." format
submission_df = test_df[["Text", "Tag"]]  # Keep only required columns

In [35]:
# Save as CSV
submission_df.to_csv("predictions.csv", index=False)

print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!
