# Implementation of a Pre-Trained Model (RoBERTa)

In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, pipeline
import torch
from torch.utils.data import Dataset, DataLoader




# Preprocess Data and Training

In [3]:
df = pd.read_csv('ML_Models/Propietary_Models/token_datasets.csv')

# String Conversion
df['Posts'] = df['Posts'].astype(str)

# Label Mapping
label_mapping = {'Positive': 0, 'Depression/Suicidal Thoughts': 1, 'Neutral': 2}
df['label'] = df['label'].map(label_mapping)

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', clean_up_tokenization_spaces=True)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Preprocess Data
# Tokenize the input texts
inputs = tokenizer(list(df['Posts']), padding=True, truncation=True, return_tensors='pt')

# Get labels
labels = torch.tensor(df['label'].values, dtype=torch.long)

# Create a PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

dataset = CustomDataset(inputs, labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
try:
    trainer.train()
except Exception as e:
    print("Error during training:", e)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/780 [00:00<?, ?it/s]