In [None]:
!pip3 install transformers datasets transformers[torch]
!pip3 install accelerate -U

In [2]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import datasets

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
#from transformers import BertTokenizer, BertForSequenceClassification
#from transformers import DebertaTokenizer, DebertaForSequenceClassification

In [None]:
# Read review.csv through the `datasets` CSV loader
dataset = datasets.load_dataset(
    'csv',
    data_files="review.csv",
)

In [4]:
# Drop unwanted columns from the dataset
unwanted_columns = ['category', 'rating']  # List the names of the unwanted columns here
dataset = dataset.remove_columns(unwanted_columns)

# Split the dataset into training and validation sets (80% / 20%).
# A fixed seed makes the shuffle deterministic, so every run trains and evaluates
# on the same rows and results are comparable between runs.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the split dataset (a bare last expression is rendered as the cell output)
dataset

In [None]:
# String label -> integer class id used for training
label_mapping = {"OR": 0, "CG": 1}

# Convert labels to numerical values
def convert_label(label):
    """Return the integer class id for ``label``.

    Args:
        label: A key of ``label_mapping`` ("OR" or "CG"), or an id that has
            already been converted (0 or 1).

    Returns:
        The integer id from ``label_mapping``; an already-converted id is
        returned unchanged.

    Raises:
        KeyError: If ``label`` is neither a known label name nor a known id.
    """
    if label in label_mapping:
        return label_mapping[label]
    # Let already-encoded ids pass through so that re-running the conversion on a
    # dataset whose labels were converted earlier does not fail with KeyError.
    if label in label_mapping.values():
        return label
    raise KeyError(f"Unknown label {label!r}; expected one of {sorted(label_mapping)}")

# Rewrite each example with its label replaced by the integer id; the text is passed through as-is
dataset = dataset.map(
    lambda row: {
        'text_': row['text_'],
        'label': convert_label(row['label']),
    }
)

In [None]:
# Active choice: RoBERTa tokenizer. The BERT / DeBERTa lines below are commented-out alternatives.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base'
)
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Load the BERT tokenizer
#tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base') # Load the DeBERTa tokenizer

In [None]:
# Tokenize the input texts
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: A batch mapping that provides the texts under ``'text_'`` and
            the labels under ``'label'``.

    Returns:
        A dict with the tokenizer's ``'input_ids'`` and ``'attention_mask'``
        plus the batch's ``'label'`` values passed through unchanged. Any other
        field the tokenizer returns is dropped.
    """
    # Truncate long texts and pad every sequence to a fixed maximum length
    encoded = tokenizer(examples['text_'], padding='max_length', truncation=True)
    batch = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
    batch['label'] = examples['label']
    return batch

# Tokenize every split in batches, then expose only the model inputs and label as torch tensors
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

In [None]:
# Display the tokenized dataset (a bare last expression is rendered as the cell output)
tokenized_dataset

In [None]:
# Load RoBERTa with a 2-class classification head.
# id2label / label2id record the dataset's label names in the model config, so the
# saved model reports "OR" / "CG" rather than generic default label names.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
    id2label={idx: name for name, idx in label_mapping.items()},
    label2id=dict(label_mapping),
)
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)


In [None]:
# Define the training arguments.
# The keyword that schedules evaluation was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not pin a
# version, so use whichever name this installation defines.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    **{eval_strategy_key: 'epoch'},  # evaluate on the validation split after every epoch
)

# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the Trainer.
# The collator is passed explicitly: previously it was created but never handed to
# the Trainer, so it had no effect on how batches were built.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
)

In [None]:
# Fine-tune the model: run the training loop configured by `training_args`
# on the train split held by `trainer`
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.112,0.14088


In [None]:
# Save the fine-tuned model
trainer.save_model('./fine_tuned_model')
# Also write the tokenizer files into the same directory, so the directory can be
# reloaded on its own with from_pretrained() for both the model and the tokenizer.
# (Harmless if the Trainer already wrote them: the files are simply rewritten.)
tokenizer.save_pretrained('./fine_tuned_model')