In [1]:
!pip install transformers datasets torch scikit-learn



In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using device: {device}")

# Load the labelled data and hold out 20% of it for evaluation.
# random_state is fixed so the same rows land in each partition on every run.
data = pd.read_csv('train.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Persist both partitions: they are re-read through `datasets` just below and
# kept on disk for later reference.
for frame, csv_path in ((train_data, 'train_split.csv'), (test_data, 'test_split.csv')):
    frame.to_csv(csv_path, index=False)

# Each CSV is loaded on its own with split='train' (the split name a lone data
# file is loaded under); the DatasetDict keys are what give them their roles.
dataset = DatasetDict({
    split_name: load_dataset('csv', data_files=csv_path, split='train')
    for split_name, csv_path in (('train', 'train_split.csv'), ('test', 'test_split.csv'))
})

# Sanity check: list the distinct label ids present in each split.
train_label_set = set(dataset['train']['label'])
test_label_set = set(dataset['test']['label'])
print("Train labels:", train_label_set)
print("Test labels:", test_label_set)

Using device: cuda


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train labels: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}
Test labels: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}


In [3]:
# Tokenize every split with the pretrained uncased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of rows.

    Args:
        examples: A batch from `dataset.map`; its "text" entry is passed to the tokenizer.

    Returns:
        The tokenizer output, with every sequence padded or truncated to 128 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)


# batched=True hands the function many rows per call instead of one at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:

# Confirm the label ids in both splits span the range the classifier head will cover.
train_labels = dataset['train']['label']
test_labels = dataset['test']['label']
print("Train label range:", min(train_labels), max(train_labels))
print("Test label range:", min(test_labels), max(test_labels))

Train label range: 0 27
Test label range: 0 27


In [5]:
# Load pretrained BERT with a 28-way sequence-classification head (label ids run 0..27),
# then move the model onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=28,
)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Hyperparameters for fine-tuning.
# `eval_strategy` is the current name of the option formerly spelled
# `evaluation_strategy`; the old spelling is deprecated, so using the new one keeps
# this cell working with the unpinned `transformers` installed at the top.
training_args = TrainingArguments(
    output_dir="./results",            # Directory for saving checkpoints
    eval_strategy="epoch",            # Evaluate at the end of each epoch
    learning_rate=3e-5,               # Learning rate
    per_device_train_batch_size=32,   # Larger batch size for better GPU utilization
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,    # Accumulate gradients over 2 steps (effective batch of 64 per device)
    num_train_epochs=10,               # Number of epochs
    weight_decay=0.01,                # Weight decay for regularization
    fp16=True,                        # Enable mixed-precision training
    logging_dir="./logs",             # Logging directory
    save_total_limit=2,               # Save only the 2 most recent checkpoints
    logging_steps=50,                 # Log training metrics every 50 steps
    save_steps=500,                   # Save checkpoint every 500 steps
    report_to="none"                  # Disable logging to external services
)



In [7]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted-F1 scores.

    Args:
        eval_pred: A (logits, labels) pair as handed over by the Trainer.

    Returns:
        dict: {"accuracy": ..., "f1": ...} where the predicted class of each row
        is the argmax over the last axis of the logits and F1 is averaged with
        average='weighted'.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average='weighted'),
    }

In [8]:
from transformers import Trainer, TrainingArguments

# Wire the model, hyperparameters, tokenized splits and metric function together.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated and emits a FutureWarning on recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune; evaluation runs on the "test" split according to `training_args`.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.7489,1.360994,0.6435,0.578245
2,0.9538,0.936257,0.748,0.716437
3,0.732,0.871089,0.7605,0.730766
4,0.6287,0.858074,0.769,0.743766
5,0.5227,0.8513,0.7735,0.753107
6,0.4569,0.861355,0.7755,0.762447
7,0.3892,0.88865,0.7725,0.758522


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.7489,1.360994,0.6435,0.578245
2,0.9538,0.936257,0.748,0.716437
3,0.732,0.871089,0.7605,0.730766
4,0.6287,0.858074,0.769,0.743766
5,0.5227,0.8513,0.7735,0.753107
6,0.4569,0.861355,0.7755,0.762447
7,0.3892,0.88865,0.7725,0.758522
8,0.3544,0.899549,0.7785,0.768882
9,0.3021,0.912921,0.7735,0.763898
10,0.2865,0.916069,0.7735,0.764248


TrainOutput(global_step=1250, training_loss=0.6803437080383301, metrics={'train_runtime': 547.1227, 'train_samples_per_second': 146.219, 'train_steps_per_second': 2.285, 'total_flos': 5263449538560000.0, 'train_loss': 0.6803437080383301, 'epoch': 10.0})

In [9]:
# Run one evaluation pass with the trainer's configured eval dataset and
# metric function, then print the resulting metrics dictionary.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.9160687327384949, 'eval_accuracy': 0.7735, 'eval_f1': 0.7642475556711658, 'eval_runtime': 5.9334, 'eval_samples_per_second': 337.077, 'eval_steps_per_second': 10.618, 'epoch': 10.0}


In [10]:
# Define the prediction function
def predict_sentiment(text):
    """Predict the class id for a single piece of text.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        text: The raw input string to classify (truncated to 128 tokens).

    Returns:
        int: Index of the highest-scoring class in the model's output logits.
    """
    # Do not rely on whichever mode the model was last left in: eval() switches
    # off dropout so repeated calls on the same text give the same answer.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    # no_grad() skips autograd bookkeeping, which is pure memory/time overhead
    # when only the forward pass is needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the argmax of the logits is already the predicted class.
    return torch.argmax(logits, dim=-1).item()

# Input and output locations for the scoring step.
test_file_path = 'test.csv'
output_file_path = 'test_with_predictions.csv'

# Read the examples to score.
test_data = pd.read_csv(test_file_path)

# Classify each row's text and store the predicted class id in a 'prediction' column.
predicted_labels = test_data['text'].apply(predict_sentiment)
test_data = test_data.assign(prediction=predicted_labels)

# Write the scored frame to disk and report where it went.
test_data.to_csv(output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")

Predictions saved to test_with_predictions.csv


In [11]:
# Drop the raw text so the saved file carries only the remaining columns.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because 'text' has already been removed from `test_data`.
test_data = test_data.drop(columns=['text'], errors='ignore')
test_data.to_csv('test_with_predictions.csv', index=False)

In [14]:
# Display the predictions frame. `data` is the training CSV loaded at the top of
# the notebook, so on a fresh Restart & Run All it would not show the
# id/prediction table this cell is meant to present.
test_data

Unnamed: 0,id,prediction
0,0,27
1,1,16
2,2,21
3,3,21
4,4,21
...,...,...
14995,14995,9
14996,14996,9
14997,14997,12
14998,14998,1
