In [None]:
# Mount Google Drive inside the Colab VM at /content/drive. Later cells read the
# dataset from, and save the fine-tuned model to, paths under this mount point,
# so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Load the dataset into a pandas DataFrame.
# The path lives under the Google Drive mount point (/content/drive), so the
# Drive-mount cell must have been executed before this one.
df = pd.read_csv("/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/Conversation/topical_chat.csv")  # Replace with your dataset path
X = df['message']  # text column that is tokenized and fed to the model
y = df['sentiment']  # label column that is encoded into class ids

# Split the data into training and validation sets:
# 20% of the rows are held out for validation (test_size=0.2) and
# random_state=42 fixes the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained BERT tokenizer for the same 'bert-base-uncased'
# checkpoint that the classification model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode the text data
def tokenize_text(text, max_length=128, chunk_size=1024):
    """Tokenize an iterable of strings into fixed-length BERT input tensors.

    Uses the module-level ``tokenizer``. Every sentence is wrapped in the
    model's special tokens, truncated to ``max_length`` tokens and padded up
    to exactly ``max_length`` tokens.

    Args:
        text: Iterable of strings (for example a list or a pandas Series).
        max_length: Number of token ids per row, special tokens included.
        chunk_size: How many sentences are handed to the tokenizer per call.
            Only affects memory use, not the result.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of integer tensors, each of
        shape ``(number_of_sentences, max_length)``. Rows are in the same
        order as ``text``.
    """
    sentences = list(text)

    # torch.cat() raises on an empty list, so give empty input a well-formed
    # (0, max_length) result instead of crashing.
    if not sentences:
        empty = torch.zeros((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    id_chunks = []
    mask_chunks = []

    # Encode in bounded chunks with the tokenizer's batched __call__ API
    # rather than one encode_plus call (and one tiny tensor) per sentence.
    for start in range(0, len(sentences), chunk_size):
        encoded = tokenizer(
            sentences[start:start + chunk_size],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    input_ids = torch.cat(id_chunks, dim=0)
    attention_masks = torch.cat(mask_chunks, dim=0)

    return input_ids, attention_masks

# Tokenize the training and validation text data.
# Each call returns a pair (input_ids, attention_mask) with one row per message,
# in the same order as the input Series.
train_inputs, train_masks = tokenize_text(X_train)
val_inputs, val_masks = tokenize_text(X_val)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder that maps each sentiment string to an integer id.
label_encoder = LabelEncoder()

# Fit the mapping on the training labels only, then reuse that same mapping for
# the validation labels (transform raises if it meets a label it never saw).
# The dtype is pinned to torch.long: the width of the integer array coming back
# from NumPy can depend on the platform, while the classification loss inside
# the model expects int64 class indices.
train_labels = torch.tensor(label_encoder.fit_transform(y_train), dtype=torch.long)
val_labels = torch.tensor(label_encoder.transform(y_val), dtype=torch.long)


Label ID: 0 => Original Name:  Angry\
Label ID: 1 => Original Name:  Curious to dive deeper\
Label ID: 2 => Original Name:  Disgusted\
Label ID: 3 => Original Name:  Fearful\
Label ID: 4 => Original Name:  Happy\
Label ID: 5 => Original Name:  Neutral\
Label ID: 6 => Original Name:  Sad\
Label ID: 7 => Original Name:  Surprised\




1   Curious to dive deeper\
2   Sad\
3   Fearful\
4   Neutral\
5   Happy\
6   Angry\
7   Surprised\
8   Disgusted


In [None]:
def _make_loader(inputs, masks, labels, sampler_cls, batch_size):
    """Wrap three aligned tensors in a TensorDataset and a DataLoader.

    Returns the dataset, the sampler instance built from ``sampler_cls`` and
    the DataLoader that draws batches through that sampler.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Training batches are drawn in random order; validation batches keep the
# dataset order.
train_data, train_sampler, train_dataloader = _make_loader(
    train_inputs, train_masks, train_labels, RandomSampler, batch_size
)
val_data, val_sampler, val_dataloader = _make_loader(
    val_inputs, val_masks, val_labels, SequentialSampler, batch_size
)


In [None]:
# Load the pre-trained BERT model with a classification head that has one
# output per distinct value in the dataset's sentiment column.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['sentiment'].unique()))

# Set the device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and learning rate scheduler.
# NOTE(review): AdamW here is the implementation imported from transformers at
# the top of the notebook; newer transformers releases deprecate it in favour
# of torch.optim.AdamW - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Total number of optimizer updates for the whole run: batches per epoch times
# the number of epochs used by the training loop.
total_steps = len(train_dataloader) * 4  # 4 epochs


def _linear_decay(step):
    """Return the LR multiplier for update number `step`: 1.0 at step 0, falling linearly to 0.0 at total_steps."""
    return max(0.0, 1.0 - step / max(1, total_steps))


# The training loop calls scheduler.step() after every batch, so the schedule
# has to be expressed in optimizer steps, not epochs. The previous
# StepLR(step_size=1, gamma=0.9) multiplied the learning rate by 0.9 on every
# batch: after 100 batches it was about 2e-5 * 0.9**100 ~= 5e-10, i.e. the
# model stopped learning almost immediately (validation loss and accuracy
# stayed constant across epochs). A linear decay over total_steps keeps the
# learning rate useful for the whole run.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=_linear_decay)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
# Fine-tuning the BERT model.
# Each epoch runs one full pass over train_dataloader with gradient updates,
# then one full pass over val_dataloader without gradients, and prints the
# average training loss, average validation loss and validation accuracy.
print("Fine-tuning BERT...")
for epoch in range(4):  # Number of epochs; keep in sync with total_steps in the setup cell
    model.train()
    total_loss = 0  # sum of per-batch training losses for this epoch

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", unit="batch"):
        # Move the (input_ids, attention_mask, labels) tensors to the model's device.
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits  # not used in the training pass

        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # NOTE(review): the scheduler is stepped once per BATCH, not once per
        # epoch, so `scheduler` must define its schedule in optimizer steps.
        # A scheduler that decays by a fixed factor on every step() call will
        # drive the learning rate towards zero within the first epoch.
        scheduler.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Avg training loss: {avg_loss:.4f}")

    # Evaluation on the validation set
    model.eval()
    val_loss = 0  # sum of per-batch validation losses
    val_accuracy = 0  # running count of correct predictions; normalized below

    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

        val_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        val_accuracy += (logits.argmax(dim=1) == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    # Turn the count of correct predictions into a fraction of all validation rows.
    val_accuracy = val_accuracy / len(X_val)
    print(f"Avg validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

# Save the fine-tuned model and its tokenizer into the relative directory
# "fine_tuned_bert_model" (resolved against the current working directory).
model.save_pretrained("fine_tuned_bert_model")
tokenizer.save_pretrained("fine_tuned_bert_model")

Fine-tuning BERT...


Epoch 1: 100%|██████████| 4710/4710 [52:06<00:00,  1.51batch/s]


Avg training loss: 1.7184
Avg validation loss: 1.6921
Validation accuracy: 0.4228


Epoch 2: 100%|██████████| 4710/4710 [52:04<00:00,  1.51batch/s]


Avg training loss: 1.7178
Avg validation loss: 1.6921
Validation accuracy: 0.4228


Epoch 3: 100%|██████████| 4710/4710 [52:04<00:00,  1.51batch/s]


Avg training loss: 1.7172
Avg validation loss: 1.6921
Validation accuracy: 0.4228


Epoch 4: 100%|██████████| 4710/4710 [52:06<00:00,  1.51batch/s]


Avg training loss: 1.7168
Avg validation loss: 1.6921
Validation accuracy: 0.4228


('fine_tuned_bert_model/tokenizer_config.json',
 'fine_tuned_bert_model/special_tokens_map.json',
 'fine_tuned_bert_model/vocab.txt',
 'fine_tuned_bert_model/added_tokens.json')

In [None]:
# Save a second copy of the fine-tuned model and tokenizer under the Google
# Drive mount point (/content/drive) used earlier in the notebook.
model.save_pretrained("/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model")
tokenizer.save_pretrained("/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model")

('/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model/tokenizer_config.json',
 '/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model/special_tokens_map.json',
 '/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model/vocab.txt',
 '/content/drive/MyDrive/EDA/Ai Prompt Generator/Ai Writting/fine_tuned_bert_model/added_tokens.json')