# Multilingual BERT

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Output directory for checkpoints, logs, and the final fine-tuned model.
# NOTE(review): this is an absolute, machine-specific path; the stored cell
# output at the end of the notebook lists files under a different directory
# (C:\Users\Rizvi\Desktop\New_Multi_FineTuned_4), so that output predates this
# path. Re-run the notebook from a fresh kernel before sharing it.
save_directory = r'E:\Bangla-Sentiment-Analysis\Multi_FineTuned_BERT\New_Multi_FineTuned_4'
# exist_ok=True makes the cell safe to re-run when the directory already exists.
os.makedirs(save_directory, exist_ok=True)


# Load the preprocessed dataset. The training cell reads the
# 'clean_sentence' (text) and 'Sentiment' (label) columns from this frame.
df = pd.read_excel(r"E:\Bangla-Sentiment-Analysis\Bangla_Dataset\final_preprocessed_dataset.xlsx")

# Disabled experiment (previously kept here as commented-out code): a second
# spreadsheet of translated product reviews was loaded, its
# 'translated_sentence' column renamed to 'clean_sentence', and the two frames
# concatenated with pd.concat(..., ignore_index=True) before training.

# Show (rows, columns) as the cell output to sanity-check the load.
df.shape

(34800, 2)

In [None]:
# Pull the raw text and label columns out of the loaded DataFrame.
texts = df['clean_sentence'].tolist()  # Adjust this to the actual text column
labels = df['Sentiment'].tolist()  # Adjust this to the actual label column

# Fail fast on rows the tokenizer cannot handle (e.g. empty or numeric Excel
# cells come back as float/int instead of str) instead of erroring later from
# deep inside the tokenizer.
non_string_rows = [row for row, text in enumerate(texts) if not isinstance(text, str)]
if non_string_rows:
    raise ValueError(
        f"{len(non_string_rows)} rows in 'clean_sentence' are not strings "
        f"(first offending row index: {non_string_rows[0]})"
    )

# Map string labels to the integer class ids the model is trained on.
label_map = {'Positive': 0, 'Negative': 1, 'Neutral': 2}  # Adjust the mapping if necessary

# Report every unexpected label value at once rather than raising a bare
# KeyError on the first one encountered.
unknown_labels = sorted({repr(label) for label in labels if label not in label_map})
if unknown_labels:
    raise ValueError(f"Unexpected 'Sentiment' values not in label_map: {unknown_labels}")
numeric_labels = [label_map[label] for label in labels]

# Split the dataset into training (80%) and validation (20%) sets.
# random_state fixes the shuffle so the same rows land in the validation set on
# every run; stratify keeps the class proportions equal in both splits.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    numeric_labels,
    test_size=0.2,
    random_state=42,
    stratify=numeric_labels,
)

# Load the BERT tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the input texts. truncation=True cuts sequences that exceed the
# tokenizer's maximum length; padding=True pads every sequence to the longest
# one in the same call, so the train and validation sets may end up with
# different padded lengths.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create a custom Dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (anything exposing ``.items()`` whose values can be indexed by
            example position).
        labels: Sequence of integer class ids, one per example.

    Indexing returns a dict with one tensor per feature in ``encodings`` plus
    a ``'labels'`` tensor for the same example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for feature_name, per_example_values in self.encodings.items():
            sample[feature_name] = torch.tensor(per_example_values[idx])
        # Class ids are stored as int64 (torch.long) tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Create Dataset objects
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Seed torch before loading the model: the classification head is not part of
# the pretrained checkpoint and is freshly initialised, so without a fixed seed
# every run would start from different head weights.
torch.manual_seed(42)

# Load the BERT model for sequence classification.
# num_labels is taken from label_map rather than from the labels observed in
# the data: if a class were missing from the dataset, len(set(numeric_labels))
# would be smaller than the largest label id and training would fail with an
# out-of-range target. id2label/label2id are stored in the saved config so
# predictions can be mapped back to sentiment names at inference time.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=label_map,
)

# Check if a GPU is available and set device accordingly.
# Trainer also places the model on its own device; the explicit move is kept so
# `device` and `model` are usable outside the Trainer as before.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)  # Move model to GPU if available

# Define training arguments.
# The learning rate is left at the library default (an earlier experiment with
# learning_rate=0.01 is disabled).
training_args = TrainingArguments(
    output_dir=save_directory,  # Directory to store checkpoints and results
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    warmup_steps=1740,  # presumably about half an epoch at batch size 8 - TODO confirm
    weight_decay=0.1,
    logging_dir=os.path.join(save_directory, 'logs'),  # Log directory
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; rename it if the installed version rejects the keyword.
    evaluation_strategy="steps",  # Evaluate every `eval_steps` steps
    logging_steps=100,  # Log every 100 steps
    # With load_best_model_at_end=True, save_steps must be a round multiple of
    # eval_steps; only saved checkpoints can be restored as the best model.
    save_steps=500,  # Save a checkpoint every 500 steps
    save_total_limit=2,  # Limit how many checkpoints are kept on disk
    eval_steps=100,  # Evaluate every 100 steps
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    metric_for_best_model="eval_loss",  # "Best" means lowest validation loss
)

# Create the Trainer instance.
# Early stopping is currently DISABLED: the callback below is commented out, so
# training always runs for the full num_train_epochs. Uncomment it to stop
# after 6 consecutive evaluations without improvement in eval_loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so the directory can be
# loaded directly with from_pretrained(save_directory).
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
100,1.0709,1.052187
200,1.0231,1.027815
300,0.9709,1.004685
400,0.916,0.890642
500,0.8924,0.841093
600,0.8874,0.818589
700,0.8739,0.874492
800,0.8789,0.831631
900,0.9167,0.96474
1000,0.8403,0.841511


('C:\\Users\\Rizvi\\Desktop\\New_Multi_FineTuned_4\\tokenizer_config.json',
 'C:\\Users\\Rizvi\\Desktop\\New_Multi_FineTuned_4\\special_tokens_map.json',
 'C:\\Users\\Rizvi\\Desktop\\New_Multi_FineTuned_4\\vocab.txt',
 'C:\\Users\\Rizvi\\Desktop\\New_Multi_FineTuned_4\\added_tokens.json')