In [None]:
# sentiment_analysis_single_task_finetune.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFBertModel,
)
import logging
import random
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources if not already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Bengali stopwords and lemmatizer
# Note: NLTK may not have comprehensive Bengali stopwords. Consider using a custom list if needed.
try:
    stop_words = set(stopwords.words('bengali'))
except LookupError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. GPU Memory Management
# -------------------------------

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

# -------------------------------
# 2. Data Preparation
# -------------------------------

# Load the dataset
# Ensure the CSV has at least two columns: 'Text' and 'Category'
data_path = r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv"  # Update this path as needed
df = pd.read_csv(data_path)

# Select relevant columns
df = df[['Text', 'Category']]
print("Initial DataFrame:")
print(df.head())
print(f"Initial Data Shape: {df.shape}")

# Function to clean text
def clean_text(text):
    # Keep only Bengali characters: Unicode range for Bengali: \u0980-\u09FF
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    # Lemmatize and remove stopwords if available
    if stop_words:
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].astype(str).apply(clean_text)
print("DataFrame after text cleaning:")
print(df.head())

# Upsampling 'Category' to balance classes

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category'
df_upsampled = upsample(df, 'Category')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())

# Encode 'Category' labels
category_encoder = LabelEncoder()
df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])

# Verify encoding
print("Encoded Category:")
print(df_upsampled[['Category', 'Category_encoded']].head())

# -------------------------------
# 3. Model Configuration
# -------------------------------

# Define the list of pre-trained models to fine-tune
pretrained_models = {
    'bert-base-multilingual-cased': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'bert-base-multilingual-cased'
    },
    'sagorsarker/bangla-bert-base': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'sagorsarker/bangla-bert-base'
    }
}

# Define selected models
selected_models = list(pretrained_models.keys())

# -------------------------------
# 4. Tokenization
# -------------------------------

# Function to tokenize sentences
def tokenize_sentences(sentences, tokenizer, max_len=20, batch_size=32):
    """
    Tokenizes sentences in batches for efficiency.
    """
    input_ids = []
    attention_masks = []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Tokenizing"):
        batch = sentences[i:i+batch_size]
        try:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        except Exception as e:
            print(f"Error during tokenization for batch starting at index {i}: {e}")

    # Concatenate all batches
    input_ids = tf.concat(input_ids, axis=0).numpy()
    attention_masks = tf.concat(attention_masks, axis=0).numpy()

    return input_ids, attention_masks

# Tokenize the data for each model and store in a dictionary
tokenized_data = {}

for model_name in selected_models:
    print(f"\nTokenizing data for model: {model_name}")
    tokenizer_class = pretrained_models[model_name]['tokenizer']
    pretrained_name = pretrained_models[model_name]['pretrained_name']
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        continue
    input_ids, attention_masks = tokenize_sentences(df_upsampled['Text'].values, tokenizer, max_len=20, batch_size=32)
    tokenized_data[model_name] = {
        'input_ids': input_ids,
        'attention_masks': attention_masks
    }

# -------------------------------
# 5. Preparing Labels and Splits
# -------------------------------

# Define labels for single-task learning
labels_category = df_upsampled['Category_encoded'].values

# Split the data into training and testing sets for each model
X_train_ids_dict = {}
X_test_ids_dict = {}
X_train_masks_dict = {}
X_test_masks_dict = {}
y_train_category_dict = {}
y_test_category_dict = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train_cat, y_test_cat = train_test_split(
        tokenized_data[model_name]['input_ids'],
        tokenized_data[model_name]['attention_masks'],
        labels_category,
        test_size=0.2,
        random_state=42,
        stratify=labels_category
    )
    X_train_ids_dict[model_name] = X_train_ids
    X_test_ids_dict[model_name] = X_test_ids
    X_train_masks_dict[model_name] = X_train_masks
    X_test_masks_dict[model_name] = X_test_masks
    y_train_category_dict[model_name] = y_train_cat
    y_test_category_dict[model_name] = y_test_cat

# -------------------------------
# 6. Model Building, Training, and Evaluation
# -------------------------------

# Number of classes
num_categories = df_upsampled['Category_encoded'].nunique()

# Function to build and compile the model
def build_model(pretrained_model_info, num_categories, max_len=20):
    """
    Builds a single-task model with shared pre-trained layers and a single output layer.
    """
    tokenizer_class = pretrained_model_info['tokenizer']
    model_class = pretrained_model_info['model']
    pretrained_name = pretrained_model_info['pretrained_name']

    # Load tokenizer and model
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {pretrained_name}: {e}")
        return None

    try:
        # Attempt to load the model with TensorFlow weights first
        base_model = model_class.from_pretrained(pretrained_name)
    except OSError:
        # If TensorFlow weights are unavailable, try loading PyTorch weights
        print(f"TensorFlow weights not found for {pretrained_name}. Attempting to load PyTorch weights.")
        try:
            base_model = model_class.from_pretrained(pretrained_name, from_pt=True)
        except Exception as e:
            print(f"Error loading model for {pretrained_name}: {e}")
            return None

    # Define inputs
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Get base model outputs
    base_outputs = base_model(input_ids, attention_mask=attention_mask)
    pooled_output = base_outputs[1]

    # Shared Dense layer
    shared_dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)

    # Dropout layer for regularization
    shared_dense = tf.keras.layers.Dropout(0.3)(shared_dense)

    # Category output
    category_output = tf.keras.layers.Dense(num_categories, activation='softmax', name='category')(shared_dense)

    # Define the model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[category_output])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'category': 'sparse_categorical_crossentropy',
        },
        metrics={
            'category': 'accuracy',
        }
    )

    return model, tokenizer

# Function to train and evaluate the model
def train_and_evaluate(model, X_train_ids, X_train_masks, y_train_cat,
                       X_test_ids, X_test_masks, y_test_cat, model_name, epochs=3, batch_size=32):
    """
    Trains the model and evaluates its performance on the test set.
    """
    print(f"\nTraining model: {model_name}")
    history = model.fit(
        {'input_ids': X_train_ids, 'attention_mask': X_train_masks},
        {'category': y_train_cat},
        validation_data=(
            {'input_ids': X_test_ids, 'attention_mask': X_test_masks},
            {'category': y_test_cat}
        ),
        epochs=epochs,
        batch_size=batch_size
    )

    # Evaluation
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict({'input_ids': X_test_ids, 'attention_mask': X_test_masks})
    pred_categories = np.argmax(predictions[0], axis=1)

    # Category Evaluation
    print(f"\nCategory Classification Report for {model_name}:")
    print(classification_report(y_test_cat, pred_categories, target_names=category_encoder.classes_))

    # Return history and predictions if needed
    return history, pred_categories

# Dictionary to store results
model_results = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue

    print(f"\nBuilding model for: {model_name}")
    pretrained_model_info = pretrained_models[model_name]
    model, tokenizer = build_model(pretrained_model_info, num_categories, max_len=20)

    if model is None:
        print(f"Skipping training for {model_name} due to build errors.")
        continue

    # Train and evaluate the model
    history, pred_categories = train_and_evaluate(
        model,
        X_train_ids_dict[model_name],
        X_train_masks_dict[model_name],
        y_train_category_dict[model_name],
        X_test_ids_dict[model_name],
        X_test_masks_dict[model_name],
        y_test_category_dict[model_name],
        model_name,
        epochs=3,
        batch_size=32
    )

    # Save the model and tokenizer
    save_dir = f'./fine_tuned_models/{model_name.replace("/", "_")}_category'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        model.save(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model and tokenizer saved to {save_dir}")
    except Exception as e:
        print(f"Error saving model for {model_name}: {e}")

    # Store results
    model_results[model_name] = {
        'history': history,
        'pred_categories': pred_categories
    }

print("\nAll models have been trained and evaluated.")

# -------------------------------
# 7. Optional: Compare Model Performances
# -------------------------------

# Example: Plotting category accuracy for each model
plt.figure(figsize=(12, 6))
train_acc = []
val_acc = []
model_labels = []

for model_name in selected_models:
    if model_name not in model_results:
        continue
    history = model_results[model_name]['history']
    train_acc.append(history.history['category_accuracy'][-1])
    val_acc.append(history.history['val_category_accuracy'][-1])
    model_labels.append(model_name)

x = np.arange(len(model_labels))  # label locations
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width/2, train_acc, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, val_acc, width, label='Validation Accuracy')

# Add some text for labels, title and custom x-axis tick labels
ax.set_ylabel('Accuracy')
ax.set_title('Category Classification Accuracy by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45)
ax.legend()

# Attach a text label above each bar
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()


In [2]:
# sentiment_analysis_single_task_finetune.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFBertModel,
)
import logging
import random
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources if not already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Bengali stopwords and lemmatizer
# Note: NLTK may not have comprehensive Bengali stopwords. Consider using a custom list if needed.
try:
    stop_words = set(stopwords.words('bengali'))
except LookupError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. GPU Memory Management
# -------------------------------

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

# -------------------------------
# 2. Data Preparation
# -------------------------------

# Load the dataset
# Ensure the CSV has at least two columns: 'Text' and 'Category'
data_path = r"F:\Context-Resonance Transformer\Restuarant\Restaurant - Sheet1.csv"  # Update this path as needed
df = pd.read_csv(data_path)

# Select relevant columns
df = df[['Text', 'Category']]
print("Initial DataFrame:")
print(df.head())
print(f"Initial Data Shape: {df.shape}")

# Function to clean text
def clean_text(text):
    # Keep only Bengali characters: Unicode range for Bengali: \u0980-\u09FF
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    # Lemmatize and remove stopwords if available
    if stop_words:
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].astype(str).apply(clean_text)
print("DataFrame after text cleaning:")
print(df.head())

# Upsampling 'Category' to balance classes

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category'
df_upsampled = upsample(df, 'Category')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())

# Encode 'Category' labels
category_encoder = LabelEncoder()
df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])

# Verify encoding
print("Encoded Category:")
print(df_upsampled[['Category', 'Category_encoded']].head())

# Verify number of unique classes
num_categories = df_upsampled['Category_encoded'].nunique()
print(f"Number of unique categories: {num_categories}")

# -------------------------------
# 3. Model Configuration
# -------------------------------

# Define the list of pre-trained models to fine-tune
pretrained_models = {
    'bert-base-multilingual-cased': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'bert-base-multilingual-cased'
    },
    'sagorsarker/bangla-bert-base': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'sagorsarker/bangla-bert-base'
    }
}

# Define selected models
selected_models = list(pretrained_models.keys())

# -------------------------------
# 4. Tokenization
# -------------------------------

# Function to tokenize sentences
def tokenize_sentences(sentences, tokenizer, max_len=20, batch_size=32):
    """
    Tokenizes sentences in batches for efficiency.
    """
    input_ids = []
    attention_masks = []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Tokenizing"):
        batch = sentences[i:i+batch_size]
        try:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        except Exception as e:
            print(f"Error during tokenization for batch starting at index {i}: {e}")

    # Concatenate all batches
    input_ids = tf.concat(input_ids, axis=0).numpy()
    attention_masks = tf.concat(attention_masks, axis=0).numpy()

    return input_ids, attention_masks

# Tokenize the data for each model and store in a dictionary
tokenized_data = {}

for model_name in selected_models:
    print(f"\nTokenizing data for model: {model_name}")
    tokenizer_class = pretrained_models[model_name]['tokenizer']
    pretrained_name = pretrained_models[model_name]['pretrained_name']
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        continue
    input_ids, attention_masks = tokenize_sentences(df_upsampled['Text'].values, tokenizer, max_len=20, batch_size=32)
    tokenized_data[model_name] = {
        'input_ids': input_ids,
        'attention_masks': attention_masks
    }

# -------------------------------
# 5. Preparing Labels and Splits
# -------------------------------

# Define labels for single-task learning
labels_category = df_upsampled['Category_encoded'].values

# Split the data into training and testing sets for each model
X_train_ids_dict = {}
X_test_ids_dict = {}
X_train_masks_dict = {}
X_test_masks_dict = {}
y_train_category_dict = {}
y_test_category_dict = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train_cat, y_test_cat = train_test_split(
        tokenized_data[model_name]['input_ids'],
        tokenized_data[model_name]['attention_masks'],
        labels_category,
        test_size=0.2,
        random_state=42,
        stratify=labels_category
    )
    X_train_ids_dict[model_name] = X_train_ids
    X_test_ids_dict[model_name] = X_test_ids
    X_train_masks_dict[model_name] = X_train_masks
    X_test_masks_dict[model_name] = X_test_masks
    y_train_category_dict[model_name] = y_train_cat
    y_test_category_dict[model_name] = y_test_cat

# -------------------------------
# 6. Model Building, Training, and Evaluation
# -------------------------------

# Function to build and compile the model
def build_model(pretrained_model_info, num_categories, max_len=20):
    """
    Builds a single-task model with shared pre-trained layers and a single output layer.
    """
    tokenizer_class = pretrained_model_info['tokenizer']
    model_class = pretrained_model_info['model']
    pretrained_name = pretrained_model_info['pretrained_name']

    # Load tokenizer and model
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {pretrained_name}: {e}")
        return None

    try:
        # Attempt to load the model with TensorFlow weights first
        base_model = model_class.from_pretrained(pretrained_name)
    except OSError:
        # If TensorFlow weights are unavailable, try loading PyTorch weights
        print(f"TensorFlow weights not found for {pretrained_name}. Attempting to load PyTorch weights.")
        try:
            base_model = model_class.from_pretrained(pretrained_name, from_pt=True)
        except Exception as e:
            print(f"Error loading model for {pretrained_name}: {e}")
            return None

    # Define inputs
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Get base model outputs
    base_outputs = base_model(input_ids, attention_mask=attention_mask)
    pooled_output = base_outputs[1]  # Typically the [CLS] token representation

    # Shared Dense layer
    shared_dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)

    # Dropout layer for regularization
    shared_dense = tf.keras.layers.Dropout(0.3)(shared_dense)

    # Category output
    category_output = tf.keras.layers.Dense(num_categories, activation='softmax', name='category')(shared_dense)

    # Define the model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[category_output])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'category': 'sparse_categorical_crossentropy',
        },
        metrics={
            'category': 'accuracy',
        }
    )

    return model, tokenizer

# Function to train and evaluate the model
def train_and_evaluate(model, X_train_ids, X_train_masks, y_train_cat,
                       X_test_ids, X_test_masks, y_test_cat, model_name, epochs=3, batch_size=32):
    """
    Trains the model and evaluates its performance on the test set.
    """
    print(f"\nTraining model: {model_name}")
    history = model.fit(
        {'input_ids': X_train_ids, 'attention_mask': X_train_masks},
        {'category': y_train_cat},
        validation_data=(
            {'input_ids': X_test_ids, 'attention_mask': X_test_masks},
            {'category': y_test_cat}
        ),
        epochs=epochs,
        batch_size=batch_size
    )

    # Evaluation
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict({'input_ids': X_test_ids, 'attention_mask': X_test_masks})
    
    # For single-output models, predictions is a single NumPy array
    # Apply argmax directly on the predictions array
    pred_categories = np.argmax(predictions, axis=1)

    # Category Evaluation
    print(f"\nCategory Classification Report for {model_name}:")
    print(classification_report(y_test_cat, pred_categories, target_names=category_encoder.classes_))

    # Return history and predictions if needed
    return history, pred_categories

# Dictionary to store results
model_results = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue

    print(f"\nBuilding model for: {model_name}")
    pretrained_model_info = pretrained_models[model_name]
    model, tokenizer = build_model(pretrained_model_info, num_categories, max_len=20)

    if model is None:
        print(f"Skipping training for {model_name} due to build errors.")
        continue

    # Train and evaluate the model
    history, pred_categories = train_and_evaluate(
        model,
        X_train_ids_dict[model_name],
        X_train_masks_dict[model_name],
        y_train_category_dict[model_name],
        X_test_ids_dict[model_name],
        X_test_masks_dict[model_name],
        y_test_category_dict[model_name],
        model_name,
        epochs=3,
        batch_size=32
    )

    # Save the model and tokenizer
    save_dir = f'./fine_tuned_models/{model_name.replace("/", "_")}_category'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        model.save(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model and tokenizer saved to {save_dir}")
    except Exception as e:
        print(f"Error saving model for {model_name}: {e}")

    # Store results
    model_results[model_name] = {
        'history': history,
        'pred_categories': pred_categories
    }

print("\nAll models have been trained and evaluated.")

# -------------------------------
# 7. Optional: Compare Model Performances
# -------------------------------

# Example: Plotting category accuracy for each model
plt.figure(figsize=(12, 6))
train_acc = []
val_acc = []
model_labels = []

for model_name in selected_models:
    if model_name not in model_results:
        continue
    history = model_results[model_name]['history']
    train_acc.append(history.history['category_accuracy'][-1])
    val_acc.append(history.history['val_category_accuracy'][-1])
    model_labels.append(model_name)

x = np.arange(len(model_labels))  # label locations
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width/2, train_acc, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, val_acc, width, label='Validation Accuracy')

# Add some text for labels, title and custom x-axis tick labels
ax.set_ylabel('Accuracy')
ax.set_title('Category Classification Accuracy by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45)
ax.legend()

# Attach a text label above each bar
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enabled memory growth for 1 GPU(s).
Initial DataFrame:
                                                Text                 Category
0              স্টাফ কিন্তু, আমাদের জন্য ভয়ঙ্কর ছিল।                  service
1  শুধুমাত্র,রিডামিং ফ্যাক্টর খাদ্য ছিল,পুরোপুরি ...                     food
2  শুধুমাত্র,রিডামিং ফ্যাক্টর খাদ্য ছিল,পুরোপুরি ...  anecdotes/miscellaneous
3  খাবার একদমই ব্যতিক্রমী, একটি খুব সক্ষম রান্নাঘ...                     food
4  যেখানে গাব্রিয়েলা লোকালি আপনাকে শুভেচ্ছা জানা...                  service
Initial Data Shape: (2059, 2)
DataFrame after text cleaning:
                                                Text                 Category
0                                       স্টাফ ভয়ঙ্কর                  service
1  শুধুমাত্ররিডামিং ফ্যাক্টর খাদ্য ছিলপুরোপুরি ন্...                     food
2  শুধুমাত্ররিডামিং ফ্যাক্টর খাদ্য ছিলপুরোপুরি ন্...  anecdotes/miscellaneous
3  খাবার একদমই ব্যতিক্রমী সক্ষম রান্নাঘর গর্বের খ...                     food
4  গাব্রিয়েলা লোকালি আপনা

Tokenizing: 100%|██████████| 111/111 [00:00<00:00, 860.27it/s]



Tokenizing data for model: sagorsarker/bangla-bert-base


Tokenizing: 100%|██████████| 111/111 [00:00<00:00, 1037.14it/s]



Building model for: bert-base-multilingual-cased


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w


Training model: bert-base-multilingual-cased
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: bert-base-multilingual-cased

Category Classification Report for bert-base-multilingual-cased:
                         precision    recall  f1-score   support

               ambience       0.68      0.70      0.69       142
anecdotes/miscellaneous       0.77      0.69      0.73       142
                   food       0.59      0.61      0.60       142
                  price       0.82      0.82      0.82       142
                service       0.68      0.71      0.69       142

               accuracy                           0.70       710
              macro avg       0.71      0.70      0.70       710
           weighted avg       0.71      0.70      0.70       710





Model and tokenizer saved to ./fine_tuned_models/bert-base-multilingual-cased_category

Building model for: sagorsarker/bangla-bert-base


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w


Training model: sagorsarker/bangla-bert-base
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: sagorsarker/bangla-bert-base

Category Classification Report for sagorsarker/bangla-bert-base:
                         precision    recall  f1-score   support

               ambience       0.68      0.82      0.75       142
anecdotes/miscellaneous       0.77      0.82      0.80       142
                   food       0.77      0.60      0.67       142
                  price       0.78      0.85      0.81       142
                service       0.87      0.76      0.81       142

               accuracy                           0.77       710
              macro avg       0.78      0.77      0.77       710
           weighted avg       0.78      0.77      0.77       710





Model and tokenizer saved to ./fine_tuned_models/sagorsarker_bangla-bert-base_category

All models have been trained and evaluated.


KeyError: 'category_accuracy'

<Figure size 1200x600 with 0 Axes>

In [None]:
# sentiment_analysis_polarity_single_task_finetune.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFBertModel,
)
import logging
import random
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources if not already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Bengali stopwords and lemmatizer
# Note: NLTK may not have comprehensive Bengali stopwords. Consider using a custom list if needed.
try:
    stop_words = set(stopwords.words('bengali'))
except LookupError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. GPU Memory Management
# -------------------------------

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

# -------------------------------
# 2. Data Preparation
# -------------------------------

# Load the dataset
# Ensure the CSV has at least two columns: 'Text' and 'Polarity'
data_path = r"F:\Context-Resonance Transformer\Restuarant\Restaurant - Sheet1.csv"  # Update this path as needed
df = pd.read_csv(data_path)

# Select relevant columns
df = df[['Text', 'Polarity']]
print("Initial DataFrame:")
print(df.head())
print(f"Initial Data Shape: {df.shape}")

# Function to clean text
def clean_text(text):
    # Keep only Bengali characters: Unicode range for Bengali: \u0980-\u09FF
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    # Lemmatize and remove stopwords if available
    if stop_words:
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].astype(str).apply(clean_text)
print("DataFrame after text cleaning:")
print(df.head())

# Upsampling 'Polarity' to balance classes

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Polarity'
df_upsampled = upsample(df, 'Polarity')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Polarity distribution after upsampling:")
print(df_upsampled['Polarity'].value_counts())

# Encode 'Polarity' labels
polarity_encoder = LabelEncoder()
df_upsampled['Polarity_encoded'] = polarity_encoder.fit_transform(df_upsampled['Polarity'])

# Verify encoding
print("Encoded Polarity:")
print(df_upsampled[['Polarity', 'Polarity_encoded']].head())

# Verify number of unique classes
num_polarities = df_upsampled['Polarity_encoded'].nunique()
print(f"Number of unique polarities: {num_polarities}")

# -------------------------------
# 3. Model Configuration
# -------------------------------

# Define the list of pre-trained models to fine-tune
pretrained_models = {
    'bert-base-multilingual-cased': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'bert-base-multilingual-cased'
    },
    'sagorsarker/bangla-bert-base': {
        'tokenizer': AutoTokenizer,
        'model': TFBertModel,
        'pretrained_name': 'sagorsarker/bangla-bert-base'
    }
}

# Define selected models
selected_models = list(pretrained_models.keys())

# -------------------------------
# 4. Tokenization
# -------------------------------

# Function to tokenize sentences
def tokenize_sentences(sentences, tokenizer, max_len=20, batch_size=32):
    """
    Tokenizes sentences in batches for efficiency.
    """
    input_ids = []
    attention_masks = []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Tokenizing"):
        batch = sentences[i:i+batch_size]
        try:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        except Exception as e:
            print(f"Error during tokenization for batch starting at index {i}: {e}")

    # Concatenate all batches
    input_ids = tf.concat(input_ids, axis=0).numpy()
    attention_masks = tf.concat(attention_masks, axis=0).numpy()

    return input_ids, attention_masks

# Tokenize the data for each model and store in a dictionary
tokenized_data = {}

for model_name in selected_models:
    print(f"\nTokenizing data for model: {model_name}")
    tokenizer_class = pretrained_models[model_name]['tokenizer']
    pretrained_name = pretrained_models[model_name]['pretrained_name']
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        continue
    input_ids, attention_masks = tokenize_sentences(df_upsampled['Text'].values, tokenizer, max_len=20, batch_size=32)
    tokenized_data[model_name] = {
        'input_ids': input_ids,
        'attention_masks': attention_masks
    }

# -------------------------------
# 5. Preparing Labels and Splits
# -------------------------------

# Define labels for single-task learning
labels_polarity = df_upsampled['Polarity_encoded'].values

# Split the data into training and testing sets for each model
X_train_ids_dict = {}
X_test_ids_dict = {}
X_train_masks_dict = {}
X_test_masks_dict = {}
y_train_polarity_dict = {}
y_test_polarity_dict = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train_pol, y_test_pol = train_test_split(
        tokenized_data[model_name]['input_ids'],
        tokenized_data[model_name]['attention_masks'],
        labels_polarity,
        test_size=0.2,
        random_state=42,
        stratify=labels_polarity
    )
    X_train_ids_dict[model_name] = X_train_ids
    X_test_ids_dict[model_name] = X_test_ids
    X_train_masks_dict[model_name] = X_train_masks
    X_test_masks_dict[model_name] = X_test_masks
    y_train_polarity_dict[model_name] = y_train_pol
    y_test_polarity_dict[model_name] = y_test_pol

# -------------------------------
# 6. Model Building, Training, and Evaluation
# -------------------------------

# Function to build and compile the model
def build_model(pretrained_model_info, num_polarities, max_len=20):
    """
    Builds a single-task model with shared pre-trained layers and a single output layer.
    """
    tokenizer_class = pretrained_model_info['tokenizer']
    model_class = pretrained_model_info['model']
    pretrained_name = pretrained_model_info['pretrained_name']

    # Load tokenizer and model
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {pretrained_name}: {e}")
        return None

    try:
        # Attempt to load the model with TensorFlow weights first
        base_model = model_class.from_pretrained(pretrained_name)
    except OSError:
        # If TensorFlow weights are unavailable, try loading PyTorch weights
        print(f"TensorFlow weights not found for {pretrained_name}. Attempting to load PyTorch weights.")
        try:
            base_model = model_class.from_pretrained(pretrained_name, from_pt=True)
        except Exception as e:
            print(f"Error loading model for {pretrained_name}: {e}")
            return None

    # Define inputs
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Get base model outputs
    base_outputs = base_model(input_ids, attention_mask=attention_mask)
    pooled_output = base_outputs[1]  # Typically the [CLS] token representation

    # Shared Dense layer
    shared_dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)

    # Dropout layer for regularization
    shared_dense = tf.keras.layers.Dropout(0.3)(shared_dense)

    # Polarity output
    polarity_output = tf.keras.layers.Dense(num_polarities, activation='softmax', name='polarity')(shared_dense)

    # Define the model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[polarity_output])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'polarity': 'sparse_categorical_crossentropy',
        },
        metrics={
            'polarity': 'accuracy',
        }
    )

    return model, tokenizer

# Function to train and evaluate the model
def train_and_evaluate(model, X_train_ids, X_train_masks, y_train_pol,
                       X_test_ids, X_test_masks, y_test_pol, model_name, epochs=3, batch_size=32):
    """
    Trains the model and evaluates its performance on the test set.
    """
    print(f"\nTraining model: {model_name}")
    history = model.fit(
        {'input_ids': X_train_ids, 'attention_mask': X_train_masks},
        {'polarity': y_train_pol},
        validation_data=(
            {'input_ids': X_test_ids, 'attention_mask': X_test_masks},
            {'polarity': y_test_pol}
        ),
        epochs=epochs,
        batch_size=batch_size
    )

    # Evaluation
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict({'input_ids': X_test_ids, 'attention_mask': X_test_masks})
    
    # For single-output models, predictions is a single NumPy array
    # Apply argmax directly on the predictions array
    pred_polarities = np.argmax(predictions, axis=1)

    # Polarity Evaluation
    print(f"\nPolarity Classification Report for {model_name}:")
    print(classification_report(y_test_pol, pred_polarities, target_names=polarity_encoder.classes_))

    # Return history and predictions if needed
    return history, pred_polarities

# Dictionary to store results
model_results = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue

    print(f"\nBuilding model for: {model_name}")
    pretrained_model_info = pretrained_models[model_name]
    model, tokenizer = build_model(pretrained_model_info, num_polarities, max_len=20)

    if model is None:
        print(f"Skipping training for {model_name} due to build errors.")
        continue

    # Train and evaluate the model
    history, pred_polarities = train_and_evaluate(
        model,
        X_train_ids_dict[model_name],
        X_train_masks_dict[model_name],
        y_train_polarity_dict[model_name],
        X_test_ids_dict[model_name],
        X_test_masks_dict[model_name],
        y_test_polarity_dict[model_name],
        model_name,
        epochs=3,
        batch_size=32
    )

    # Save the model and tokenizer
    save_dir = f'./fine_tuned_models/{model_name.replace("/", "_")}_polarity'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        model.save(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model and tokenizer saved to {save_dir}")
    except Exception as e:
        print(f"Error saving model for {model_name}: {e}")

    # Store results
    model_results[model_name] = {
        'history': history,
        'pred_polarities': pred_polarities
    }

print("\nAll models have been trained and evaluated.")

# -------------------------------
# 7. Optional: Compare Model Performances
# -------------------------------

# Example: Plotting polarity accuracy for each model
plt.figure(figsize=(12, 6))
train_acc = []
val_acc = []
model_labels = []

for model_name in selected_models:
    if model_name not in model_results:
        continue
    history = model_results[model_name]['history']
    train_acc.append(history.history['polarity_accuracy'][-1])
    val_acc.append(history.history['val_polarity_accuracy'][-1])
    model_labels.append(model_name)

x = np.arange(len(model_labels))  # label locations
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width/2, train_acc, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, val_acc, width, label='Validation Accuracy')

# Add some text for labels, title and custom x-axis tick labels
ax.set_ylabel('Accuracy')
ax.set_title('Polarity Classification Accuracy by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45)
ax.legend()

# Attach a text label above each bar
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enabled memory growth for 1 GPU(s).
Initial DataFrame:
                                                Text  Polarity
0              স্টাফ কিন্তু, আমাদের জন্য ভয়ঙ্কর ছিল।  negative
1  শুধুমাত্র,রিডামিং ফ্যাক্টর খাদ্য ছিল,পুরোপুরি ...  positive
2  শুধুমাত্র,রিডামিং ফ্যাক্টর খাদ্য ছিল,পুরোপুরি ...  negative
3  খাবার একদমই ব্যতিক্রমী, একটি খুব সক্ষম রান্নাঘ...  positive
4  যেখানে গাব্রিয়েলা লোকালি আপনাকে শুভেচ্ছা জানা...  positive
Initial Data Shape: (2059, 2)
DataFrame after text cleaning:
                                                Text  Polarity
0                                       স্টাফ ভয়ঙ্কর  negative
1  শুধুমাত্ররিডামিং ফ্যাক্টর খাদ্য ছিলপুরোপুরি ন্...  positive
2  শুধুমাত্ররিডামিং ফ্যাক্টর খাদ্য ছিলপুরোপুরি ন্...  negative
3  খাবার একদমই ব্যতিক্রমী সক্ষম রান্নাঘর গর্বের খ...  positive
4  গাব্রিয়েলা লোকালি আপনাকে শুভেচ্ছা আপনাকে খেতে...  positive
Polarity distribution after upsampling:
Polarity
negative    1221
positive    1221
conflict    1221
neutral     1221
Name: count

Tokenizing: 100%|██████████| 153/153 [00:00<00:00, 894.54it/s]



Tokenizing data for model: sagorsarker/bangla-bert-base


Tokenizing: 100%|██████████| 153/153 [00:00<00:00, 1013.02it/s]



Building model for: bert-base-multilingual-cased


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w


Training model: bert-base-multilingual-cased
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: bert-base-multilingual-cased

Polarity Classification Report for bert-base-multilingual-cased:
              precision    recall  f1-score   support

    conflict       0.92      1.00      0.96       244
    negative       0.89      0.77      0.82       244
     neutral       0.78      0.96      0.86       245
    positive       0.89      0.72      0.80       244

    accuracy                           0.86       977
   macro avg       0.87      0.86      0.86       977
weighted avg       0.87      0.86      0.86       977





Model and tokenizer saved to ./fine_tuned_models/bert-base-multilingual-cased_polarity

Building model for: sagorsarker/bangla-bert-base


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w


Training model: sagorsarker/bangla-bert-base
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: sagorsarker/bangla-bert-base

Polarity Classification Report for sagorsarker/bangla-bert-base:
              precision    recall  f1-score   support

    conflict       0.95      1.00      0.98       244
    negative       0.89      0.86      0.87       244
     neutral       0.84      0.96      0.90       245
    positive       0.96      0.80      0.87       244

    accuracy                           0.90       977
   macro avg       0.91      0.90      0.90       977
weighted avg       0.91      0.90      0.90       977





In [1]:
# sentiment_analysis_category_single_task_finetune_xlm_roberta.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFXLMRobertaModel,
)
import logging
import random
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources if not already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Bengali stopwords and lemmatizer
# Note: NLTK may not have comprehensive Bengali stopwords. Consider using a custom list if needed.
try:
    stop_words = set(stopwords.words('bengali'))
except LookupError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. GPU Memory Management
# -------------------------------

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

# -------------------------------
# 2. Data Preparation
# -------------------------------

# Load the dataset
# Ensure the CSV has at least two columns: 'Text' and 'Category'
data_path = r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv"  # Update this path as needed
df = pd.read_csv(data_path)

# Select relevant columns
df = df[['Text', 'Category']]
print("Initial DataFrame:")
print(df.head())
print(f"Initial Data Shape: {df.shape}")

# Function to clean text
def clean_text(text):
    # Keep only Bengali characters: Unicode range for Bengali: \u0980-\u09FF
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    # Lemmatize and remove stopwords if available
    if stop_words:
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].astype(str).apply(clean_text)
print("DataFrame after text cleaning:")
print(df.head())

# Upsampling 'Category' to balance classes

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category'
df_upsampled = upsample(df, 'Category')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())

# Encode 'Category' labels
category_encoder = LabelEncoder()
df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])

# Verify encoding
print("Encoded Category:")
print(df_upsampled[['Category', 'Category_encoded']].head())

# Verify number of unique classes
num_categories = df_upsampled['Category_encoded'].nunique()
print(f"Number of unique categories: {num_categories}")

# -------------------------------
# 3. Model Configuration
# -------------------------------

# Define the list of pre-trained models to fine-tune
pretrained_models = {
    'xlm-roberta-base': {
        'tokenizer': AutoTokenizer,
        'model': TFXLMRobertaModel,
        'pretrained_name': 'xlm-roberta-base'
    }
}

# Define selected models
selected_models = list(pretrained_models.keys())

# -------------------------------
# 4. Tokenization
# -------------------------------

# Function to tokenize sentences
def tokenize_sentences(sentences, tokenizer, max_len=20, batch_size=32):
    """
    Tokenizes sentences in batches for efficiency.
    """
    input_ids = []
    attention_masks = []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Tokenizing"):
        batch = sentences[i:i+batch_size]
        try:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        except Exception as e:
            print(f"Error during tokenization for batch starting at index {i}: {e}")

    # Concatenate all batches
    input_ids = tf.concat(input_ids, axis=0).numpy()
    attention_masks = tf.concat(attention_masks, axis=0).numpy()

    return input_ids, attention_masks

# Tokenize the data for each model and store in a dictionary
tokenized_data = {}

for model_name in selected_models:
    print(f"\nTokenizing data for model: {model_name}")
    tokenizer_class = pretrained_models[model_name]['tokenizer']
    pretrained_name = pretrained_models[model_name]['pretrained_name']
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        continue
    input_ids, attention_masks = tokenize_sentences(df_upsampled['Text'].values, tokenizer, max_len=20, batch_size=32)
    tokenized_data[model_name] = {
        'input_ids': input_ids,
        'attention_masks': attention_masks
    }

# -------------------------------
# 5. Preparing Labels and Splits
# -------------------------------

# Define labels for single-task learning
labels_category = df_upsampled['Category_encoded'].values

# Split the data into training and testing sets for each model
X_train_ids_dict = {}
X_test_ids_dict = {}
X_train_masks_dict = {}
X_test_masks_dict = {}
y_train_category_dict = {}
y_test_category_dict = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train_cat, y_test_cat = train_test_split(
        tokenized_data[model_name]['input_ids'],
        tokenized_data[model_name]['attention_masks'],
        labels_category,
        test_size=0.2,
        random_state=42,
        stratify=labels_category
    )
    X_train_ids_dict[model_name] = X_train_ids
    X_test_ids_dict[model_name] = X_test_ids
    X_train_masks_dict[model_name] = X_train_masks
    X_test_masks_dict[model_name] = X_test_masks
    y_train_category_dict[model_name] = y_train_cat
    y_test_category_dict[model_name] = y_test_cat

# -------------------------------
# 6. Model Building, Training, and Evaluation
# -------------------------------

# Function to build and compile the model
def build_model(pretrained_model_info, num_categories, max_len=20):
    """
    Builds a single-task model with shared pre-trained layers and a single output layer.
    """
    tokenizer_class = pretrained_model_info['tokenizer']
    model_class = pretrained_model_info['model']
    pretrained_name = pretrained_model_info['pretrained_name']

    # Load tokenizer and model
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {pretrained_name}: {e}")
        return None

    try:
        # Attempt to load the model with TensorFlow weights first
        base_model = model_class.from_pretrained(pretrained_name)
    except OSError:
        # If TensorFlow weights are unavailable, try loading PyTorch weights
        print(f"TensorFlow weights not found for {pretrained_name}. Attempting to load PyTorch weights.")
        try:
            base_model = model_class.from_pretrained(pretrained_name, from_pt=True)
        except Exception as e:
            print(f"Error loading model for {pretrained_name}: {e}")
            return None

    # Define inputs
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Get base model outputs
    base_outputs = base_model(input_ids, attention_mask=attention_mask)
    pooled_output = base_outputs[1]  # Typically the [CLS] token representation

    # Shared Dense layer
    shared_dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)

    # Dropout layer for regularization
    shared_dense = tf.keras.layers.Dropout(0.3)(shared_dense)

    # Category output
    category_output = tf.keras.layers.Dense(num_categories, activation='softmax', name='category')(shared_dense)

    # Define the model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[category_output])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'category': 'sparse_categorical_crossentropy',
        },
        metrics={
            'category': 'accuracy',
        }
    )

    return model, tokenizer

# Function to train and evaluate the model
def train_and_evaluate(model, X_train_ids, X_train_masks, y_train_cat,
                       X_test_ids, X_test_masks, y_test_cat, model_name, epochs=3, batch_size=32):
    """
    Trains the model and evaluates its performance on the test set.
    """
    print(f"\nTraining model: {model_name}")
    history = model.fit(
        {'input_ids': X_train_ids, 'attention_mask': X_train_masks},
        {'category': y_train_cat},
        validation_data=(
            {'input_ids': X_test_ids, 'attention_mask': X_test_masks},
            {'category': y_test_cat}
        ),
        epochs=epochs,
        batch_size=batch_size
    )

    # Evaluation
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict({'input_ids': X_test_ids, 'attention_mask': X_test_masks})
    
    # For single-output models, predictions is a single NumPy array
    # Apply argmax directly on the predictions array
    pred_categories = np.argmax(predictions, axis=1)

    # Category Evaluation
    print(f"\nCategory Classification Report for {model_name}:")
    print(classification_report(y_test_cat, pred_categories, target_names=category_encoder.classes_))

    # Return history and predictions if needed
    return history, pred_categories

# Dictionary to store results
model_results = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue

    print(f"\nBuilding model for: {model_name}")
    pretrained_model_info = pretrained_models[model_name]
    model, tokenizer = build_model(pretrained_model_info, num_categories, max_len=20)

    if model is None:
        print(f"Skipping training for {model_name} due to build errors.")
        continue

    # Train and evaluate the model
    history, pred_categories = train_and_evaluate(
        model,
        X_train_ids_dict[model_name],
        X_train_masks_dict[model_name],
        y_train_category_dict[model_name],
        X_test_ids_dict[model_name],
        X_test_masks_dict[model_name],
        y_test_category_dict[model_name],
        model_name,
        epochs=3,
        batch_size=32
    )

    # Save the model and tokenizer
    save_dir = f'./fine_tuned_models/{model_name.replace("/", "_")}_category'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        model.save(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model and tokenizer saved to {save_dir}")
    except Exception as e:
        print(f"Error saving model for {model_name}: {e}")

    # Store results
    model_results[model_name] = {
        'history': history,
        'pred_categories': pred_categories
    }

print("\nAll models have been trained and evaluated.")

# -------------------------------
# 7. Optional: Compare Model Performances
# -------------------------------

# Example: Plotting category accuracy for each model
plt.figure(figsize=(12, 6))
train_acc = []
val_acc = []
model_labels = []

for model_name in selected_models:
    if model_name not in model_results:
        continue
    history = model_results[model_name]['history']
    train_acc.append(history.history['category_accuracy'][-1])
    val_acc.append(history.history['val_category_accuracy'][-1])
    model_labels.append(model_name)

x = np.arange(len(model_labels))  # label locations
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width/2, train_acc, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, val_acc, width, label='Validation Accuracy')

# Add some text for labels, title and custom x-axis tick labels
ax.set_ylabel('Accuracy')
ax.set_title('Category Classification Accuracy by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45)
ax.legend()

# Attach a text label above each bar
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enabled memory growth for 1 GPU(s).
Initial DataFrame:
                                                Text Category
0  জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...    other
1  জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...     team
2               বাংলাদেশের পরে ভারতের সাপর্ট ই করি ?     team
3                              সৌম্যকে বাদ দেওয়া হোক  batting
4  প্রথমটি হচ্ছে, কোচ অত:পর সাকিব,সাকিব আর সাকিবর...     team
Initial Data Shape: (2979, 2)
DataFrame after text cleaning:
                                                Text Category
0  জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...    other
1  জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...     team
2                           বাংলাদেশের ভারতের সাপর্ট     team
3                                        সৌম্যকে বাদ  batting
4            প্রথমটি কোচ অতপর সাকিবসাকিব সাকিবরে দলে     team
Category distribution after upsampling:
Category
bowling            1010
other              1010
team management    1010
team               

Tokenizing: 100%|██████████| 158/158 [00:00<00:00, 195.26it/s]



Building model for: xlm-roberta-base


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.



Training model: xlm-roberta-base
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: xlm-roberta-base

Category Classification Report for xlm-roberta-base:
                 precision    recall  f1-score   support

        batting       0.52      0.86      0.65       202
        bowling       0.80      0.49      0.61       202
          other       0.55      0.50      0.53       202
           team       0.64      0.59      0.62       202
team management       0.81      0.72      0.76       202

       accuracy                           0.63      1010
      macro avg       0.66      0.63      0.63      1010
   weighted avg       0.66      0.63      0.63      1010





Model and tokenizer saved to ./fine_tuned_models/xlm-roberta-base_category

All models have been trained and evaluated.


KeyError: 'category_accuracy'

<Figure size 1200x600 with 0 Axes>

In [1]:
# sentiment_analysis_polarity_single_task_finetune_xlm_roberta.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFXLMRobertaModel,
)
import logging
import random
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources if not already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Bengali stopwords and lemmatizer
# Note: NLTK may not have comprehensive Bengali stopwords. Consider using a custom list if needed.
try:
    stop_words = set(stopwords.words('bengali'))
except LookupError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. GPU Memory Management
# -------------------------------

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

# -------------------------------
# 2. Data Preparation
# -------------------------------

# Load the dataset
# Ensure the CSV has at least two columns: 'Text' and 'Polarity'
data_path = r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv"  # Update this path as needed
df = pd.read_csv(data_path)

# Select relevant columns
df = df[['Text', 'Polarity']]
print("Initial DataFrame:")
print(df.head())
print(f"Initial Data Shape: {df.shape}")

# Function to clean text
def clean_text(text):
    # Keep only Bengali characters: Unicode range for Bengali: \u0980-\u09FF
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    # Lemmatize and remove stopwords if available
    if stop_words:
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].astype(str).apply(clean_text)
print("DataFrame after text cleaning:")
print(df.head())

# Upsampling 'Polarity' to balance classes

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Polarity'
df_upsampled = upsample(df, 'Polarity')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Polarity distribution after upsampling:")
print(df_upsampled['Polarity'].value_counts())

# Encode 'Polarity' labels
polarity_encoder = LabelEncoder()
df_upsampled['Polarity_encoded'] = polarity_encoder.fit_transform(df_upsampled['Polarity'])

# Verify encoding
print("Encoded Polarity:")
print(df_upsampled[['Polarity', 'Polarity_encoded']].head())

# Verify number of unique classes
num_polarities = df_upsampled['Polarity_encoded'].nunique()
print(f"Number of unique polarities: {num_polarities}")

# -------------------------------
# 3. Model Configuration
# -------------------------------

# Define the list of pre-trained models to fine-tune
pretrained_models = {
    'xlm-roberta-base': {
        'tokenizer': AutoTokenizer,
        'model': TFXLMRobertaModel,
        'pretrained_name': 'xlm-roberta-base'
    }
}

# Define selected models
selected_models = list(pretrained_models.keys())

# -------------------------------
# 4. Tokenization
# -------------------------------

# Function to tokenize sentences
def tokenize_sentences(sentences, tokenizer, max_len=20, batch_size=32):
    """
    Tokenizes sentences in batches for efficiency.
    """
    input_ids = []
    attention_masks = []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Tokenizing"):
        batch = sentences[i:i+batch_size]
        try:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        except Exception as e:
            print(f"Error during tokenization for batch starting at index {i}: {e}")

    # Concatenate all batches
    input_ids = tf.concat(input_ids, axis=0).numpy()
    attention_masks = tf.concat(attention_masks, axis=0).numpy()

    return input_ids, attention_masks

# Tokenize the data for each model and store in a dictionary
tokenized_data = {}

for model_name in selected_models:
    print(f"\nTokenizing data for model: {model_name}")
    tokenizer_class = pretrained_models[model_name]['tokenizer']
    pretrained_name = pretrained_models[model_name]['pretrained_name']
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        continue
    input_ids, attention_masks = tokenize_sentences(df_upsampled['Text'].values, tokenizer, max_len=20, batch_size=32)
    tokenized_data[model_name] = {
        'input_ids': input_ids,
        'attention_masks': attention_masks
    }

# -------------------------------
# 5. Preparing Labels and Splits
# -------------------------------

# Define labels for single-task learning
labels_polarity = df_upsampled['Polarity_encoded'].values

# Split the data into training and testing sets for each model
X_train_ids_dict = {}
X_test_ids_dict = {}
X_train_masks_dict = {}
X_test_masks_dict = {}
y_train_polarity_dict = {}
y_test_polarity_dict = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train_pol, y_test_pol = train_test_split(
        tokenized_data[model_name]['input_ids'],
        tokenized_data[model_name]['attention_masks'],
        labels_polarity,
        test_size=0.2,
        random_state=42,
        stratify=labels_polarity
    )
    X_train_ids_dict[model_name] = X_train_ids
    X_test_ids_dict[model_name] = X_test_ids
    X_train_masks_dict[model_name] = X_train_masks
    X_test_masks_dict[model_name] = X_test_masks
    y_train_polarity_dict[model_name] = y_train_pol
    y_test_polarity_dict[model_name] = y_test_pol

# -------------------------------
# 6. Model Building, Training, and Evaluation
# -------------------------------

# Function to build and compile the model
def build_model(pretrained_model_info, num_polarities, max_len=20):
    """
    Builds a single-task model with shared pre-trained layers and a single output layer.
    """
    tokenizer_class = pretrained_model_info['tokenizer']
    model_class = pretrained_model_info['model']
    pretrained_name = pretrained_model_info['pretrained_name']

    # Load tokenizer and model
    try:
        tokenizer = tokenizer_class.from_pretrained(pretrained_name)
    except Exception as e:
        print(f"Error loading tokenizer for {pretrained_name}: {e}")
        return None

    try:
        # Attempt to load the model with TensorFlow weights first
        base_model = model_class.from_pretrained(pretrained_name)
    except OSError:
        # If TensorFlow weights are unavailable, try loading PyTorch weights
        print(f"TensorFlow weights not found for {pretrained_name}. Attempting to load PyTorch weights.")
        try:
            base_model = model_class.from_pretrained(pretrained_name, from_pt=True)
        except Exception as e:
            print(f"Error loading model for {pretrained_name}: {e}")
            return None

    # Define inputs
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Get base model outputs
    base_outputs = base_model(input_ids, attention_mask=attention_mask)
    pooled_output = base_outputs[1]  # Typically the [CLS] token representation

    # Shared Dense layer
    shared_dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)

    # Dropout layer for regularization
    shared_dense = tf.keras.layers.Dropout(0.3)(shared_dense)

    # Polarity output
    polarity_output = tf.keras.layers.Dense(num_polarities, activation='softmax', name='polarity')(shared_dense)

    # Define the model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[polarity_output])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'polarity': 'sparse_categorical_crossentropy',
        },
        metrics={
            'polarity': 'accuracy',
        }
    )

    return model, tokenizer

# Function to train and evaluate the model
def train_and_evaluate(model, X_train_ids, X_train_masks, y_train_pol,
                       X_test_ids, X_test_masks, y_test_pol, model_name, epochs=3, batch_size=32):
    """
    Trains the model and evaluates its performance on the test set.
    """
    print(f"\nTraining model: {model_name}")
    history = model.fit(
        {'input_ids': X_train_ids, 'attention_mask': X_train_masks},
        {'polarity': y_train_pol},
        validation_data=(
            {'input_ids': X_test_ids, 'attention_mask': X_test_masks},
            {'polarity': y_test_pol}
        ),
        epochs=epochs,
        batch_size=batch_size
    )

    # Evaluation
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict({'input_ids': X_test_ids, 'attention_mask': X_test_masks})
    
    # For single-output models, predictions is a single NumPy array
    # Apply argmax directly on the predictions array
    pred_polarities = np.argmax(predictions, axis=1)

    # Polarity Evaluation
    print(f"\nPolarity Classification Report for {model_name}:")
    print(classification_report(y_test_pol, pred_polarities, target_names=polarity_encoder.classes_))

    # Return history and predictions if needed
    return history, pred_polarities

# Dictionary to store results
model_results = {}

for model_name in selected_models:
    if model_name not in tokenized_data:
        print(f"Skipping model {model_name} due to previous errors.")
        continue

    print(f"\nBuilding model for: {model_name}")
    pretrained_model_info = pretrained_models[model_name]
    model, tokenizer = build_model(pretrained_model_info, num_polarities, max_len=20)

    if model is None:
        print(f"Skipping training for {model_name} due to build errors.")
        continue

    # Train and evaluate the model
    history, pred_polarities = train_and_evaluate(
        model,
        X_train_ids_dict[model_name],
        X_train_masks_dict[model_name],
        y_train_polarity_dict[model_name],
        X_test_ids_dict[model_name],
        X_test_masks_dict[model_name],
        y_test_polarity_dict[model_name],
        model_name,
        epochs=3,
        batch_size=32
    )

    # Save the model and tokenizer
    save_dir = f'./fine_tuned_models/{model_name.replace("/", "_")}_polarity'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        model.save(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model and tokenizer saved to {save_dir}")
    except Exception as e:
        print(f"Error saving model for {model_name}: {e}")

    # Store results
    model_results[model_name] = {
        'history': history,
        'pred_polarities': pred_polarities
    }

print("\nAll models have been trained and evaluated.")

# -------------------------------
# 7. Optional: Compare Model Performances
# -------------------------------

# Example: Plotting polarity accuracy for each model
plt.figure(figsize=(12, 6))
train_acc = []
val_acc = []
model_labels = []

for model_name in selected_models:
    if model_name not in model_results:
        continue
    history = model_results[model_name]['history']
    train_acc.append(history.history['polarity_accuracy'][-1])
    val_acc.append(history.history['val_polarity_accuracy'][-1])
    model_labels.append(model_name)

x = np.arange(len(model_labels))  # label locations
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width/2, train_acc, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, val_acc, width, label='Validation Accuracy')

# Add some text for labels, title and custom x-axis tick labels
ax.set_ylabel('Accuracy')
ax.set_title('Polarity Classification Accuracy by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45)
ax.legend()

# Attach a text label above each bar
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enabled memory growth for 1 GPU(s).
Initial DataFrame:
                                                Text  Polarity
0  জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...  positive
1  জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...  positive
2               বাংলাদেশের পরে ভারতের সাপর্ট ই করি ?  positive
3                              সৌম্যকে বাদ দেওয়া হোক  negative
4  প্রথমটি হচ্ছে, কোচ অত:পর সাকিব,সাকিব আর সাকিবর...  positive
Initial Data Shape: (2979, 2)
DataFrame after text cleaning:
                                                Text  Polarity
0  জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...  positive
1  জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...  positive
2                           বাংলাদেশের ভারতের সাপর্ট  positive
3                                        সৌম্যকে বাদ  negative
4            প্রথমটি কোচ অতপর সাকিবসাকিব সাকিবরে দলে  positive
Polarity distribution after upsampling:
Polarity
neutral     2152
positive    2152
negative    2152
Name: count, dtype: int64
En

Tokenizing: 100%|██████████| 202/202 [00:00<00:00, 237.87it/s]



Building model for: xlm-roberta-base


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.



Training model: xlm-roberta-base
Epoch 1/3
Epoch 2/3
Epoch 3/3

Evaluating model: xlm-roberta-base

Polarity Classification Report for xlm-roberta-base:
              precision    recall  f1-score   support

    negative       0.56      0.68      0.62       430
     neutral       0.48      0.57      0.52       431
    positive       0.71      0.45      0.55       431

    accuracy                           0.56      1292
   macro avg       0.59      0.56      0.56      1292
weighted avg       0.59      0.56      0.56      1292





Model and tokenizer saved to ./fine_tuned_models/xlm-roberta-base_polarity

All models have been trained and evaluated.


KeyError: 'polarity_accuracy'

<Figure size 1200x600 with 0 Axes>