In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from transformers import BertTokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import nltk

# -------------------------------
# 0. Environment Setup
# -------------------------------

# Set random seeds for reproducibility
def set_seed(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Suppress TensorFlow warnings for cleaner output
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download NLTK resources
nltk.download('punkt')

# Initialize stopwords and lemmatizer
# Note: NLTK does not include Bengali stopwords by default
# Use a custom list or skip stopword removal if unavailable
try:
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('bengali'))
except OSError:
    print("Bengali stopwords not found. Skipping stopword removal.")
    stop_words = set()

# Initialize lemmatizer (WordNetLemmatizer is for English)
# Consider removing lemmatization for Bengali or use a Bengali-specific lemmatizer
# lemmatizer = WordNetLemmatizer()

# -------------------------------
# 1. Load and Preprocess the Dataset
# -------------------------------

# Load the dataset
df = pd.read_csv(r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv")
df = df[['Text', 'Category', 'Polarity']]

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep only Bengali characters
    text = re.sub(r'\d+', '', text)                 # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()        # Remove extra spaces
    words = text.split()
    if stop_words:
        # Remove stopwords
        words = [word for word in words if word not in stop_words]
    # Optionally, remove lemmatization if not suitable for Bengali
    # words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply cleaning
df['Text'] = df['Text'].apply(clean_text)

# -------------------------------
# 2. Upsampling for Class Balance
# -------------------------------

def upsample(df, target_column):
    max_count = df[target_column].value_counts().max()
    upsampled_dfs = []
    for label in df[target_column].unique():
        df_label = df[df[target_column] == label]
        if len(df_label) < max_count:
            df_upsampled = resample(
                df_label,
                replace=True,
                n_samples=max_count,
                random_state=42
            )
            upsampled_dfs.append(df_upsampled)
        else:
            upsampled_dfs.append(df_label)
    return pd.concat(upsampled_dfs)

# Upsample 'Category' and 'Polarity' separately
# Note: Upsampling both can lead to a large dataset
df = upsample(df, 'Category')
df = upsample(df, 'Polarity')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# -------------------------------
# 3. Encode Labels
# -------------------------------

label_encoder_cat = LabelEncoder()
label_encoder_pol = LabelEncoder()

df['Category_encoded'] = label_encoder_cat.fit_transform(df['Category'])
df['Polarity_encoded'] = label_encoder_pol.fit_transform(df['Polarity'])

# Display encoded labels
print("Encoded Category and Polarity:")
print(df[['Category', 'Category_encoded', 'Polarity', 'Polarity_encoded']].head())

# -------------------------------
# 4. Tokenize the Text
# -------------------------------

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_data(df, max_length=128):
    input_ids = []
    attention_masks = []
    for sentence in df['Text']:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    # Convert lists to tensors
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    return input_ids, attention_masks

input_ids, attention_masks = tokenize_data(df)

# -------------------------------
# 5. Split the Data
# -------------------------------

# Split the data into training and testing sets
X_train, X_test, att_mask_train, att_mask_test, y_cat_train, y_cat_test, y_pol_train, y_pol_test = train_test_split(
    input_ids.numpy(), attention_masks.numpy(),
    df['Category_encoded'].values, df['Polarity_encoded'].values,
    test_size=0.2, random_state=42, stratify=df[['Category_encoded', 'Polarity_encoded']]
)

# -------------------------------
# 6. One-Hot Encode the Labels
# -------------------------------

num_cat_classes = len(label_encoder_cat.classes_)
num_pol_classes = len(label_encoder_pol.classes_)

y_cat_train = to_categorical(y_cat_train, num_classes=num_cat_classes)
y_cat_test = to_categorical(y_cat_test, num_classes=num_cat_classes)
y_pol_train = to_categorical(y_pol_train, num_classes=num_pol_classes)
y_pol_test = to_categorical(y_pol_test, num_classes=num_pol_classes)

# -------------------------------
# 7. Define the Multi-Task LSTM Model
# -------------------------------

# Define input layers
input_ids_layer = Input(shape=(128,), dtype='int32', name='input_ids')
attention_mask_layer = Input(shape=(128,), dtype='int32', name='attention_mask')

# Embedding layer with mask_zero=True to handle padding
embedding_layer = Embedding(
    input_dim=tokenizer.vocab_size, 
    output_dim=128, 
    input_length=128, 
    mask_zero=True
)(input_ids_layer)

# Shared LSTM layer (unidirectional)
lstm_layer = LSTM(128, return_sequences=False)(embedding_layer)
dropout_layer = Dropout(0.3)(lstm_layer)

# Task-specific Dense layers
category_output = Dense(num_cat_classes, activation='softmax', name='Category')(dropout_layer)
polarity_output = Dense(num_pol_classes, activation='softmax', name='Polarity')(dropout_layer)

# Define the model
model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=[category_output, polarity_output])

# -------------------------------
# 8. Compile the Model
# -------------------------------

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss={
        'Category': 'categorical_crossentropy',
        'Polarity': 'categorical_crossentropy'
    },
    metrics={
        'Category': 'accuracy',
        'Polarity': 'accuracy'
    }
)

# Display the model summary
print(model.summary())

# -------------------------------
# 9. Train the Model with EarlyStopping
# -------------------------------

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    [X_train, att_mask_train],
    {'Category': y_cat_train, 'Polarity': y_pol_train},
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

# -------------------------------
# 10. Evaluate the Model
# -------------------------------

results = model.evaluate(
    [X_test, att_mask_test],
    {'Category': y_cat_test, 'Polarity': y_pol_test}
)

print(f"Test Loss and Accuracy: {results}")

# -------------------------------
# 11. Classification Report with Macro Metrics
# -------------------------------

# Predict on test data
predictions = model.predict([X_test, att_mask_test])

# Convert predictions and true labels from one-hot to integer labels
y_cat_pred = np.argmax(predictions[0], axis=1)
y_pol_pred = np.argmax(predictions[1], axis=1)

y_cat_true = np.argmax(y_cat_test, axis=1)
y_pol_true = np.argmax(y_pol_test, axis=1)

# Generate classification reports
report_cat = classification_report(
    y_cat_true, y_cat_pred, 
    target_names=label_encoder_cat.classes_, 
    digits=4,
    zero_division=0
)

report_pol = classification_report(
    y_pol_true, y_pol_pred, 
    target_names=label_encoder_pol.classes_, 
    digits=4,
    zero_division=0
)

print("Classification Report for Category:")
print(report_cat)

print("Classification Report for Polarity:")
print(report_pol)

# Extract and print macro precision, recall, F1-score
def extract_macro_metrics(report):
    report_dict = classification_report(
        y_true, y_pred, 
        target_names=label_encoder.classes_, 
        digits=4,
        zero_division=0,
        output_dict=True
    )
    macro_p = report_dict['macro avg']['precision']
    macro_r = report_dict['macro avg']['recall']
    macro_f1 = report_dict['macro avg']['f1-score']
    return macro_p, macro_r, macro_f1

# For Category
report_cat_dict = classification_report(
    y_cat_true, y_cat_pred, 
    target_names=label_encoder_cat.classes_, 
    digits=4,
    zero_division=0,
    output_dict=True
)

macro_p_cat = report_cat_dict['macro avg']['precision']
macro_r_cat = report_cat_dict['macro avg']['recall']
macro_f1_cat = report_cat_dict['macro avg']['f1-score']

print("\nMacro Metrics for Category:")
print(f"Precision: {macro_p_cat:.4f}")
print(f"Recall:    {macro_r_cat:.4f}")
print(f"F1-Score:  {macro_f1_cat:.4f}")

# For Polarity
report_pol_dict = classification_report(
    y_pol_true, y_pol_pred, 
    target_names=label_encoder_pol.classes_, 
    digits=4,
    zero_division=0,
    output_dict=True
)

macro_p_pol = report_pol_dict['macro avg']['precision']
macro_r_pol = report_pol_dict['macro avg']['recall']
macro_f1_pol = report_pol_dict['macro avg']['f1-score']

print("\nMacro Metrics for Polarity:")
print(f"Precision: {macro_p_pol:.4f}")
print(f"Recall:    {macro_r_pol:.4f}")
print(f"F1-Score:  {macro_f1_pol:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Encoded Category and Polarity:
          Category  Category_encoded  Polarity  Polarity_encoded
0          bowling                 1  positive                 2
1          bowling                 1  positive                 2
2          batting                 0   neutral                 1
3             team                 3   neutral                 1
4  team management                 4   neutral                 1




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 128, 128)     15302016    ['input_ids[0][0]']              
                                                                                                  
 lstm (LSTM)                    (None, 128)          131584      ['embedding[0][0]']              
                                                                                                  
 dropout (Dropout)              (None, 128)          0           ['lstm[0][0]']                   
                                                                                              

In [20]:
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, Multiply, Lambda

In [21]:
# Define the multi-task GRU model
input_ids_layer = Input(shape=(128,), dtype='int32', name='input_ids')
attention_mask_layer = Input(shape=(128,), dtype='int32', name='attention_mask')

# Embedding layer
embedding_layer = Embedding(
    input_dim=tokenizer.vocab_size, 
    output_dim=128, 
    input_length=128, 
    mask_zero=True
)(input_ids_layer)

# Apply the attention mask to the embeddings
mask_float = Lambda(lambda x: K.cast(x, dtype='float32'))(attention_mask_layer)
masked_embedding = Multiply()([embedding_layer, mask_float])

# Shared GRU layers
gru_layer = GRU(128, return_sequences=False)(masked_embedding)
dropout_layer = Dropout(0.3)(gru_layer)

# Task-specific output layers
category_output = Dense(len(y_cat_train[0]), activation='softmax', name='Category')(dropout_layer)
polarity_output = Dense(len(y_polarity_train[0]), activation='softmax', name='Polarity')(dropout_layer)

# Define the model
model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=[category_output, polarity_output])

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss={'Category': 'categorical_crossentropy', 'Polarity': 'categorical_crossentropy'},
    metrics={'Category': 'accuracy', 'Polarity': 'accuracy'}
)

# Train the model with EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    [X_train, att_mask_train],
    {'Category': y_cat_train, 'Polarity': y_polarity_train},
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

# Evaluate the model
results = model.evaluate(
    [X_test, att_mask_test],
    {'Category': y_cat_test, 'Polarity': y_polarity_test}
)

print(f"Test Loss and Accuracy: {results}")

# Classification Report
predictions = model.predict([X_test, att_mask_test])
y_cat_pred = np.argmax(predictions[0], axis=1)
y_polarity_pred = np.argmax(predictions[1], axis=1)

print("Classification Report for Category:")
print(classification_report(np.argmax(y_cat_test, axis=1), y_cat_pred))

print("\nClassification Report for Polarity:")
print(classification_report(np.argmax(y_polarity_test, axis=1), y_polarity_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss and Accuracy: [2.5316624641418457, 1.4876947402954102, 1.0439684391021729, 0.3333333432674408, 0.5271428823471069]
Classification Report for Category:
              precision    recall  f1-score   support

           0       0.43      0.10      0.17       458
           1       0.34      0.96      0.50       583
           2       0.21      0.01      0.02       343
           3       0.29      0.22      0.25       406
           4       0.00      0.00      0.00       310

    accuracy                           0.33      2100
   macro avg       0.25      0.26      0.19      2100
weighted avg       0.28      0.33      0.23      2100


Classification Report for Polarity:
              precision    recall  f1-score   support

           0       0.50      0.68      0.58       718
           1       0.49      0.26      0.34       689
           2       0.58      0.64      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.layers import Layer, GRU, Bidirectional, Dense, Input, Reshape, GlobalAveragePooling1D
import nltk
from nltk.corpus import wordnet
import random
from lime.lime_text import LimeTextExplainer

nltk.download('punkt')
df = pd.read_csv(r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv")
df.head()
df = df[['Text', 'Category', 'Polarity']]
df.head()
df['Category'].value_counts()
df['Polarity'].value_counts()



import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize Bengali stopwords and lemmatizer
stop_words = set(stopwords.words('bengali'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep only Bengali characters
    text = re.sub(r'\d+', '', text)                 # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()        # Remove extra spaces

    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

df['Text'] = df['Text'].apply(clean_text)
df.head()
from sklearn.utils import resample

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category' and 'Polarity'
df_upsampled_category = upsample(df, 'Category')
df_upsampled_polarity = upsample(df_upsampled_category, 'Polarity')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled_polarity.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())
print("\nPolarity distribution after upsampling:")
print(df_upsampled['Polarity'].value_counts())

df_upsampled.head()
from sklearn.preprocessing import LabelEncoder

category_encoder = LabelEncoder()
polarity_encoder = LabelEncoder()

df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])
df_upsampled['Polarity_encoded'] = polarity_encoder.fit_transform(df_upsampled['Polarity'])

# Tokenize the text using DistilBERT with padding and truncation
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='np')

df_upsampled['tokens'] = df_upsampled['Text'].apply(lambda x: tokenize_function(x))

# Train-test split
train_df, test_df = train_test_split(df_upsampled, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)


# Convert to TensorFlow Dataset
def create_tensor_dataset(df):
    # Tokenize input text and convert to TensorFlow tensors
    inputs = tokenizer(list(df['Text']), padding='max_length', truncation=True, max_length=128, return_tensors='tf')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Convert labels to tensors
    labels_category = tf.convert_to_tensor(df['Category_encoded'].values)
    labels_polarity = tf.convert_to_tensor(df['Polarity_encoded'].values)

    return tf.data.Dataset.from_tensor_slices(((input_ids, attention_mask), (labels_category, labels_polarity)))

def tokenize_data(df_upsampled, max_length=128):
    input_ids = []
    attention_masks = []
    
    for sentence in df_upsampled['Text']:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True, 
            max_length=max_length,    
            padding='max_length',    
            truncation=True,           
            return_attention_mask=True, 
            return_tensors='tf'        
        )
        
        # Append to lists
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    # Convert lists to tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    
    # Squeeze the extra dimension
    input_ids = tf.squeeze(input_ids, axis=1)
    attention_masks = tf.squeeze(attention_masks, axis=1)
    
    return input_ids, attention_masks

input_ids, attention_masks = tokenize_data(df_upsampled)

label_1 = tf.convert_to_tensor(df_upsampled['Category'])
label_2 = tf.convert_to_tensor(df_upsampled['Polarity'])

print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention masks shape: {attention_masks.shape}")
print(f"Label 1 (Category) shape: {label_1.shape}")
print(f"Label 2 (Polarity) shape: {label_2.shape}")
# Ensure input_ids and attention_masks are converted to integer type tensors
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
attention_masks = tf.convert_to_tensor(attention_masks, dtype=tf.int32)


import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize label encoders for the string labels
label_encoder_1 = LabelEncoder()
label_encoder_2 = LabelEncoder()

# Encode string labels into integers
df_upsampled['Category'] = label_encoder_1.fit_transform(df_upsampled['Category'])
df_upsampled['Polarity'] = label_encoder_2.fit_transform(df_upsampled['Polarity'])

# Convert labels to TensorFlow tensors
label_1 = tf.convert_to_tensor(df_upsampled['Category'], dtype=tf.int32)
label_2 = tf.convert_to_tensor(df_upsampled['Polarity'], dtype=tf.int32)

# Ensure input_ids and attention_masks are correctly formatted as tensors
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
attention_masks = tf.convert_to_tensor(attention_masks, dtype=tf.int32)

# Split the data into training and testing sets
X_train, X_test, att_mask_train, att_mask_test, y_cat_train, y_cat_test, y_gender_train, y_gender_test = train_test_split(
    input_ids.numpy(), attention_masks.numpy(),
    label_1.numpy(), label_2.numpy(),  
    test_size=0.2, random_state=42
)
input_shape = X_train.shape[1]


import tensorflow as tf

def create_bilstm_model(input_shape):
    input_ids = tf.keras.layers.Input(shape=(input_shape,), dtype='int32', name='input_ids')
    attention_masks = tf.keras.layers.Input(shape=(input_shape,), dtype='int32', name='attention_masks')

    # Embedding layer
    embedding_layer = tf.keras.layers.Embedding(input_dim=tokenizer.vocab_size, output_dim=128)(input_ids)

    # First BiLSTM layer with dropout
    lstm_output = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(128, return_sequences=True, dropout=0.3))(embedding_layer)

    # Second BiLSTM layer
    lstm_output_2 = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(64, return_sequences=False, dropout=0.3))(lstm_output)

    # Dense layer before output layers
    dense_layer = tf.keras.layers.Dense(64, activation='relu')(lstm_output_2)

    # Dropout layer for regularization
    dropout_layer = tf.keras.layers.Dropout(0.3)(dense_layer)

    # Output layers for multi-task learning
    output_category = tf.keras.layers.Dense(5, activation='softmax', name='Category')(dropout_layer)
    output_polarity = tf.keras.layers.Dense(4, activation='softmax', name='Polarity')(dropout_layer)

    # Define the model with inputs and outputs
    model = tf.keras.Model(inputs=[input_ids, attention_masks],
                           outputs=[output_category, output_polarity])

    return model

# Instantiate the BiLSTM model
bilstm_model = create_bilstm_model(input_shape)

# Compile the model
bilstm_model.compile(
    optimizer='adam',
    loss={'Category': 'sparse_categorical_crossentropy', 'Polarity': 'sparse_categorical_crossentropy'},
    metrics={'Category': 'accuracy', 'Polarity': 'accuracy'}
)

# Display the model summary
bilstm_model.summary()


from tensorflow.keras.callbacks import EarlyStopping
         
# Define EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Stop training after 3 epochs of no improvement
    restore_best_weights=True  # Restore the best weights from the epoch with the lowest validation loss
)

# Train the model with EarlyStopping
history = bilstm_model.fit(
    [X_train, att_mask_train],  # Inputs
    {'Category': y_cat_train, 'Polarity': y_gender_train},  # Outputs
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]  # Add the EarlyStopping callback
)
# Evaluate the model on the test set
results = bilstm_model.evaluate(
    [X_test, att_mask_test],
    {'Category': y_cat_test, 'Polarity': y_gender_test}
)

print(f"Test Loss and Accuracy: {results}")


from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np

# Step 1: Get Predictions
predictions = bilstm_model.predict([X_test, att_mask_test])

# Step 2: Convert predictions to class labels
y_cat_pred = np.argmax(predictions[0], axis=1)   # 'Category' prediction
y_gender_pred = np.argmax(predictions[1], axis=1)  # 'Polarity' prediction

# Step 3: Generate Classification Report with zero_division specified
# For Category
print("Classification Report for Category:")
print(classification_report(y_cat_test, y_cat_pred, zero_division=0))

# For Polarity
print("\nClassification Report for Polarity:")
print(classification_report(y_gender_test, y_gender_pred, zero_division=0))

# If you want the macro-averaged precision, recall, and F1 scores separately:
cat_precision = precision_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)
cat_recall = recall_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)
cat_f1 = f1_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)

gender_precision = precision_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)
gender_recall = recall_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)
gender_f1 = f1_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)

print("\nMacro-Averaged Scores for Category:")
print(f"Precision: {cat_precision:.4f}, Recall: {cat_recall:.4f}, F1 Score: {cat_f1:.4f}")

print("\nMacro-Averaged Scores for Polarity:")
print(f"Precision: {gender_precision:.4f}, Recall: {gender_recall:.4f}, F1 Score: {gender_f1:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Category distribution after upsampling:
Category
bowling            2799
batting            2226
team               2094
other              1913
team management    1468
Name: count, dtype: int64

Polarity distribution after upsampling:
Polarity
negative    3500
neutral     3500
positive    3500
Name: count, dtype: int64




Input IDs shape: (10500, 128)
Attention masks shape: (10500, 128)
Label 1 (Category) shape: (10500,)
Label 2 (Polarity) shape: (10500,)
Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 embedding_20 (Embedding)       (None, 128, 128)     15302016    ['input_ids[0][0]']              
                                                                                                  
 bidirectional_9 (Bidirectional  (None, 128, 256)    198144      ['embedding_20[0][0]']           
 )                                                                                                
                                                      

In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.layers import Layer, LSTM, Bidirectional, Dense, Input, Reshape, GlobalAveragePooling1D
import nltk
from nltk.corpus import wordnet
import random
from lime.lime_text import LimeTextExplainer

nltk.download('punkt')
df = pd.read_csv(r"F:\Context-Resonance Transformer\Cricket\Cricket - Sheet1.csv")
df.head()
df = df[['Text', 'Category', 'Polarity']]
df.head()
df['Category'].value_counts()
df['Polarity'].value_counts()



import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize Bengali stopwords and lemmatizer
stop_words = set(stopwords.words('bengali'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep only Bengali characters
    text = re.sub(r'\d+', '', text)                 # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()        # Remove extra spaces

    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

df['Text'] = df['Text'].apply(clean_text)
df.head()
from sklearn.utils import resample

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category' and 'Polarity'
df_upsampled_category = upsample(df, 'Category')
df_upsampled_polarity = upsample(df_upsampled_category, 'Polarity')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled_polarity.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())
print("\nPolarity distribution after upsampling:")
print(df_upsampled['Polarity'].value_counts())

df_upsampled.head()
from sklearn.preprocessing import LabelEncoder

category_encoder = LabelEncoder()
polarity_encoder = LabelEncoder()

df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])
df_upsampled['Polarity_encoded'] = polarity_encoder.fit_transform(df_upsampled['Polarity'])

# Tokenize the text using DistilBERT with padding and truncation
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='np')

df_upsampled['tokens'] = df_upsampled['Text'].apply(lambda x: tokenize_function(x))

# Train-test split
train_df, test_df = train_test_split(df_upsampled, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)


# Convert to TensorFlow Dataset
def create_tensor_dataset(df):
    # Tokenize input text and convert to TensorFlow tensors
    inputs = tokenizer(list(df['Text']), padding='max_length', truncation=True, max_length=128, return_tensors='tf')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Convert labels to tensors
    labels_category = tf.convert_to_tensor(df['Category_encoded'].values)
    labels_polarity = tf.convert_to_tensor(df['Polarity_encoded'].values)

    return tf.data.Dataset.from_tensor_slices(((input_ids, attention_mask), (labels_category, labels_polarity)))

def tokenize_data(df_upsampled, max_length=128):
    input_ids = []
    attention_masks = []
    
    for sentence in df_upsampled['Text']:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True, 
            max_length=max_length,    
            padding='max_length',    
            truncation=True,           
            return_attention_mask=True, 
            return_tensors='tf'        
        )
        
        # Append to lists
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    # Convert lists to tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    
    # Squeeze the extra dimension
    input_ids = tf.squeeze(input_ids, axis=1)
    attention_masks = tf.squeeze(attention_masks, axis=1)
    
    return input_ids, attention_masks

input_ids, attention_masks = tokenize_data(df_upsampled)

label_1 = tf.convert_to_tensor(df_upsampled['Category'])
label_2 = tf.convert_to_tensor(df_upsampled['Polarity'])

print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention masks shape: {attention_masks.shape}")
print(f"Label 1 (Category) shape: {label_1.shape}")
print(f"Label 2 (Polarity) shape: {label_2.shape}")
# Ensure input_ids and attention_masks are converted to integer type tensors
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
attention_masks = tf.convert_to_tensor(attention_masks, dtype=tf.int32)


import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize label encoders for the string labels
label_encoder_1 = LabelEncoder()
label_encoder_2 = LabelEncoder()

# Encode string labels into integers
df_upsampled['Category'] = label_encoder_1.fit_transform(df_upsampled['Category'])
df_upsampled['Polarity'] = label_encoder_2.fit_transform(df_upsampled['Polarity'])

# Convert labels to TensorFlow tensors
label_1 = tf.convert_to_tensor(df_upsampled['Category'], dtype=tf.int32)
label_2 = tf.convert_to_tensor(df_upsampled['Polarity'], dtype=tf.int32)

# Ensure input_ids and attention_masks are correctly formatted as tensors
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
attention_masks = tf.convert_to_tensor(attention_masks, dtype=tf.int32)

# Split the data into training and testing sets
X_train, X_test, att_mask_train, att_mask_test, y_cat_train, y_cat_test, y_gender_train, y_gender_test = train_test_split(
    input_ids.numpy(), attention_masks.numpy(),
    label_1.numpy(), label_2.numpy(),  
    test_size=0.2, random_state=42
)
input_shape = X_train.shape[1]


import tensorflow as tf

def create_bilstm_model(input_shape):
    input_ids = tf.keras.layers.Input(shape=(input_shape,), dtype='int32', name='input_ids')
    attention_masks = tf.keras.layers.Input(shape=(input_shape,), dtype='int32', name='attention_masks')

    # Embedding layer
    embedding_layer = tf.keras.layers.Embedding(input_dim=tokenizer.vocab_size, output_dim=128)(input_ids)

    # First BiLSTM layer with dropout
    lstm_output = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.3))(embedding_layer)

    # Second BiLSTM layer
    lstm_output_2 = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=False, dropout=0.3))(lstm_output)

    # Dense layer before output layers
    dense_layer = tf.keras.layers.Dense(64, activation='relu')(lstm_output_2)

    # Dropout layer for regularization
    dropout_layer = tf.keras.layers.Dropout(0.3)(dense_layer)

    # Output layers for multi-task learning
    output_category = tf.keras.layers.Dense(5, activation='softmax', name='Category')(dropout_layer)
    output_polarity = tf.keras.layers.Dense(4, activation='softmax', name='Polarity')(dropout_layer)

    # Define the model with inputs and outputs
    model = tf.keras.Model(inputs=[input_ids, attention_masks],
                           outputs=[output_category, output_polarity])

    return model

# Instantiate the BiLSTM model
bilstm_model = create_bilstm_model(input_shape)

# Compile the model
bilstm_model.compile(
    optimizer='adam',
    loss={'Category': 'sparse_categorical_crossentropy', 'Polarity': 'sparse_categorical_crossentropy'},
    metrics={'Category': 'accuracy', 'Polarity': 'accuracy'}
)

# Display the model summary
bilstm_model.summary()


from tensorflow.keras.callbacks import EarlyStopping
         
# Define EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Stop training after 3 epochs of no improvement
    restore_best_weights=True  # Restore the best weights from the epoch with the lowest validation loss
)

# Train the model with EarlyStopping
history = bilstm_model.fit(
    [X_train, att_mask_train],  # Inputs
    {'Category': y_cat_train, 'Polarity': y_gender_train},  # Outputs
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]  # Add the EarlyStopping callback
)
# Evaluate the model on the test set
results = bilstm_model.evaluate(
    [X_test, att_mask_test],
    {'Category': y_cat_test, 'Polarity': y_gender_test}
)

print(f"Test Loss and Accuracy: {results}")


from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np

# Step 1: Get Predictions
predictions = bilstm_model.predict([X_test, att_mask_test])

# Step 2: Convert predictions to class labels
y_cat_pred = np.argmax(predictions[0], axis=1)   # 'Category' prediction
y_gender_pred = np.argmax(predictions[1], axis=1)  # 'Polarity' prediction

# Step 3: Generate Classification Report with zero_division specified
# For Category
print("Classification Report for Category:")
print(classification_report(y_cat_test, y_cat_pred, zero_division=0))

# For Polarity
print("\nClassification Report for Polarity:")
print(classification_report(y_gender_test, y_gender_pred, zero_division=0))

# If you want the macro-averaged precision, recall, and F1 scores separately:
cat_precision = precision_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)
cat_recall = recall_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)
cat_f1 = f1_score(y_cat_test, y_cat_pred, average='macro', zero_division=0)

gender_precision = precision_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)
gender_recall = recall_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)
gender_f1 = f1_score(y_gender_test, y_gender_pred, average='macro', zero_division=0)

print("\nMacro-Averaged Scores for Category:")
print(f"Precision: {cat_precision:.4f}, Recall: {cat_recall:.4f}, F1 Score: {cat_f1:.4f}")

print("\nMacro-Averaged Scores for Polarity:")
print(f"Precision: {gender_precision:.4f}, Recall: {gender_recall:.4f}, F1 Score: {gender_f1:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Category distribution after upsampling:
Category
bowling            2799
batting            2226
team               2094
other              1913
team management    1468
Name: count, dtype: int64

Polarity distribution after upsampling:
Polarity
negative    3500
neutral     3500
positive    3500
Name: count, dtype: int64




Input IDs shape: (10500, 128)
Attention masks shape: (10500, 128)
Label 1 (Category) shape: (10500,)
Label 2 (Polarity) shape: (10500,)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 128, 128)     15302016    ['input_ids[0][0]']              
                                                                                                  
 bidirectional (Bidirectional)  (None, 128, 256)     263168      ['embedding[0][0]']              
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 128)         1643