In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.layers import Layer, GRU, Bidirectional, Dense, Input, Reshape, GlobalAveragePooling1D
import nltk
from nltk.corpus import wordnet
import random
from lime.lime_text import LimeTextExplainer

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
df = pd.read_csv("./Cricket - Sheet1.csv")
df = df[['Text', 'Category', 'Polarity']]
df.head()

Unnamed: 0,Text,Category,Polarity
0,জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...,other,positive
1,জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...,team,positive
2,বাংলাদেশের পরে ভারতের সাপর্ট ই করি ?,team,positive
3,সৌম্যকে বাদ দেওয়া হোক,batting,negative
4,"প্রথমটি হচ্ছে, কোচ অত:পর সাকিব,সাকিব আর সাকিবর...",team,positive


In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize Bengali stopwords and lemmatizer
stop_words = set(stopwords.words('bengali'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep only Bengali characters
    text = re.sub(r'\d+', '', text)                 # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()        # Remove extra spaces

    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

df['Text'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,Text,Category,Polarity
0,জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...,other,positive
1,জয় বাংলা কাপ স্বাধীনতার মাস মার্চে মাথা চমৎকার...,team,positive
2,বাংলাদেশের ভারতের সাপর্ট,team,positive
3,সৌম্যকে বাদ,batting,negative
4,প্রথমটি কোচ অতপর সাকিবসাকিব সাকিবরে দলে,team,positive


In [15]:
from sklearn.utils import resample

# Define a function to perform random upsampling
def upsample(df, target_column):
    # Get the maximum count of samples in any class
    max_count = df[target_column].value_counts().max()

    # Separate each class and upsample the minority classes
    upsampled_dfs = []
    for label in df[target_column].unique():
        # Get samples for the current label
        df_label = df[df[target_column] == label]

        # Upsample minority classes to match the majority class count
        df_upsampled = resample(
            df_label,
            replace=True,            # Sample with replacement
            n_samples=max_count,     # Match the number of samples in the majority class
            random_state=42          # Set random seed for reproducibility
        )
        upsampled_dfs.append(df_upsampled)

    # Combine the upsampled DataFrames
    return pd.concat(upsampled_dfs)

# Apply upsampling to 'Category' and 'Polarity'
df_upsampled_category = upsample(df, 'Category')
df_upsampled_polarity = upsample(df_upsampled_category, 'Polarity')

# Shuffle the DataFrame to mix the resampled classes
df_upsampled = df_upsampled_polarity.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Category distribution after upsampling:")
print(df_upsampled['Category'].value_counts())
print("\nPolarity distribution after upsampling:")
print(df_upsampled['Polarity'].value_counts())


Category distribution after upsampling:
Category
bowling            2799
batting            2226
team               2094
other              1913
team management    1468
Name: count, dtype: int64

Polarity distribution after upsampling:
Polarity
negative    3500
neutral     3500
positive    3500
Name: count, dtype: int64


In [16]:
from sklearn.preprocessing import LabelEncoder

category_encoder = LabelEncoder()
polarity_encoder = LabelEncoder()

df_upsampled['Category_encoded'] = category_encoder.fit_transform(df_upsampled['Category'])
df_upsampled['Polarity_encoded'] = polarity_encoder.fit_transform(df_upsampled['Polarity'])

In [17]:
# Tokenize the text using DistilBERT with padding and truncation
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [18]:
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='np')

df_upsampled['tokens'] = df_upsampled['Text'].apply(lambda x: tokenize_function(x))

# Train-test split
train_df, test_df = train_test_split(df_upsampled, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [19]:
# Convert to TensorFlow Dataset
def create_tensor_dataset(df):
    # Tokenize input text and convert to TensorFlow tensors
    inputs = tokenizer(list(df['Text']), padding='max_length', truncation=True, max_length=128, return_tensors='tf')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Convert labels to tensors
    labels_category = tf.convert_to_tensor(df['Category_encoded'].values)
    labels_polarity = tf.convert_to_tensor(df['Polarity_encoded'].values)

    return tf.data.Dataset.from_tensor_slices(((input_ids, attention_mask), (labels_category, labels_polarity)))

In [20]:
# Create train and validation datasets
train_dataset = create_tensor_dataset(train_df)
val_dataset = create_tensor_dataset(val_df)

In [21]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GRU, GlobalAveragePooling1D
from tensorflow.keras import Model
from transformers import DistilBertTokenizer, TFDistilBertModel

In [22]:
# Initialize tokenizer and BERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize inputs
def tokenize_inputs(texts, max_length=128):
    inputs = tokenizer(
        texts, return_tensors="tf", padding="max_length",
        truncation=True, max_length=max_length
    )
    return inputs['input_ids'], inputs['attention_mask']

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [23]:
# GCN Layer with Residual Connections
class GCNLayer(tf.keras.layers.Layer):
    def __init__(self, units, activation='relu', **kwargs):
        super(GCNLayer, self).__init__(**kwargs)
        self.units = units
        self.activation = activation
        self.dense = Dense(units, use_bias=False)

    def call(self, features, adj_matrix):
        # Normalize the adjacency matrix
        adj_matrix = self.normalize_adjacency_matrix(adj_matrix)

        # Perform graph convolution: H' = A*X*W
        h = tf.matmul(adj_matrix, features)  # A*X
        h = self.dense(h)  # Apply weight matrix W

        # Apply the activation function
        if self.activation:
            h = tf.keras.activations.get(self.activation)(h)

        # Add residual connection: Output = H + features (node-to-node residual)
        h = h + features

        return h

    def normalize_adjacency_matrix(self, adj_matrix):
        """ Normalize adjacency matrix by adding self-loops and applying symmetric normalization """
        # Add self-loops (identity matrix)
        batch_size = tf.shape(adj_matrix)[0]
        num_tokens = tf.shape(adj_matrix)[1]
    
        adj_matrix = adj_matrix + tf.eye(num_tokens, batch_shape=[batch_size], dtype=adj_matrix.dtype)
    
        # Symmetric normalization: D^(-1/2) * A * D^(-1/2)
        degree_matrix = tf.reduce_sum(adj_matrix, axis=-1)
        degree_matrix_inv_sqrt = tf.pow(degree_matrix + 1e-6, -0.5)
        degree_matrix_inv_sqrt = tf.linalg.diag(degree_matrix_inv_sqrt)
    
        adj_matrix_normalized = tf.matmul(degree_matrix_inv_sqrt, adj_matrix)
        adj_matrix_normalized = tf.matmul(adj_matrix_normalized, degree_matrix_inv_sqrt)
    
        return adj_matrix_normalized

In [24]:
# Hierarchical GNN with Task-Specific Heads
class HierarchicalGNN(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(HierarchicalGNN, self).__init__(**kwargs)
        self.shared_gcn = GCNLayer(units)
        self.task_specific_gcn = GCNLayer(units)
        self.pooling = GlobalAveragePooling1D()

    def call(self, features, adj_matrix):
        shared_features = self.shared_gcn(features, adj_matrix)
        task_specific_features = self.task_specific_gcn(shared_features, adj_matrix)
        return task_specific_features
    
class GRUFusionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(GRUFusionLayer, self).__init__(**kwargs)
        # Ensure GRU output matches adjusted features dimension (128)
        self.gru = GRU(128, return_sequences=True)

    def call(self, x):
        return self.gru(x)

In [25]:
# MultiTask GNN Model with BERT and Residual Connections
class MultiTaskGNNModel(Model):
    def __init__(self, gnn_units, category_output_size, polarity_output_size, num_heads, **kwargs):
        super(MultiTaskGNNModel, self).__init__(**kwargs)
        self.distilbert = bert_model
        self.gru_fusion = GRUFusionLayer(gnn_units)

        # Multi-Head Attention Layer
        self.multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=64)

        # Pooling layers to reduce the sequence dimension
        self.category_pooling = GlobalAveragePooling1D()
        self.polarity_pooling = GlobalAveragePooling1D()

        # Separate task-specific Dense layers for final outputs
        self.category_output_layer = Dense(category_output_size, activation='softmax', name='category_output')
        self.polarity_output_layer = Dense(polarity_output_size, activation='softmax', name='polarity_output')

        # Projection layer to reduce BERT embedding dimensionality to match attention mechanism
        self.feature_adjustment_layer = Dense(128, activation='relu')

        # Hierarchical GNN layers for each task with residual connection support
        self.category_gnn = HierarchicalGNN(units=gnn_units)
        self.polarity_gnn = HierarchicalGNN(units=gnn_units)

        self.num_heads = num_heads

    def call(self, input_ids, attention_mask):
        # Pass through DistilBERT to get embeddings
        bert_output = self.distilbert(input_ids, attention_mask=attention_mask).last_hidden_state

        # Project BERT embeddings to match the GNN units dimension
        adjusted_features = self.feature_adjustment_layer(bert_output)

        # Create dynamic adjacency matrix using multi-head attention
        adj_matrix = self.adjacency_matrix(adjusted_features)

        # GRU Fusion for Contextualized Embeddings
        fused_features = self.gru_fusion(adjusted_features)

        # Adding a residual connection between GRU output and original BERT embeddings
        fused_features_with_residual = fused_features + adjusted_features

        # Multi-Task GNN Heads with residual connections
        category_features = self.category_gnn(fused_features_with_residual, adj_matrix)
        category_features_with_residual = category_features + fused_features_with_residual

        polarity_features = self.polarity_gnn(fused_features_with_residual, adj_matrix)
        polarity_features_with_residual = polarity_features + fused_features_with_residual

        # Pooling to collapse the sequence dimension
        category_features_pooled = self.category_pooling(category_features_with_residual)
        polarity_features_pooled = self.polarity_pooling(polarity_features_with_residual)

        # Final task-specific outputs
        category_output = self.category_output_layer(category_features_pooled)
        polarity_output = self.polarity_output_layer(polarity_features_pooled)

        return category_output, polarity_output

    def adjacency_matrix(self, bert_embeddings):
        # Multi-Head Attention to create adjacency matrix
        attention_output = self.multi_head_attention(query=bert_embeddings, key=bert_embeddings, value=bert_embeddings)
        adj_matrix = tf.nn.softmax(attention_output, axis=-1)

        # identity matrix for self-loops
        batch_size = tf.shape(adj_matrix)[0]
        num_tokens = tf.shape(adj_matrix)[1]
        identity_matrix = tf.eye(num_tokens, batch_shape=[batch_size], dtype=adj_matrix.dtype)
        adj_matrix = adj_matrix + identity_matrix

        return adj_matrix

In [26]:
input_ids = Input(shape=(128,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(128,), dtype=tf.int32, name='attention_mask')

gnn_units = 128
category_output_size = 5  
polarity_output_size = 3 
num_heads = 4

multi_task_gnn_model = MultiTaskGNNModel(
    gnn_units=gnn_units,
    num_heads=num_heads,
    category_output_size=category_output_size,
    polarity_output_size=polarity_output_size
)

# Build model
output_category, output_polarity = multi_task_gnn_model(input_ids, attention_mask)
model = Model(inputs=[input_ids, attention_mask], outputs=[output_category, output_polarity])

# Model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 multi_task_gnn_model (MultiTas  ((None, 5),         66758920    ['input_ids[0][0]',              
 kGNNModel)                      (None, 3))                       'attention_mask[0][0]']         
                                                                                                  
Total params: 66,758,920
Trainable params: 66,758,920
Non-trainable params: 0
________________

In [27]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Define the learning rate schedule
lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True
)

# Define the optimizer with the learning rate schedule
optimizer = Adam(
    learning_rate=lr_schedule,
    clipvalue=1.0
)

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'],
    metrics=['accuracy']
)


In [28]:
from tensorflow.keras.callbacks import EarlyStopping


In [29]:
early_stopping = EarlyStopping(
    monitor='val_loss',         # Metric to monitor
    patience=8,                 # Number of epochs with no improvement
    restore_best_weights=True,  # Restore model weights from the epoch with the best value of the monitored metric
    mode='min',                 # In this case, we want to minimize the validation loss
    verbose=1                   # Verbosity mode
)


In [30]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

history = model.fit(
    train_dataset.batch(16), 
    epochs=50,  # Set a higher number of epochs since EarlyStopping will likely stop earlier
    validation_data=val_dataset.batch(16),
    callbacks=[early_stopping, checkpoint]
)



Epoch 1/50
Epoch 1: val_loss improved from inf to 1.71488, saving model to best_model.h5
Epoch 2/50
Epoch 2: val_loss improved from 1.71488 to 0.93825, saving model to best_model.h5
Epoch 3/50
Epoch 3: val_loss improved from 0.93825 to 0.66719, saving model to best_model.h5
Epoch 4/50
 84/473 [====>.........................] - ETA: 1:12 - loss: 0.6968 - multi_task_gnn_model_loss: 0.4424 - multi_task_gnn_model_1_loss: 0.2544 - multi_task_gnn_model_accuracy: 0.8430 - multi_task_gnn_model_1_accuracy: 0.9137

KeyboardInterrupt: 