In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.layers import Layer, GRU, Bidirectional, Dense, Input, Reshape, GlobalAveragePooling1D
import nltk
from nltk.corpus import wordnet
import random
from lime.lime_text import LimeTextExplainer

# Fetch every NLTK resource this notebook touches: 'punkt' (as before), plus
# 'stopwords' for stopwords.words('bengali') and 'wordnet' for
# WordNetLemmatizer. Without the last two, a machine that has never downloaded
# them fails with LookupError in the cleaning cell.
# The list (rather than a generator) makes sure every download is attempted;
# the cell still displays a single boolean as before.
all([nltk.download(resource) for resource in ('punkt', 'stopwords', 'wordnet')])

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read ./Bengali_data.xlsx into `df` and preview the first rows.
df = pd.read_excel(io="./Bengali_data.xlsx")
df.head()

Unnamed: 0,comment,Category,Gender,comment react number,label
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,Actor,Female,1.0,sexual
1,ঘরে বসে শুট করতে কেমন লেগেছে? ক্যামেরাতে কে ছি...,Singer,Male,2.0,not bully
2,"অরে বাবা, এই টা কোন পাগল????",Actor,Female,2.0,not bully
3,ক্যাপ্টেন অফ বাংলাদেশ,Sports,Male,0.0,not bully
4,পটকা মাছ,Politician,Male,0.0,troll


In [3]:
# Drop only the rows that miss a field the model actually uses. A bare dropna()
# would also discard rows whose sole gap is in a metadata column (Category,
# Gender, comment react number) that is thrown away right after this cell.
# Rebinding instead of inplace=True keeps the step chainable.
df = df.dropna(subset=['comment', 'label'])
df['label'].value_counts()

label
not bully    15339
troll        10462
sexual        8928
religious     7575
threat        1694
Name: count, dtype: int64

In [4]:
# Keep just the text and its target label; every other column is dropped.
df = df.loc[:, ['comment', 'label']]
df.head()

Unnamed: 0,comment,label
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,sexual
1,ঘরে বসে শুট করতে কেমন লেগেছে? ক্যামেরাতে কে ছি...,not bully
2,"অরে বাবা, এই টা কোন পাগল????",not bully
3,ক্যাপ্টেন অফ বাংলাদেশ,not bully
4,পটকা মাছ,troll


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
# Shared resources for text cleaning: the lemmatizer instance and the set of
# Bengali stopwords taken from the NLTK stopwords corpus.
# NOTE(review): WordNet is an English lexicon - confirm this lemmatizer has any
# effect on Bengali tokens.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('bengali')}

# Function to clean text
def clean_text(text, stopword_set=None):
    """Reduce one comment to space-separated Bengali tokens without stopwords.

    Processing order: remove zero-width joiners, turn every other character
    outside the Bengali block (U+0980-U+09FF) into a space, delete digits,
    collapse whitespace, and finally drop stopwords.

    Args:
        text: The raw comment. Values that are not strings (for example a
            numeric spreadsheet cell) are converted with ``str`` first.
        stopword_set: Optional collection of tokens to discard. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned comment as one string; empty if no token survives.
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text)

    # ZWNJ / ZWJ occur inside Bengali words, so they are deleted rather than
    # treated as separators.
    text = re.sub(r'[\u200c\u200d]', '', text)

    # Everything else outside the Bengali block becomes a space (not an empty
    # string) so that words separated only by punctuation stay separate tokens.
    text = re.sub(r'[^\u0980-\u09FF\s]', ' ', text)

    # \d is Unicode-aware for str patterns, so this also strips Bengali digits,
    # which the previous step keeps because they live inside the Bengali block.
    text = re.sub(r'\d+', '', text)

    # split() without arguments trims and collapses any run of whitespace.
    tokens = text.split()

    # No lemmatization step: the WordNet lemmatizer is built on an English
    # lexicon and is expected to return Bengali tokens unchanged, while still
    # requiring the 'wordnet' corpus to be installed.
    return ' '.join(token for token in tokens if token not in stopword_set)

# Run clean_text over every comment and overwrite the column with the result.
df['comment'] = df['comment'].map(clean_text)
df.head()

Unnamed: 0,comment,label
0,হালার পুত মদ খাওয়ার সময় রাতের বেলা মদ খাই দি...,sexual
1,ঘরে শুট কেমন লেগেছে ক্যামেরাতে,not bully
2,অরে বাবা টা পাগল,not bully
3,ক্যাপ্টেন অফ বাংলাদেশ,not bully
4,পটকা মাছ,troll


In [7]:
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')


def tokenize_text(text, max_length=128):
    """Encode text with the notebook-level DistilBERT ``tokenizer``.

    Args:
        text: The text to encode; it is handed to the tokenizer unchanged, so
            anything the tokenizer accepts (a single string or a list of
            strings) works here.
        max_length: Fixed sequence length. Inputs are padded up to and
            truncated down to this many tokens. It has to agree with the
            sequence length the model's Input layers are declared with
            (128 in this notebook).

    Returns:
        A tuple ``(input_ids, attention_mask)`` holding the TensorFlow tensors
        found under those two keys of the tokenizer output.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,  # Limit the sequence length
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['attention_mask']



In [9]:
# Tokenize all comments in a single batched call instead of once per row.
# tokenize_text pads/truncates every sequence to the same fixed length, so the
# batch comes back as two rectangular tensors with one row per comment - the
# same result the per-row loop followed by tf.concat used to assemble, without
# creating tens of thousands of single-row tensors first.
input_ids, attention_masks = tokenize_text(df['comment'].tolist())

In [10]:
# Turn the label strings into integer ids (stored on the frame as
# 'label_encoded'), then one-hot encode those ids with to_categorical.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit(df['label']).transform(df['label'])
labels = to_categorical(df['label_encoded'])

In [13]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Convert tensors to numpy arrays for compatibility with train_test_split
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()
labels_np = labels  # one-hot array produced by to_categorical

# Hold out 20% of the data as the test set. The classes are heavily imbalanced
# (see the value_counts output earlier in the notebook), so the split is stratified on the
# class id (argmax of the one-hot row) to keep the class proportions the same
# in every partition.
X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = train_test_split(
    input_ids_np, attention_masks_np, labels_np,
    test_size=0.2, random_state=42, stratify=labels_np.argmax(axis=1),
)

# Carve 10% of the remaining training data out as the validation set, again
# stratified on the class id.
X_train_ids, X_val_ids, X_train_masks, X_val_masks, y_train, y_val = train_test_split(
    X_train_ids, X_train_masks, y_train,
    test_size=0.1, random_state=42, stratify=y_train.argmax(axis=1),
)

# Convert back to tensors
X_train_ids, X_val_ids, X_test_ids = map(tf.constant, (X_train_ids, X_val_ids, X_test_ids))
X_train_masks, X_val_masks, X_test_masks = map(tf.constant, (X_train_masks, X_val_masks, X_test_masks))
y_train, y_val, y_test = map(tf.constant, (y_train, y_val, y_test))

# Print shapes to confirm the splits (same wording and order as before:
# input IDs, then attention masks, then labels; train / validation / test each).
for description, split_tensors in (
    ("input IDs", (X_train_ids, X_val_ids, X_test_ids)),
    ("attention masks", (X_train_masks, X_val_masks, X_test_masks)),
    ("labels", (y_train, y_val, y_test)),
):
    for split_name, split_tensor in zip(("training", "validation", "test"), split_tensors):
        print(f"Shape of {split_name} {description}:", split_tensor.shape)


Shape of training input IDs: (31678, 128)
Shape of validation input IDs: (3520, 128)
Shape of test input IDs: (8800, 128)
Shape of training attention masks: (31678, 128)
Shape of validation attention masks: (3520, 128)
Shape of test attention masks: (8800, 128)
Shape of training labels: (31678, 5)
Shape of validation labels: (3520, 5)
Shape of test labels: (8800, 5)


In [14]:
from transformers import TFDistilBertModel
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [16]:
# Load the pre-trained DistilBERT model
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

# Fine-tune the encoder together with the classification head: its weights are
# left trainable. (Set this to False instead to freeze DistilBERT and train
# only the head.)
distilbert_model.trainable = True

# Keras input placeholders. They are bound to their own variable names so they
# do not overwrite the `input_ids` data tensor created by the tokenization
# cell, which the train/test split cell still reads with `.numpy()`; the layer
# names ("input_ids", "attention_mask") are unchanged.
input_ids_in = Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask_in = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Pass inputs through DistilBERT
bert_outputs = distilbert_model(input_ids_in, attention_mask=attention_mask_in)
sequence_output = bert_outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# Global Average Pooling over the sequence output
# NOTE(review): no mask is passed here, so padded positions are averaged in
# together with real tokens - consider masked mean pooling.
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

# Output layer for classification (5 classes, one per label value)
output = Dense(5, activation='softmax')(x)

# Define the model; the input order must match the [ids, masks] lists given to fit()
model = Model(inputs=[input_ids_in, attention_mask_in], outputs=output)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print model summary
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_distil_bert_model_1 (TFDist  TFBaseModelOutput(l  134734080  ['input_ids[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 128, 768),                                                   
                                 hidden_states=None                                         

In [17]:
# Early stopping: watch the validation loss, tolerate 3 epochs without
# improvement, and keep restore_best_weights switched on.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# Fit on the training split, validating on the validation split, for at most
# 5 epochs in batches of 32.
history = model.fit(
    x=[X_train_ids, X_train_masks],
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=([X_val_ids, X_val_masks], y_val),
    callbacks=[early_stopping],
)


Epoch 1/5
139/990 [===>..........................] - ETA: 5:52 - loss: 1.2400 - accuracy: 0.4802

KeyboardInterrupt: 