In [None]:
!pip install emoji

In [None]:
!pip install torch==2.3.0 transformers==4.41.2 huggingface_hub --upgrade


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import emoji

In [None]:

# Load the dataset from the notebook's working directory.
# NOTE(review): latin1 decodes any byte sequence without raising, so a file
# that is really UTF-8 would load with garbled non-ASCII characters - confirm
# the file's actual encoding.
df = pd.read_csv("Final_data.csv", encoding='latin1')

In [None]:
# Preview the first four rows to check that the columns loaded as expected.
df.head(4)

In [None]:

# Class balance: number of rows per Category value.
df["Category"].value_counts()

In [None]:
# Pass the column explicitly as `x=`: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is not read as the
# variable to count.
sns.countplot(x=df["Category"])

In [None]:


def clean_texts(text):
    """Normalise a raw tweet into ASCII words separated by single spaces.

    The steps run in this order: emoji, URLs, @mentions and #hashtags are
    blanked out; then ASCII punctuation, digit runs and any remaining
    non-ASCII characters are replaced by spaces; finally every whitespace run
    is collapsed to one space.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned string. Letter case is preserved and the result is not
        stripped, so it may begin or end with a single space.
    """
    # Emoji become a space (not '') so the words on either side stay separate,
    # consistent with every other substitution below.
    cleantext = emoji.replace_emoji(text, replace=' ')
    # Continue from `cleantext` so the emoji step is not thrown away, and do not
    # require trailing whitespace so a URL at the very end is removed too.
    cleantext = re.sub(r'http\S+', ' ', cleantext)
    cleantext = re.sub(r'@\S+', ' ', cleantext)
    # Hashtags must go before the punctuation pass, which would otherwise strip
    # only the '#' and keep the tag word.
    cleantext = re.sub(r'#\S+', ' ', cleantext)
    cleantext = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleantext)
    cleantext = re.sub(r'[0-9]+', ' ', cleantext)
    cleantext = re.sub(r'[^\x00-\x7f]', r' ', cleantext)
    cleantext = re.sub(r'\s+', ' ', cleantext)
    return cleantext

In [None]:
clean_texts("11 Remember when your mom said eating all that junk food was going to make you sick? Well, she wasn't wrong. Anxiety, depression, mental disorders are really gut #biome disorders. #LoveYourBiome  https://www.bbc.com/news/health-43815370ÃÂ Ã¢ÂÂ¦")

In [None]:
# Clean every tweet in place; the function is passed directly, no lambda needed.
df["text"] = df["text"].apply(clean_texts)

In [None]:
# Spot-check one cleaned tweet (the row labelled 10).
df.loc[10, "text"]

In [None]:

from sklearn.preprocessing import LabelEncoder

# Keep the original labels in their own column. Without this, re-running the
# cell would fit the encoder on its own integer output and silently replace
# the class names in `label_encoder.classes_` with 0..k-1.
if "Category_raw" not in df.columns:
    df["Category_raw"] = df["Category"]

label_encoder = LabelEncoder()
# `Category` holds the integer class codes used by every model below.
df['Category'] = label_encoder.fit_transform(df['Category_raw'])

# **BERT**

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np

# Number of tweets per forward pass; lower it if memory is tight.
BERT_BATCH_SIZE = 32

# Use the GPU when one is available; otherwise behave as before on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
bert_model.eval()  # inference mode: disables dropout

texts = df['text'].tolist()
embeddings_list = []

# Encode in batches instead of one tweet per forward pass. Padding only to the
# longest tweet of each batch (still truncated at 256 tokens) avoids running
# BERT over 256 positions for every short tweet; padded positions are masked
# out, so the [CLS] vectors should be unchanged up to floating-point noise.
for start in tqdm(range(0, len(texts), BERT_BATCH_SIZE), desc="Encoding tweets"):
    batch_texts = texts[start:start + BERT_BATCH_SIZE]
    inputs = tokenizer(
        batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of the last hidden state is the [CLS] token: one vector per tweet.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    embeddings_list.extend(list(cls_embeddings))

# One 1-D vector per row, in the same order as df['text'].
df["bert_embedding"] = embeddings_list


In [None]:
# Spot-check the embedding vectors stored per row.
df["bert_embedding"].head(5)

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional

# Feature matrix: one row per tweet, built from the per-row embedding vectors.
X = np.stack(df['bert_embedding'].values)
# Targets: the integer class codes.
y = df['Category'].values

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Cast both feature splits to float32.
X_train, X_test = (split.astype(np.float32) for split in (X_train, X_test))


# **BERT + BI-LSTM**

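A bidirectional LSTM head (64 units per direction) trained on the precomputed BERT `[CLS]` embeddings, each fed to the network as a one-step sequence.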



In [None]:

from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Reshape
from tensorflow.keras.models import Model

input_dim = 768  # BERT CLS embedding size
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(input_dim,), name='bert_input')

# Present each embedding to the recurrent layer as a one-step sequence.
x = Reshape((1, input_dim))(inputs)

# Hidden stack, applied in order: BiLSTM -> dropout -> dense -> dropout.
for layer in (
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.4),
):
    x = layer(x)

# One softmax unit per class.
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='sparse_categorical_crossentropy',  # targets are integer class codes
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set and inflates the reported test
# metrics. Keras takes the last 10% of the arrays passed in, which is a random
# sample here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    callbacks=[early_stop]
)

In [None]:

# Imported here because this is the first cell that uses it; without it a
# fresh "Restart & Run All" stops on a NameError at this point.
from sklearn.metrics import confusion_matrix

# Class probabilities -> predicted class index per test row.
y_pred_probs = model.predict(X_test)

y_pred = np.argmax(y_pred_probs, axis=1)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by the last `model.fit` call.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))

# Left panel: accuracy; right panel: loss. Both share the same layout.
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary metrics on the test split; 'weighted' averages the per-class scores
# by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
# The predictions plotted here come from the bidirectional LSTM model of this
# section, so the title must say BI-LSTM (it previously said "BERT + LSTM").
plt.title("(BERT + BI-LSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + LSTM**

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Reshape
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

input_dim = 768  # width of one BERT [CLS] embedding
num_classes = len(label_encoder.classes_)

# Each embedding is presented to the LSTM as a one-step sequence.
inputs = Input(shape=(input_dim,), name='bert_input')
x = Reshape((1, input_dim))(inputs)
x = LSTM(128, return_sequences=False)(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)

outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # targets are integer class codes
    metrics=['accuracy']
)

model.summary()

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Predictions -----------------------------------------------------------
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Collapse one-hot targets to class indices if they were ever stored that way.
if y_test.ndim > 1 and y_test.shape[1] > 1:
    y_test = np.argmax(y_test, axis=1)

# --- Confusion matrix ------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# --- Training curves -------------------------------------------------------
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()
plt.show()

# --- Summary metrics (support-weighted across classes) ---------------------
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(BERT + LSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + BILSTM**

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Reshape
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

input_dim = 768  # width of one BERT [CLS] embedding
num_classes = len(label_encoder.classes_)

# Each embedding is presented to the bidirectional LSTM as a one-step sequence.
inputs = Input(shape=(input_dim,), name='bert_input')
x = Reshape((1, input_dim))(inputs)
x = Bidirectional(LSTM(128, return_sequences=False))(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)

outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # targets are integer class codes
    metrics=['accuracy']
)

model.summary()

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Predictions -----------------------------------------------------------
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Collapse one-hot targets to class indices if they were ever stored that way.
if y_test.ndim > 1 and y_test.shape[1] > 1:
    y_test = np.argmax(y_test, axis=1)

# --- Confusion matrix ------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# --- Training curves -------------------------------------------------------
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()
plt.show()

# --- Summary metrics (support-weighted across classes) ---------------------
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(BERT + BILSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + GRU**

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, GRU, Reshape
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

input_dim = 768  # width of one BERT [CLS] embedding
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(input_dim,), name='bert_input')

# Reshape to fit GRU input (timesteps=1, features=input_dim)
x = Reshape((1, input_dim))(inputs)
x = GRU(128, return_sequences=False)(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)

outputs = Dense(num_classes, activation='softmax')(x)

model_GRU = Model(inputs=inputs, outputs=outputs)

model_GRU.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # targets are integer class codes
    metrics=['accuracy']
)

model_GRU.summary()

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model_GRU.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Predictions -----------------------------------------------------------
y_pred_probs = model_GRU.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Collapse one-hot targets to class indices if they were ever stored that way.
if y_test.ndim > 1 and y_test.shape[1] > 1:
    y_test = np.argmax(y_test, axis=1)

# --- Confusion matrix ------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# --- Training curves -------------------------------------------------------
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()
plt.show()

# --- Summary metrics (support-weighted across classes) ---------------------
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(BERT + GRU) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + BIGRU**

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, GRU, Bidirectional, Reshape
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

input_dim = 768  # width of one BERT [CLS] embedding
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(input_dim,), name='bert_input')

# Each embedding is presented to the bidirectional GRU as a one-step sequence.
x = Reshape((1, input_dim))(inputs)
x = Bidirectional(GRU(128, return_sequences=False))(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # targets are integer class codes
    metrics=['accuracy']
)

model.summary()

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Predictions -----------------------------------------------------------
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Collapse one-hot targets to class indices if they were ever stored that way.
if y_test.ndim > 1 and y_test.shape[1] > 1:
    y_test = np.argmax(y_test, axis=1)

# --- Confusion matrix ------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# --- Training curves -------------------------------------------------------
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()
plt.show()

# --- Summary metrics (support-weighted across classes) ---------------------
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(BERT + BIGRU) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + Logistic regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression classifier on the training features; the iteration cap
# is raised to 2000.
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)
clf  # show the fitted estimator as the cell output



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Test-set predictions and the text report.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap with integer cell annotations.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()


In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Summary metrics (support-weighted across classes).
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(Logistic regression) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + Random Forest classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 200-tree random forest with a fixed seed, fitted on the training features.
r_clf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)
r_clf  # show the fitted estimator as the cell output


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Test-set predictions and the text report.
y_pred = r_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap with integer cell annotations.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

# Summary metrics (support-weighted across classes).
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each value just above its bar.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.01, f"{score:.2f}",
             ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(Randomforest classifier) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + XGBoost**

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameters collected in one place, then handed to the classifier.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "random_state": 42,
}
clf_xgb = XGBClassifier(**xgb_params)

# Fit on the training split; the fitted estimator is the cell output.
clf_xgb.fit(X_train, y_train)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the XGBoost model fitted in this section. This cell previously
# predicted with `r_clf`, so it re-reported the random forest's results.
y_pred = clf_xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap with integer cell annotations.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Summary metrics; 'weighted' averages per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
# Title corrected: it previously read "(Randomforest classifier)".
plt.title("(XGBoost) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **BERT + LightGBM**

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameters collected in one place, then handed to the classifier.
lgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": -1,
    "random_state": 42,
}
clf_lgb = LGBMClassifier(**lgb_params)

# Fit on the training split; the fitted estimator is the cell output.
clf_lgb.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set predictions of the LightGBM model and the text report.
y_pred = clf_lgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap with integer cell annotations.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Summary metrics; 'weighted' averages per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
# Title corrected: it previously read "(Randomforest classifier)".
plt.title("(LightGBM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **SBERT + LSTM**

In [None]:
# Preview the first four values of the text column that will be encoded next.
df["text"].head(4)

In [None]:
!pip uninstall sentence-transformers transformers huggingface_hub -y


In [None]:
!pip uninstall -y transformers sentence-transformers huggingface_hub


In [None]:
!pip install huggingface_hub==0.16.4
!pip install transformers==4.33.1
!pip install sentence-transformers==2.2.2



In [None]:
!pip install --upgrade pip
!pip install --upgrade sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer

# Smoke test: check that the SBERT checkpoint loads after the reinstall above.
# Note that this rebinds the global name `model`, which earlier cells used for
# Keras models; the encoder is loaded again as `sbert_model` in the next cell.
model = SentenceTransformer('all-MiniLM-L6-v2')
print("SBERT loaded successfully!")


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

# `df['Category']` already holds the integer codes produced by the notebook's
# `label_encoder`. Fitting a fresh LabelEncoder on those codes returns them
# unchanged but replaces the class names in `label_encoder.classes_` with
# 0..k-1, so the existing encoder is reused instead.
y_cleaned = y_raw
num_classes = len(label_encoder.classes_)

# Sentence embeddings: one vector per tweet.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(X_cleaned, batch_size=16,
                                show_progress_bar=True)

# Add a length-1 time axis so the recurrent layer accepts the vectors.
X = embeddings.reshape((embeddings.shape[0], 1, embeddings.shape[1]))
y = y_cleaned

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

inputs = Input(shape=(X.shape[1], X.shape[2]))
x = LSTM(128, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(inputs)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=20, batch_size=16,
    callbacks=[early_stop],
    verbose=1)

# Final score on the untouched test split.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Class probabilities -> predicted class index per test row.
y_pred_prob = model.predict(X_test, batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw into an explicit 6x6 axes. `disp.plot()` without `ax` opens its own
# figure, so a preceding `plt.figure(figsize=...)` only produced an empty
# extra figure and the size was never applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
ax.set_title("Confusion Matrix (SBERT + LSTM)")
plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary metrics; 'weighted' averages per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(SBERT + LSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
# Per-epoch curves recorded by the last `model.fit` call.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))

# Left panel: accuracy; right panel: loss. Both share the same layout.
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()

plt.show()

# **SBERT + BILSTM**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

# `df['Category']` already holds the integer codes produced by the notebook's
# `label_encoder`. Fitting a fresh LabelEncoder on those codes returns them
# unchanged but replaces the class names in `label_encoder.classes_` with
# 0..k-1, so the existing encoder is reused instead.
y_cleaned = y_raw
num_classes = len(label_encoder.classes_)

# Sentence embeddings: one vector per tweet.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(X_cleaned, batch_size=16,
                                show_progress_bar=True)

# Add a length-1 time axis so the recurrent layer accepts the vectors.
X = embeddings.reshape((embeddings.shape[0], 1, embeddings.shape[1]))
y = y_cleaned

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

inputs = Input(shape=(X.shape[1], X.shape[2]))
x = Bidirectional(LSTM(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4)))(inputs)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Validate on a held-out slice of the TRAINING data, not on the test set.
# Early stopping and best-weight restore are model selection; driving them
# with (X_test, y_test) leaks the test set into the reported test metrics.
# Keras uses the last 10% of the arrays passed in, which is a random sample
# here because train_test_split already shuffled them.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=20, batch_size=16,
    callbacks=[early_stop],
    verbose=1)

# Final score on the untouched test split.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Class probabilities -> predicted class index per test row.
y_pred_prob = model.predict(X_test, batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw into an explicit 6x6 axes. `disp.plot()` without `ax` opens its own
# figure, so a preceding `plt.figure(figsize=...)` only produced an empty
# extra figure and the size was never applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
ax.set_title("Confusion Matrix (SBERT + BILSTM)")
plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary metrics; 'weighted' averages per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

plt.ylim(0,1.1)
plt.title("(SBERT + BILSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
# Per-epoch curves recorded by the last `model.fit` call.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))

# Left panel: accuracy; right panel: loss. Both share the same layout.
for panel, (quantity, train_curve, val_curve) in enumerate(
    (('Accuracy', acc, val_acc), ('Loss', loss, val_loss)), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(epochs, train_curve, 'b-', label=f'Training {quantity}')
    plt.plot(epochs, val_curve, 'r-', label=f'Validation {quantity}')
    plt.title(f'Training vs Validation {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()

plt.show()

# **SBERT + GRU**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

SEED = 42
# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable.
tf.keras.utils.set_random_seed(SEED)

X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(X_cleaned, batch_size=16,
                                show_progress_bar=True)

# Insert a length-1 time axis: (samples, features) -> (samples, 1, features).
X = embeddings.reshape((embeddings.shape[0], 1, embeddings.shape[1]))
y = y_cleaned

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Carve the early-stopping validation set out of the training data. Validating on
# (X_test, y_test) would let the test split choose the restored weights and make
# the test metrics reported below optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED)

inputs = Input(shape=(X.shape[1], X.shape[2]))
x = GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(inputs)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=20, batch_size=16,
    callbacks=[early_stop],
    verbose=1)

# The test split is touched only here, after training has finished.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt
import numpy as np

# Predicted class = arg-max over the per-class scores returned by the model.
y_pred_prob = model.predict(X_test, batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (SBERT + GRU)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("(SBERT + GRU) Model Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()


In [None]:
# Pull the per-epoch curves out of the `history` object.
fit_log = history.history
acc, val_acc = fit_log['accuracy'], fit_log['val_accuracy']
loss, val_loss = fit_log['loss'], fit_log['val_loss']
epochs = range(1, 1 + len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))

ax_acc.plot(epochs, acc, 'b-', label='Training Accuracy')
ax_acc.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(epochs, loss, 'b-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r-', label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

# **SBERT + BIGRU**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

SEED = 42
# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable.
tf.keras.utils.set_random_seed(SEED)

X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(X_cleaned, batch_size=16,
                                show_progress_bar=True)

# Insert a length-1 time axis: (samples, features) -> (samples, 1, features).
X = embeddings.reshape((embeddings.shape[0], 1, embeddings.shape[1]))
y = y_cleaned

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Carve the early-stopping validation set out of the training data. Validating on
# (X_test, y_test) would let the test split choose the restored weights and make
# the test metrics reported below optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED)

inputs = Input(shape=(X.shape[1], X.shape[2]))
x = Bidirectional(GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4)))(inputs)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=20, batch_size=16,
    callbacks=[early_stop],
    verbose=1)

# The test split is touched only here, after training has finished.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt
import numpy as np

# Predicted class = arg-max over the per-class scores returned by the model.
y_pred_prob = model.predict(X_test, batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (SBERT + BIGRU)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])

# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")

ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("(SBERT + BIGRU) Model Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pull the per-epoch curves out of the `history` object.
fit_log = history.history
acc, val_acc = fit_log['accuracy'], fit_log['val_accuracy']
loss, val_loss = fit_log['loss'], fit_log['val_loss']
epochs = range(1, 1 + len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))

ax_acc.plot(epochs, acc, 'b-', label='Training Accuracy')
ax_acc.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(epochs, loss, 'b-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r-', label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

# **SBERT + Logistic Regression**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Texts and their category labels, taken straight from the dataframe.
X_cleaned = df['text'].to_list()
y_raw = df['Category'].values

# Map labels to consecutive integer ids.
label_encoder = LabelEncoder().fit(y_raw)
y_cleaned = label_encoder.transform(y_raw)
num_classes = label_encoder.classes_.shape[0]

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(
    X_cleaned,
    batch_size=16,
    show_progress_bar=True,
)

# The classifier consumes the 2D embedding matrix as-is.
X = embeddings
y = np.array(y_cleaned)

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)



In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
# Title names the model evaluated in this cell (`clf`, the logistic regression).
ax.set_title("Confusion Matrix (SBERT + Logistic Regression)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("Logistic Regression Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()

# **SBERT + XGBoost**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

# Labels are mapped to consecutive integer ids here, so the classifier needs no
# label encoding of its own.
label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(X_cleaned, batch_size=16, show_progress_bar=True)

X = embeddings  # keep 2D
y = np.array(y_cleaned)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print("[INFO] Training XGBoost classifier...")
# `use_label_encoder` is deliberately not passed: the option is deprecated and
# current XGBoost releases ignore it with a warning.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    # 'logloss' is the binary metric; 'mlogloss' is its multi-class counterpart.
    eval_metric='mlogloss' if num_classes > 2 else 'logloss',
    random_state=42
)

xgb_model.fit(X_train, y_train)







In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt

y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (SBERT + xgboost)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("xgboost Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()

# **SBERT + Random Forest classifier**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Texts and their category labels, taken straight from the dataframe.
X_cleaned = df['text'].to_list()
y_raw = df['Category'].values

# Map labels to consecutive integer ids.
label_encoder = LabelEncoder().fit(y_raw)
y_cleaned = label_encoder.transform(y_raw)
num_classes = label_encoder.classes_.shape[0]

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(
    X_cleaned,
    batch_size=16,
    show_progress_bar=True,
)

# The classifier consumes the 2D embedding matrix as-is.
X = embeddings
y = np.array(y_cleaned)

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print("[INFO] Training Random Forest classifier...")
# 300 trees, depth capped at 20, fitted on all available cores.
rf_model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5,
)

rf_model.fit(X_train, y_train)











In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt

y_pred = rf_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
# Titles name the model evaluated in this cell (`rf_model`, the random forest).
ax.set_title("Confusion Matrix (SBERT + Random Forest)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("Random Forest Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()

# **SBERT + SVM**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

# Texts and their category labels, taken straight from the dataframe.
X_cleaned = df['text'].to_list()
y_raw = df['Category'].values

# Map labels to consecutive integer ids.
label_encoder = LabelEncoder().fit(y_raw)
y_cleaned = label_encoder.transform(y_raw)
num_classes = label_encoder.classes_.shape[0]

# One sentence embedding per text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sbert_model.encode(
    X_cleaned,
    batch_size=16,
    show_progress_bar=True,
)

# The classifier consumes the 2D embedding matrix as-is.
X = embeddings
y = np.array(y_cleaned)

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print("[INFO] Training SVM classifier...")
# RBF-kernel SVM; probability=True additionally enables predict_proba.
svm_model = SVC(
    C=10.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=42,
)

svm_model.fit(X_train, y_train)



In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import matplotlib.pyplot as plt

y_pred = svm_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
# Titles name the model evaluated in this cell (`svm_model`, the SVM).
ax.set_title("Confusion Matrix (SBERT + SVM)")
plt.show()

# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("SVM Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()

# **DISTILBERT + LSTM**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4
SEED = 42

# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable.
tf.keras.utils.set_random_seed(SEED)


X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)


tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False  # DistilBERT is used as a frozen feature extractor

# Pad every sequence to exactly MAX_LEN tokens. padding=True only pads to the
# longest sequence in the corpus, which need not match the fixed (MAX_LEN,)
# shape declared on the model inputs below.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=SEED
)

# Carve the early-stopping validation set out of the training data. Validating on
# the test split would let it choose the restored weights and make the test
# metrics reported below optimistic.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)


class DistilBertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a DistilBERT model and returning its token-level hidden states."""

    def __init__(self, distilbert_model, **kwargs):
        """Store `distilbert_model`; other keyword arguments are forwarded to the base Layer."""
        super().__init__(**kwargs)
        self.distilbert = distilbert_model

    def call(self, inputs):
        """Run DistilBERT on an `(input_ids, attention_mask)` pair and return `last_hidden_state`."""
        input_ids, attention_mask = inputs
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state  # shape: (batch, seq_len, 768)


input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")


bert_embeddings = DistilBertEmbeddingLayer(distilbert)([input_ids_layer, attention_mask_layer])

# NOTE(review): the recurrent layer also steps over padded positions; consider
# masking them - confirm against the Keras version in use.
x = LSTM(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(bert_embeddings)
x = Dropout(0.6)(x)
x = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.6)(x)
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=LR), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)


# The test split is touched only here, after training has finished.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


model.summary()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the per-class scores returned by the model.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (DISTILBERT + LSTM)")
plt.show()


# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("(DISTILBERT + LSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pull the per-epoch curves out of the `history` object.
fit_log = history.history
acc, val_acc = fit_log['accuracy'], fit_log['val_accuracy']
loss, val_loss = fit_log['loss'], fit_log['val_loss']
epochs = range(1, 1 + len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))

ax_acc.plot(epochs, acc, 'b-', label='Training Accuracy')
ax_acc.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(epochs, loss, 'b-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r-', label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

# **DISTILBERT + BILSTM**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4
SEED = 42

# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable.
tf.keras.utils.set_random_seed(SEED)


X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)


tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False  # DistilBERT is used as a frozen feature extractor

# Pad every sequence to exactly MAX_LEN tokens. padding=True only pads to the
# longest sequence in the corpus, which need not match the fixed (MAX_LEN,)
# shape declared on the model inputs below.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=SEED
)

# Carve the early-stopping validation set out of the training data. Validating on
# the test split would let it choose the restored weights and make the test
# metrics reported below optimistic.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)


class DistilBertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a DistilBERT model and returning its token-level hidden states."""

    def __init__(self, distilbert_model, **kwargs):
        """Store `distilbert_model`; other keyword arguments are forwarded to the base Layer."""
        super().__init__(**kwargs)
        self.distilbert = distilbert_model

    def call(self, inputs):
        """Run DistilBERT on an `(input_ids, attention_mask)` pair and return `last_hidden_state`."""
        input_ids, attention_mask = inputs
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state  # shape: (batch, seq_len, 768)


input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")


bert_embeddings = DistilBertEmbeddingLayer(distilbert)([input_ids_layer, attention_mask_layer])

# NOTE(review): the recurrent layer also steps over padded positions; consider
# masking them - confirm against the Keras version in use.
x = Bidirectional(
    LSTM(64, return_sequences=False, kernel_regularizer=regularizers.l2(3e-5)))(bert_embeddings)
x = Dropout(0.6)(x)
x = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.6)(x)
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=LR), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)


# The test split is touched only here, after training has finished.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


model.summary()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the per-class scores returned by the model.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (DISTILBERT + BILSTM)")
plt.show()


# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("(DISTILBERT + BILSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pull the per-epoch curves out of the `history` object.
fit_log = history.history
acc, val_acc = fit_log['accuracy'], fit_log['val_accuracy']
loss, val_loss = fit_log['loss'], fit_log['val_loss']
epochs = range(1, 1 + len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))

ax_acc.plot(epochs, acc, 'b-', label='Training Accuracy')
ax_acc.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(epochs, loss, 'b-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r-', label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

# **DISTILBERT + GRU**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Bidirectional,GRU
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4
SEED = 42

# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable.
tf.keras.utils.set_random_seed(SEED)


X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)


tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False  # DistilBERT is used as a frozen feature extractor

# Pad every sequence to exactly MAX_LEN tokens. padding=True only pads to the
# longest sequence in the corpus, which need not match the fixed (MAX_LEN,)
# shape declared on the model inputs below.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=SEED
)

# Carve the early-stopping validation set out of the training data. Validating on
# the test split would let it choose the restored weights and make the test
# metrics reported below optimistic.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)


class DistilBertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a DistilBERT model and returning its token-level hidden states."""

    def __init__(self, distilbert_model, **kwargs):
        """Store `distilbert_model`; other keyword arguments are forwarded to the base Layer."""
        super().__init__(**kwargs)
        self.distilbert = distilbert_model

    def call(self, inputs):
        """Run DistilBERT on an `(input_ids, attention_mask)` pair and return `last_hidden_state`."""
        input_ids, attention_mask = inputs
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state  # shape: (batch, seq_len, 768)


input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")


bert_embeddings = DistilBertEmbeddingLayer(distilbert)([input_ids_layer, attention_mask_layer])

# NOTE(review): the recurrent layer also steps over padded positions; consider
# masking them - confirm against the Keras version in use.
x = GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(bert_embeddings)
x = Dropout(0.6)(x)
x = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.6)(x)
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=LR), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)


# The test split is touched only here, after training has finished.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


model.summary()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the per-class scores returned by the model.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() creates its own figure
# when `ax` is omitted, so a separate plt.figure(figsize=...) call would only
# leave an empty figure behind and its size would never be applied.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (DISTILBERT + GRU)")
plt.show()


# Summary metrics; precision/recall/F1 use the 'weighted' average over classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Write each score just above its bar.
for bar, score in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
ax.set_ylim(0, 1.1)  # head-room for the labels above the bars
ax.set_title("(DISTILBERT + GRU) Model Performance Metrics", fontsize=14, fontweight="bold")
ax.set_ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pull the per-epoch curves out of the `history` object.
fit_log = history.history
acc, val_acc = fit_log['accuracy'], fit_log['val_accuracy']
loss, val_loss = fit_log['loss'], fit_log['val_loss']
epochs = range(1, 1 + len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))

ax_acc.plot(epochs, acc, 'b-', label='Training Accuracy')
ax_acc.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(epochs, loss, 'b-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r-', label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

# **DISTILBERT + BIGRU**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Bidirectional,GRU
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Hyper-parameters for this experiment.
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4


# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)


# DistilBERT is used as a frozen feature extractor.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False

# Pad every text to exactly MAX_LEN tokens. padding=True pads only to the longest text in
# the corpus, which matches the (MAX_LEN,) model inputs declared later in this cell only
# when at least one text is long enough to be truncated.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


# Stratified 80/20 split; ids, masks and labels are split together so rows stay aligned.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=42
)


class DistilBertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that returns a DistilBERT model's token-level hidden states.

    Used below as the first step of the functional Keras model.
    """

    def __init__(self, distilbert_model, **kwargs):
        """Keep a reference to ``distilbert_model``; ``kwargs`` go to the base Layer."""
        super().__init__(**kwargs)
        self.distilbert = distilbert_model

    def call(self, inputs):
        """Run the encoder on ``[input_ids, attention_mask]`` and return ``last_hidden_state``."""
        token_ids, token_mask = inputs
        encoded = self.distilbert(token_ids, attention_mask=token_mask)
        return encoded.last_hidden_state  # shape: (batch, seq_len, 768)


# Symbolic model inputs: token ids and attention mask, each MAX_LEN long.
input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")


# Frozen DistilBERT token embeddings feed the recurrent classification head.
bert_embeddings = DistilBertEmbeddingLayer(distilbert)([input_ids_layer, attention_mask_layer])

# NOTE(review): the recurrent layer is called without a mask, so padded positions are
# processed like real tokens -- consider passing the attention mask to it.
x = Bidirectional(
        GRU(128, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))
    )(bert_embeddings)
x = Dropout(0.6)(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.6)(x)
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
model.compile(optimizer=Adam(learning_rate=LR), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


# Validate on a slice of the training data rather than on the test split: with
# restore_best_weights=True, validating on the test split would select the final weights
# on the same data that produces the reported test accuracy.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=42
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)


# The test split is used only here, for the final score.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


model.summary()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the model's softmax outputs.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (DISTILBERT + BIGRU)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("(DISTILBERT + BIGRU) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Per-epoch training curves recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# One row, two panels: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))
panels = (
    (ax_acc, acc, val_acc,
     'Training Accuracy', 'Validation Accuracy', 'Training vs Validation Accuracy', 'Accuracy'),
    (ax_loss, loss, val_loss,
     'Training Loss', 'Validation Loss', 'Training vs Validation Loss', 'Loss'),
)
for ax, train_curve, val_curve, train_label, val_label, panel_title, y_label in panels:
    ax.plot(epochs, train_curve, 'b-', label=train_label)
    ax.plot(epochs, val_curve, 'r-', label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.show()

# **DISTILBERT + Logistic regression**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

MAX_LEN = 256
BATCH_SIZE = 16

# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


# DistilBERT is used as a frozen feature extractor.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False


encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


# Encode the corpus in mini-batches and pool each text into one fixed-size vector.
embeddings_list = []
for i in tqdm(range(0, len(input_ids), BATCH_SIZE)):
    batch_input_ids = input_ids[i:i+BATCH_SIZE]
    batch_attention_mask = attention_mask[i:i+BATCH_SIZE]
    outputs = distilbert(batch_input_ids, attention_mask=batch_attention_mask)
    hidden = outputs.last_hidden_state
    # Masked mean pooling: average only over real tokens. A plain mean over axis 1 would
    # also average in the [PAD] positions that padding=True adds to shorter texts.
    token_mask = tf.cast(tf.expand_dims(batch_attention_mask, axis=-1), hidden.dtype)
    token_sum = tf.reduce_sum(hidden * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeds = (token_sum / token_count).numpy()
    embeddings_list.append(batch_embeds)

embeddings = np.concatenate(embeddings_list, axis=0)


# Stratified 80/20 split of the pooled embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, stratify=y, random_state=42
)


clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)

print("Accuracy:", clf.score(X_test, y_test))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (distilBERT + logistic regression)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("Logistic Regression Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **DISTILBERT + XGBOOST**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm

MAX_LEN = 256
BATCH_SIZE = 16

# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


# DistilBERT is used as a frozen feature extractor.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False


encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


# Encode the corpus in mini-batches and pool each text into one fixed-size vector.
embeddings_list = []
for i in tqdm(range(0, len(input_ids), BATCH_SIZE)):
    batch_input_ids = input_ids[i:i+BATCH_SIZE]
    batch_attention_mask = attention_mask[i:i+BATCH_SIZE]
    outputs = distilbert(batch_input_ids, attention_mask=batch_attention_mask)
    hidden = outputs.last_hidden_state
    # Masked mean pooling: average only over real tokens. A plain mean over axis 1 would
    # also average in the [PAD] positions that padding=True adds to shorter texts.
    token_mask = tf.cast(tf.expand_dims(batch_attention_mask, axis=-1), hidden.dtype)
    token_sum = tf.reduce_sum(hidden * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeds = (token_sum / token_count).numpy()
    embeddings_list.append(batch_embeds)

embeddings = np.concatenate(embeddings_list, axis=0)


# Stratified 80/20 split of the pooled embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, stratify=y, random_state=42
)


print("[INFO] Training XGBoost classifier...")
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -- confirm
# against the installed version before removing it.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (distilbert + xgboost)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title(" xgboost Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **DISTILBERT + RANDOMFOREST**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier

MAX_LEN = 256
BATCH_SIZE = 16

# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


# DistilBERT is used as a frozen feature extractor.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False


encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


# Encode the corpus in mini-batches and pool each text into one fixed-size vector.
embeddings_list = []
for i in tqdm(range(0, len(input_ids), BATCH_SIZE)):
    batch_input_ids = input_ids[i:i+BATCH_SIZE]
    batch_attention_mask = attention_mask[i:i+BATCH_SIZE]
    outputs = distilbert(batch_input_ids, attention_mask=batch_attention_mask)
    hidden = outputs.last_hidden_state
    # Masked mean pooling: average only over real tokens. A plain mean over axis 1 would
    # also average in the [PAD] positions that padding=True adds to shorter texts.
    token_mask = tf.cast(tf.expand_dims(batch_attention_mask, axis=-1), hidden.dtype)
    token_sum = tf.reduce_sum(hidden * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeds = (token_sum / token_count).numpy()
    embeddings_list.append(batch_embeds)

embeddings = np.concatenate(embeddings_list, axis=0)


# Stratified 80/20 split of the pooled embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, stratify=y, random_state=42
)


print("[INFO] Training Random Forest classifier...")
rf_model = RandomForestClassifier(
    n_estimators=300,       # number of trees
    max_depth=20,           # limit depth to avoid overfitting
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

y_pred = rf_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (distilBERT + Randomforest)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("Randomforest Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **DISTILBERT + SVM**

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC

MAX_LEN = 256
BATCH_SIZE = 16

# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


# DistilBERT is used as a frozen feature extractor.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased", from_pt=True)
distilbert.trainable = False


encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


# Encode the corpus in mini-batches and pool each text into one fixed-size vector.
embeddings_list = []
for i in tqdm(range(0, len(input_ids), BATCH_SIZE)):
    batch_input_ids = input_ids[i:i+BATCH_SIZE]
    batch_attention_mask = attention_mask[i:i+BATCH_SIZE]
    outputs = distilbert(batch_input_ids, attention_mask=batch_attention_mask)
    hidden = outputs.last_hidden_state
    # Masked mean pooling: average only over real tokens. A plain mean over axis 1 would
    # also average in the [PAD] positions that padding=True adds to shorter texts.
    token_mask = tf.cast(tf.expand_dims(batch_attention_mask, axis=-1), hidden.dtype)
    token_sum = tf.reduce_sum(hidden * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeds = (token_sum / token_count).numpy()
    embeddings_list.append(batch_embeds)

embeddings = np.concatenate(embeddings_list, axis=0)


# Stratified 80/20 split of the pooled embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, stratify=y, random_state=42
)


print("[INFO] Training SVM classifier...")
svm_model = SVC(
    kernel='rbf',       # radial basis function kernel
    C=10.0,             # regularization parameter
    gamma='scale',      # kernel coefficient
    probability=True,   # needed if you want predict_proba
    random_state=42
)

svm_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

y_pred = svm_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix (distilBERT + svm)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("svm Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **Roberta + LSTM**

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Hyper-parameters for this experiment.
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4


# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)

print("[INFO] Loading RoBERTa...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is used as a frozen feature extractor.
roberta_model.trainable = False


# Pad every text to exactly MAX_LEN tokens. padding=True pads only to the longest text in
# the corpus, which matches the (MAX_LEN,) model inputs declared later in this cell only
# when at least one text is long enough to be truncated.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


# Stratified 80/20 split; ids, masks and labels are split together so rows stay aligned.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=42
)


class RobertaEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that returns a RoBERTa model's token-level hidden states.

    Used below as the first step of the functional Keras model.
    """

    def __init__(self, roberta_model, **kwargs):
        """Keep a reference to ``roberta_model``; ``kwargs`` go to the base Layer."""
        super().__init__(**kwargs)
        self.roberta = roberta_model

    def call(self, inputs):
        """Run the encoder on ``[input_ids, attention_mask]`` and return ``last_hidden_state``."""
        token_ids, token_mask = inputs
        encoded = self.roberta(token_ids, attention_mask=token_mask)
        return encoded.last_hidden_state  # (batch, seq_len, hidden_size)


# Symbolic model inputs: token ids and attention mask, each MAX_LEN long.
input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# Frozen RoBERTa token embeddings feed the recurrent classification head.
roberta_embeddings = RobertaEmbeddingLayer(roberta_model)([input_ids_layer, attention_mask_layer])


# NOTE(review): the recurrent layer is called without a mask, so padded positions are
# processed like real tokens -- consider passing the attention mask to it.
x = LSTM(128, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(roberta_embeddings)
x = Dropout(0.4)(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.4)(x)
outputs = Dense(num_classes, activation="softmax")(x)


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
model.compile(optimizer=Adam(learning_rate=LR),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# Validate on a slice of the training data rather than on the test split: with
# restore_best_weights=True, validating on the test split would select the final weights
# on the same data that produces the reported test accuracy.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=42
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate: the test split is used only here, for the final score.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"\n Test Accuracy: {acc:.4f}")




In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the model's softmax outputs.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix ( Roberta + LSTM)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("(Roberta + LSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **Roberta + BILSTM**

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Hyper-parameters for this experiment.
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4


# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)

print("[INFO] Loading RoBERTa...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is used as a frozen feature extractor.
roberta_model.trainable = False


# Pad every text to exactly MAX_LEN tokens. padding=True pads only to the longest text in
# the corpus, which matches the (MAX_LEN,) model inputs declared later in this cell only
# when at least one text is long enough to be truncated.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


# Stratified 80/20 split; ids, masks and labels are split together so rows stay aligned.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=42
)


class RobertaEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that returns a RoBERTa model's token-level hidden states.

    Used below as the first step of the functional Keras model.
    """

    def __init__(self, roberta_model, **kwargs):
        """Keep a reference to ``roberta_model``; ``kwargs`` go to the base Layer."""
        super().__init__(**kwargs)
        self.roberta = roberta_model

    def call(self, inputs):
        """Run the encoder on ``[input_ids, attention_mask]`` and return ``last_hidden_state``."""
        token_ids, token_mask = inputs
        encoded = self.roberta(token_ids, attention_mask=token_mask)
        return encoded.last_hidden_state  # (batch, seq_len, hidden_size)


# Symbolic model inputs: token ids and attention mask, each MAX_LEN long.
input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# Frozen RoBERTa token embeddings feed the recurrent classification head.
roberta_embeddings = RobertaEmbeddingLayer(roberta_model)([input_ids_layer, attention_mask_layer])


# NOTE(review): the recurrent layer is called without a mask, so padded positions are
# processed like real tokens -- consider passing the attention mask to it.
x = Bidirectional(
    LSTM(64, return_sequences=False, kernel_regularizer=regularizers.l2(3e-5))
)(roberta_embeddings)
x = Dropout(0.4)(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.4)(x)
outputs = Dense(num_classes, activation="softmax")(x)


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
model.compile(optimizer=Adam(learning_rate=LR),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# Validate on a slice of the training data rather than on the test split: with
# restore_best_weights=True, validating on the test split would select the final weights
# on the same data that produces the reported test accuracy.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=42
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate: the test split is used only here, for the final score.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"\n Test Accuracy: {acc:.4f}")




In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the model's softmax outputs.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix ( Roberta + BILSTM)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("(Roberta + BILSTM) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Per-epoch training curves recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# One row, two panels: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))
panels = (
    (ax_acc, acc, val_acc,
     'Training Accuracy', 'Validation Accuracy', 'Training vs Validation Accuracy', 'Accuracy'),
    (ax_loss, loss, val_loss,
     'Training Loss', 'Validation Loss', 'Training vs Validation Loss', 'Loss'),
)
for ax, train_curve, val_curve, train_label, val_label, panel_title, y_label in panels:
    ax.plot(epochs, train_curve, 'b-', label=train_label)
    ax.plot(epochs, val_curve, 'r-', label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.show()

# **Roberta + GRU**

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Hyper-parameters for this experiment.
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4


# Texts and labels come from the cleaned dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)

print("[INFO] Loading RoBERTa...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is used as a frozen feature extractor.
roberta_model.trainable = False


# Pad every text to exactly MAX_LEN tokens. padding=True pads only to the longest text in
# the corpus, which matches the (MAX_LEN,) model inputs declared later in this cell only
# when at least one text is long enough to be truncated.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


# Stratified 80/20 split; ids, masks and labels are split together so rows stay aligned.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=42
)


class RobertaEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that returns a RoBERTa model's token-level hidden states.

    Used below as the first step of the functional Keras model.
    """

    def __init__(self, roberta_model, **kwargs):
        """Keep a reference to ``roberta_model``; ``kwargs`` go to the base Layer."""
        super().__init__(**kwargs)
        self.roberta = roberta_model

    def call(self, inputs):
        """Run the encoder on ``[input_ids, attention_mask]`` and return ``last_hidden_state``."""
        token_ids, token_mask = inputs
        encoded = self.roberta(token_ids, attention_mask=token_mask)
        return encoded.last_hidden_state  # (batch, seq_len, hidden_size)


# Symbolic model inputs: token ids and attention mask, each MAX_LEN long.
input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# Frozen RoBERTa token embeddings feed the recurrent classification head.
roberta_embeddings = RobertaEmbeddingLayer(roberta_model)([input_ids_layer, attention_mask_layer])


# NOTE(review): the recurrent layer is called without a mask, so padded positions are
# processed like real tokens -- consider passing the attention mask to it.
x = GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))(roberta_embeddings)
x = Dropout(0.4)(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.4)(x)
outputs = Dense(num_classes, activation="softmax")(x)


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
model.compile(optimizer=Adam(learning_rate=LR),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# Validate on a slice of the training data rather than on the test split: with
# restore_best_weights=True, validating on the test split would select the final weights
# on the same data that produces the reported test accuracy.
X_fit_ids, X_val_ids, X_fit_mask, X_val_mask, y_fit, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.1, stratify=y_train, random_state=42
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


history = model.fit(
    [X_fit_ids, X_fit_mask], y_fit,
    validation_data=([X_val_ids, X_val_mask], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate: the test split is used only here, for the final score.
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"\n Test Accuracy: {acc:.4f}")




In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the model's softmax outputs.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicit Axes: without `ax`, disp.plot() opens a figure of its own, which
# leaves a separately created 6x6 figure blank and ignores its size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title("Confusion Matrix ( Roberta + GRU)")
plt.show()


# Summary metrics; 'weighted' averages the per-class scores by class support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # headroom for the value labels
plt.title("(Roberta + GRU ) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Per-epoch training curves recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# One row, two panels: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12,5))
panels = (
    (ax_acc, acc, val_acc,
     'Training Accuracy', 'Validation Accuracy', 'Training vs Validation Accuracy', 'Accuracy'),
    (ax_loss, loss, val_loss,
     'Training Loss', 'Validation Loss', 'Training vs Validation Loss', 'Loss'),
)
for ax, train_curve, val_curve, train_label, val_label, panel_title, y_label in panels:
    ax.plot(epochs, train_curve, 'b-', label=train_label)
    ax.plot(epochs, val_curve, 'r-', label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.show()

# **Roberta + BIGRU**

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Hyper-parameters for the RoBERTa + BiGRU classifier.
MAX_LEN = 256      # fixed token length; must match the Input shapes declared below
BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4


# Texts and labels come from the shared dataframe.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

# NOTE(review): `df['Category']` is already integer-encoded by the LabelEncoder cell
# near the top of the notebook (if that cell has been run), so `classes_` here holds
# integer codes rather than the original category names.
label_encoder = LabelEncoder()
y_cleaned = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
y = np.array(y_cleaned)

print("[INFO] Loading RoBERTa...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is used as a frozen feature extractor; only the BiGRU head is trained.
roberta_model.trainable = False


# Pad every sequence to exactly MAX_LEN tokens. `padding=True` only pads to the
# longest sequence in the corpus, so when every text is shorter than MAX_LEN tokens
# the arrays come out narrower than MAX_LEN and no longer match the
# `Input(shape=(MAX_LEN,))` layers of the model below.
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"].numpy()
attention_mask = encodings["attention_mask"].numpy()


# Stratified 80/20 split; ids, masks and labels are split together so rows stay aligned.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, stratify=y, random_state=42
)


class RobertaEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that runs the wrapped RoBERTa model and returns its token embeddings."""

    def __init__(self, roberta_model, **kwargs):
        super().__init__(**kwargs)
        self.roberta = roberta_model

    def call(self, inputs):
        """Take `[input_ids, attention_mask]` and return RoBERTa's `last_hidden_state`."""
        input_ids, attention_mask = inputs
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state  # (batch, seq_len, hidden_size)


input_ids_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

roberta_embeddings = RobertaEmbeddingLayer(roberta_model)([input_ids_layer, attention_mask_layer])


# Classification head: BiGRU over the token embeddings -> dense -> softmax,
# with L2 regularisation and dropout.
x = Bidirectional(GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4)))(roberta_embeddings)
x = Dropout(0.4)(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
x = Dropout(0.4)(x)
outputs = Dense(num_classes, activation="softmax")(x)


model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=outputs)
# Labels are integer class ids, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer=Adam(learning_rate=LR),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()
early_stop = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)


# NOTE(review): the test split doubles as the validation set, so early stopping and
# `restore_best_weights` choose the epoch using test data and the accuracy reported
# below is optimistic. Consider carving a separate validation split out of the
# training data.
history = model.fit(
    [X_train_ids, X_train_mask], y_train,
    validation_data=([X_test_ids, X_test_mask], y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
loss, acc = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=0)
print(f"\n Test Accuracy: {acc:.4f}")




In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class = arg-max over the softmax probabilities returned by the model.
y_pred_prob = model.predict([X_test_ids, X_test_mask], batch_size=20)
y_pred = np.argmax(y_pred_prob, axis=1)


cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicitly created 6x6 axes. `disp.plot()` opens its own figure when
# no `ax` is passed, which left the former `plt.figure(figsize=(6, 6))` empty and
# rendered the matrix at the default size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
# This section trains a Bidirectional GRU, so the title names BiGRU rather than GRU.
ax.set_title("Confusion Matrix (Roberta + BiGRU)")
plt.show()


# Precision/recall/F1 use average='weighted': per-class scores weighted by support.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
for bar, score in zip(bars, scores):
    # Write the value (two decimals) centred just above the top of its bar.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # head-room above 1.0 for the value labels
plt.title("(Roberta + BiGRU) Model Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Per-epoch metrics recorded in `history.history` by the `model.fit` call above.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,5))

# (subplot position, training curve, validation curve,
#  training label, validation label, panel title, y-axis label)
panels = [
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'Training vs Validation Accuracy', 'Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss',
     'Training vs Validation Loss', 'Loss'),
]

# Draw the accuracy panel on the left and the loss panel on the right.
for panel_idx, train_curve, val_curve, train_label, val_label, panel_title, y_label in panels:
    plt.subplot(1, 2, panel_idx)
    plt.plot(epochs, train_curve, 'b-', label=train_label)
    plt.plot(epochs, val_curve, 'r-', label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# NOTE(review): this cell looks like leftover scratch code. It builds the same
# BiGRU layer on two different embedding tensors and overwrites `x` each time,
# and nothing in this cell uses the result. `bert_embeddings` is not defined in
# the visible part of the notebook; confirm it exists before running this cell.
x = Bidirectional(
        GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4))
    )(bert_embeddings)

# Same layer configuration applied to `roberta_embeddings` (the RoBERTa
# last_hidden_state tensor built in the RoBERTa + BiGRU cell).
x = Bidirectional(GRU(64, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4)))(roberta_embeddings)


# **Roberta + Logistic regression**

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Tokenizer truncation length and embedding batch size for this section.
MAX_LEN = 256
BATCH_SIZE = 16


# Texts and labels come from the shared dataframe; labels are encoded to integer ids.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


print("[INFO] Loading RoBERTa...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# from_pt=True loads the TensorFlow model from the PyTorch checkpoint weights.
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


def get_roberta_embeddings(texts, batch_size=16):
    """Embed `texts` with RoBERTa and return the position-0 vector of each text.

    The texts are processed in consecutive slices of `batch_size`. Each slice is
    tokenized with the module-level `tokenizer` (truncated to `MAX_LEN`, padded to
    the longest text of the slice) and passed through the module-level
    `roberta_model`.

    Args:
        texts: Sliceable sequence of strings.
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array with one row per text, taken from `last_hidden_state[:, 0, :]`.
    """
    def _embed_slice(chunk):
        # Tokenize one slice and keep only the hidden state of the first token.
        tokens = tokenizer(
            chunk,
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
            return_tensors="tf"
        )
        hidden = roberta_model(
            tokens["input_ids"], attention_mask=tokens["attention_mask"]
        ).last_hidden_state
        return hidden[:, 0, :].numpy()

    slice_starts = range(0, len(texts), batch_size)
    per_slice = [
        _embed_slice(texts[start:start + batch_size])
        for start in tqdm(slice_starts, desc="Generating embeddings")
    ]
    # Stack the per-slice arrays back into one (num_texts, hidden_size) array.
    return np.concatenate(per_slice, axis=0)


# Position-0 RoBERTa embeddings for the whole corpus.
cls_embeddings = get_roberta_embeddings(X_cleaned, batch_size=BATCH_SIZE)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cls_embeddings,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("[INFO] Training Logistic Regression...")
# `fit` returns the estimator itself, so construction and fitting can be chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Accuracy on the held-out 20 %.
y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")



In [None]:

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict on the held-out embeddings and report plain accuracy.
y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")


cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicitly created 6x6 axes. `disp.plot()` opens its own figure when
# no `ax` is passed, which left the former `plt.figure(figsize=(6, 6))` empty and
# rendered the matrix at the default size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
ax.set_title("Confusion Matrix (Roberta + logistic regression)")
plt.show()


# `acc` from above is reused; precision/recall/F1 are support-weighted averages.
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
for bar, score in zip(bars, scores):
    # Write the value (two decimals) centred just above the top of its bar.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # head-room above 1.0 for the value labels
plt.title("Logistic Regression Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **Roberta + RandomForestClassifier**

In [None]:

from transformers import RobertaTokenizer, TFRobertaModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf
from tqdm import tqdm


# Tokenizer truncation length and embedding batch size for this section.
MAX_LEN = 128
BATCH_SIZE = 16

# Texts and labels come from the shared dataframe; labels are encoded to integer ids.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


print("[INFO] Loading RoBERTa model...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is only used for forward passes here; mark every layer as non-trainable.
for layer in roberta_model.layers:
    layer.trainable = False


print("[INFO] Tokenizing texts...")
# padding=True pads to the longest tokenized text in the corpus (capped at MAX_LEN).
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


print("[INFO] Generating embeddings in batches...")
embeddings_list = []
dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_mask)).batch(BATCH_SIZE)

# Ceiling division: the previous `len // BATCH_SIZE + 1` over-counted by one batch
# whenever the corpus size is an exact multiple of BATCH_SIZE.
num_batches = -(-len(X_cleaned) // BATCH_SIZE)

for batch_input_ids, batch_attention_mask in tqdm(dataset, total=num_batches):
    outputs = roberta_model(batch_input_ids, attention_mask=batch_attention_mask)
    # Masked mean pooling: average only over real tokens. A plain reduce_mean over
    # axis 1 also averages the hidden states of padding positions, so the embedding
    # of a text would depend on how much padding it received.
    token_mask = tf.cast(
        tf.expand_dims(batch_attention_mask, axis=-1),
        outputs.last_hidden_state.dtype,
    )
    token_sum = tf.reduce_sum(outputs.last_hidden_state * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeddings = token_sum / token_count
    embeddings_list.append(batch_embeddings)

# Despite the historical name, these are mean-pooled (not CLS) embeddings.
cls_embeddings = tf.concat(embeddings_list, axis=0).numpy()
print(f"[INFO] Embeddings shape: {cls_embeddings.shape}")


# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cls_embeddings, y, test_size=0.2, stratify=y, random_state=42
)


print("[INFO] Training Random Forest classifier...")
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)


# Accuracy on the held-out 20 %.
y_pred = rf_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\n Test Accuracy: {acc:.4f}")



In [None]:

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict on the held-out embeddings and report plain accuracy.
y_pred = rf_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")


cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicitly created 6x6 axes. `disp.plot()` opens its own figure when
# no `ax` is passed, which left the former `plt.figure(figsize=(6, 6))` empty and
# rendered the matrix at the default size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
ax.set_title("Confusion Matrix (Roberta + Randomforest classifier)")
plt.show()


# `acc` from above is reused; precision/recall/F1 are support-weighted averages.
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
for bar, score in zip(bars, scores):
    # Write the value (two decimals) centred just above the top of its bar.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # head-room above 1.0 for the value labels
plt.title("Randomforestclassifier Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **Roberta + xgboost**

In [None]:


from transformers import RobertaTokenizer, TFRobertaModel
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf
from tqdm import tqdm


# Tokenizer truncation length and embedding batch size for this section.
MAX_LEN = 128
BATCH_SIZE = 16

# Texts and labels come from the shared dataframe; labels are encoded to integer ids.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


print("[INFO] Loading RoBERTa model...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)


# RoBERTa is only used for forward passes here; mark every layer as non-trainable.
for layer in roberta_model.layers:
    layer.trainable = False


print("[INFO] Tokenizing texts...")
# padding=True pads to the longest tokenized text in the corpus (capped at MAX_LEN).
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


print("[INFO] Generating RoBERTa embeddings in batches...")
embeddings_list = []
dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_mask)).batch(BATCH_SIZE)

# Ceiling division: the previous `len // BATCH_SIZE + 1` over-counted by one batch
# whenever the corpus size is an exact multiple of BATCH_SIZE.
num_batches = -(-len(X_cleaned) // BATCH_SIZE)

for batch_input_ids, batch_attention_mask in tqdm(dataset, total=num_batches):
    outputs = roberta_model(batch_input_ids, attention_mask=batch_attention_mask)
    # Masked mean pooling: average only over real tokens. A plain reduce_mean over
    # axis 1 also averages the hidden states of padding positions, so the embedding
    # of a text would depend on how much padding it received.
    token_mask = tf.cast(
        tf.expand_dims(batch_attention_mask, axis=-1),
        outputs.last_hidden_state.dtype,
    )
    token_sum = tf.reduce_sum(outputs.last_hidden_state * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeddings = token_sum / token_count
    embeddings_list.append(batch_embeddings)

# Despite the historical name, these are mean-pooled (not CLS) embeddings.
cls_embeddings = tf.concat(embeddings_list, axis=0).numpy()
print(f"[INFO] Embeddings shape: {cls_embeddings.shape}")


# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cls_embeddings, y, test_size=0.2, stratify=y, random_state=42
)


print("[INFO] Training XGBoost classifier...")
# NOTE(review): 'logloss' is the binary log-loss metric and 'mlogloss' its
# multi-class counterpart; confirm which one matches the number of categories.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    tree_method='hist',        # memory-efficient
    device='cuda' if tf.config.list_physical_devices('GPU') else 'cpu'
)

xgb_model.fit(X_train, y_train)


# Accuracy on the held-out 20 %.
y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\n Test Accuracy: {acc:.4f}")



In [None]:

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict on the held-out embeddings and report plain accuracy.
y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")


cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicitly created 6x6 axes. `disp.plot()` opens its own figure when
# no `ax` is passed, which left the former `plt.figure(figsize=(6, 6))` empty and
# rendered the matrix at the default size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
ax.set_title("Confusion Matrix (Roberta + xgboost)")
plt.show()


# `acc` from above is reused; precision/recall/F1 are support-weighted averages.
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
for bar, score in zip(bars, scores):
    # Write the value (two decimals) centred just above the top of its bar.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # head-room above 1.0 for the value labels
plt.title("xgboost Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()


# **Roberta + svm**

In [None]:


from transformers import RobertaTokenizer, TFRobertaModel
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Tokenizer truncation length and embedding batch size for this section.
MAX_LEN = 128
BATCH_SIZE = 16

# Texts and labels come from the shared dataframe; labels are encoded to integer ids.
X_cleaned = df['text'].tolist()
y_raw = df['Category'].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)


print("[INFO] Loading RoBERTa model...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base", from_pt=True)

# RoBERTa is only used for forward passes here; mark every layer as non-trainable.
for layer in roberta_model.layers:
    layer.trainable = False


print("[INFO] Tokenizing texts...")
# padding=True pads to the longest tokenized text in the corpus (capped at MAX_LEN).
encodings = tokenizer(
    X_cleaned,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="tf"
)

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]


print("[INFO] Generating RoBERTa embeddings in batches...")
embeddings_list = []
dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_mask)).batch(BATCH_SIZE)

# Ceiling division: the previous `len // BATCH_SIZE + 1` over-counted by one batch
# whenever the corpus size is an exact multiple of BATCH_SIZE.
num_batches = -(-len(X_cleaned) // BATCH_SIZE)

for batch_input_ids, batch_attention_mask in tqdm(dataset, total=num_batches):
    outputs = roberta_model(batch_input_ids, attention_mask=batch_attention_mask)
    # Masked mean pooling: average only over real tokens. A plain reduce_mean over
    # axis 1 also averages the hidden states of padding positions, so the embedding
    # of a text would depend on how much padding it received.
    token_mask = tf.cast(
        tf.expand_dims(batch_attention_mask, axis=-1),
        outputs.last_hidden_state.dtype,
    )
    token_sum = tf.reduce_sum(outputs.last_hidden_state * token_mask, axis=1)
    token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # guard against /0
    batch_embeddings = token_sum / token_count
    embeddings_list.append(batch_embeddings)

# Despite the historical name, these are mean-pooled (not CLS) embeddings.
cls_embeddings = tf.concat(embeddings_list, axis=0).numpy()
print(f"[INFO] Embeddings shape: {cls_embeddings.shape}")  # (samples, 768)


# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cls_embeddings, y, test_size=0.2, stratify=y, random_state=42
)

print("[INFO] Training SVM classifier...")
# NOTE(review): probability=True enables `predict_proba` but makes `fit` slower;
# only `predict` is used in the visible cells, so confirm it is still needed.
svm_model = SVC(
    kernel='rbf',
    C=10.0,
    gamma='scale',
    probability=True,
    random_state=42
)

svm_model.fit(X_train, y_train)


# Accuracy on the held-out 20 %.
y_pred = svm_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\n Test Accuracy: {acc:.4f}")


In [None]:

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict on the held-out embeddings and report plain accuracy.
y_pred = svm_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")


cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Draw on an explicitly created 6x6 axes. `disp.plot()` opens its own figure when
# no `ax` is passed, which left the former `plt.figure(figsize=(6, 6))` empty and
# rendered the matrix at the default size.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
# These are the SVM results; the title previously said "xgboost" (copy-paste slip).
ax.set_title("Confusion Matrix (Roberta + SVM)")
plt.show()


# `acc` from above is reused; precision/recall/F1 are support-weighted averages.
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

scores = [acc, prec, rec, f1]
labels = ["Accuracy", "Precision", "Recall", "F1-score"]

plt.figure(figsize=(8,5))
bars = plt.bar(labels, scores, color=['skyblue','lightgreen','salmon','orange'])
for bar, score in zip(bars, scores):
    # Write the value (two decimals) centred just above the top of its bar.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{score:.2f}", ha='center', va='bottom', fontsize=12, fontweight="bold")
plt.ylim(0,1.1)  # head-room above 1.0 for the value labels
plt.title("SVM Performance Metrics", fontsize=14, fontweight="bold")
plt.ylabel("Score", fontsize=12)
plt.show()
