## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Input, GlobalAveragePooling1D, Dropout, Dense
from tensorflow.keras.models import Model
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score, adjusted_rand_score, classification_report
from sklearn.cluster import KMeans
%matplotlib inline

## Importing variables from the preprocessing notebook as csv files

In [2]:
X = pd.read_csv('X.csv')
y = np.loadtxt('y.csv', delimiter=',')

## Loading Bert Model

In [3]:
# Convert DataFrame column to a list
text_data = X['preprocessed_text'].tolist()

# Define the max_sequence_length
max_sequence_length = 512

# Load pre-trained BERT model and tokenizer
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [5]:
# Convert text data to BERT-compatible input format
encoded_inputs = tokenizer.batch_encode_plus(
    text_data,
    padding=True,
    truncation=True,
    max_length=max_sequence_length,
    return_tensors='tf'
)

X_input_ids, X_attention_mask = encoded_inputs['input_ids'], encoded_inputs['attention_mask']



In [6]:
# Define the architecture for feature extraction
input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_sequence_length,), dtype=tf.int32, name='attention_mask')

# BERT encoding layer
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
pooled_output = GlobalAveragePooling1D()(bert_output)
dropout = Dropout(0.2)(pooled_output)

# Output layer
output = Dense(768, activation='linear', name='output')(dropout)

# Create the feature extraction model
bert_feature_extractor = Model(inputs=[input_ids, attention_mask], outputs=output)

# Extract BERT embeddings for the text data
bert_embeddings = bert_feature_extractor.predict([X_input_ids, X_attention_mask])




In [7]:
# Perform K-means clustering
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(bert_embeddings)

print("Cluster Labels:", cluster_labels)



Cluster Labels: [1 0 0 ... 0 1 1]


## Evaluation of the model

In [8]:
# Evaluate clustering performance using metrics
ari = adjusted_rand_score(y, cluster_labels)
print("Adjusted Rand Index (ARI):", ari)


Adjusted Rand Index (ARI): 0.09623830775611983


In [9]:
# Generate a classification report for cluster evaluation
classification_report_result = classification_report(y, cluster_labels)
print("Classification Report:\n", classification_report_result)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.41      0.73      0.52      2328
         1.0       0.87      0.63      0.73      6600

    accuracy                           0.66      8928
   macro avg       0.64      0.68      0.63      8928
weighted avg       0.75      0.66      0.68      8928



In [10]:
silhouette_avg = silhouette_score(bert_embeddings, cluster_labels)
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.078048244


In [11]:
cluster_centers = kmeans.cluster_centers_
print("Cluster Centers:", cluster_centers)


Cluster Centers: [[ 0.05763868 -0.4476834  -0.32423437 ... -0.16701706  0.4735224
   0.12791094]
 [ 0.08766183 -0.38542557 -0.35601804 ... -0.2611677   0.38144022
   0.22810794]]


In [12]:
db_index = davies_bouldin_score(bert_embeddings, cluster_labels)
print("Davies-Bouldin Index:", db_index)


Davies-Bouldin Index: 3.2022658619515876


In [13]:
ch_index = calinski_harabasz_score(bert_embeddings, cluster_labels)
print("Calinski-Harabasz Index:", ch_index)


Calinski-Harabasz Index: 824.064150356986
