# Importing all the libraries

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
# *
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy

In [74]:
# Loading Dataset in a variable
df = pd.read_csv('Problem_Dataset.csv')

In [75]:
# we will save Obs and Type column respectively
X = df['Obs'].values
y = df['Type'].values

In [76]:
# Converting Strings labels to Integer
label_encoder = LabelEncoder()
y_encode = label_encoder.fit_transform(y)
# calculating number of classes i.e number of labels which will be further use for encoding
num_classes = len(label_encoder.classes_)
print(num_classes)

7


In [77]:
# Splitting data in train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encode, test_size=0.2, random_state=42)

In [78]:
# here tokenizer is used to convert the sentences in X_train to numeric values for every word
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [79]:
# here padding is done so that the variable words in sentences can be made of same size
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train_seq)
X_val_padded = keras.preprocessing.sequence.pad_sequences(X_val_seq, maxlen=X_train_padded.shape[1])

In [80]:
# Performing one hot encoding, as we are working with binary we will convert all the labels in binary and store them as one's and zero's
y_train_oneHot = to_categorical(y_train, num_classes=num_classes)
y_val_oneHot = to_categorical(y_val, num_classes=num_classes)

In [81]:
# function which will help in creating a model for every single label
def modelCreation(activation1, activation2,labels):
    models = {}
    for label in labels:
        # Define the model architecture (adjust as needed)
        model = keras.Sequential([
        layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_train_padded.shape[1]),
        layers.LSTM(64),
        layers.Dense(32, activation1),
        layers.Dense(num_classes, activation2)
    ])


# Binary Classification

In [82]:
modelCreation("relu","softmax",label_encoder.classes_ )

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train_oneHot, validation_data=(X_val_padded, y_val_oneHot), epochs=5, batch_size=32, verbose=2)

models[label] = model

In [84]:
# Evaluating the models *
for label, model in models.items():
    y_val_pred_oneHot = model.predict(X_val_padded)
    y_val_pred = label_encoder.inverse_transform(y_val_pred_oneHot.argmax(axis=1))
    y_val_true = label_encoder.inverse_transform(y_val)

    accuracy = accuracy_score(y_val_true, y_val_pred)
    report = classification_report(y_val_true, y_val_pred,output_dict=True)

In [85]:
# print(f"Label: {label}")
# print(f"Validation Accuracy: {accuracy:.4f}")
# print(f"Classification Report:\n{report}")
# print("="*40)

In [86]:
report_df = pd.DataFrame(report).transpose()
# Save the DataFrame to a CSV file
report_df.to_csv('classification_report_Binary.csv', index=True)
print(y_val_pred)
print()
# predictions_df = pd.DataFrame(y_val_pred, columns=label_encoder.classes_)
# predictions_df.to_csv('multiLabel_predictions.csv', index=False)

[[4.9584755e-01 2.6452059e-01 3.6234066e-03 3.9595002e-04 1.6485045e-02
  9.7187907e-01]
 [1.6411774e-02 9.9613396e-03 9.6898508e-01 1.9018471e-02 1.1485673e-03
  9.9951589e-01]
 [7.4048615e-01 2.8948782e-02 8.4154733e-02 1.3777857e-03 9.9352515e-01
  1.0086340e-02]
 ...
 [6.3254207e-02 8.3833485e-04 1.3617864e-02 9.7336346e-01 4.0961144e-04
  9.9921101e-01]
 [9.0549594e-01 2.4006036e-03 1.2031073e-01 9.3064914e-03 9.5914114e-01
  7.7293612e-02]
 [9.7800118e-01 1.4644356e-03 1.7673150e-02 8.4942821e-03 2.8774500e-01
  6.4360970e-01]]



# Multilabel Classification Model

In [51]:
label_binarizer = MultiLabelBinarizer()
y_encoded = label_binarizer.fit_transform(y)
num_classes = len(label_binarizer.classes_)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad sequences to ensure consistent length
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train_seq)
X_val_padded = keras.preprocessing.sequence.pad_sequences(X_val_seq, maxlen=X_train_padded.shape[1])

In [52]:
model = keras.Sequential([
    layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_train_padded.shape[1]),
    layers.LSTM(64),
    layers.Dense(32, activation='relu'),
    layers.Dense(num_classes, activation='sigmoid')  # Sigmoid for multi-label classification
])

In [53]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
X_val_padded = keras.preprocessing.sequence.pad_sequences(X_val_seq, maxlen=X_train_padded.shape[1])
# Train the model with early stopping
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=10, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
y_val_pred = model.predict(X_val_padded)
y_val_pred_binary = (y_val_pred > 0.5).astype(int)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [57]:
report = classification_report(y_val, y_val_pred_binary, target_names=label_binarizer.classes_,output_dict=True)
# print("Classification Report:\n", report)

In [58]:
report_df = pd.DataFrame(report).transpose()
# Save the DataFrame to a CSV file
report_df.to_csv('classification_report_Multi.csv', index=True)

predictions_df = pd.DataFrame(y_val_pred_binary, columns=label_binarizer.classes_)
predictions_df.to_csv('multiLabel_predictions.csv', index=False)

# BERT Model (Some Issues) *

In [89]:
# Tokenize the text data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='tf', max_length=128)
X_val_encoded = tokenizer(X_val.tolist(), padding=True, truncation=True, return_tensors='tf', max_length=128)

In [90]:
# Load pre-trained BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=BinaryCrossentropy(),
                   metrics=[BinaryAccuracy()])

# Train the model with early stopping
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
bert_model.fit(X_train_encoded, y_train, validation_data=(X_val_encoded, y_val), epochs=5, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
y_val_pred_logits = bert_model.predict(X_val_encoded)
y_val_pred = tf.sigmoid(y_val_pred_logits).numpy()
y_val_pred_binary = (y_val_pred > 0.5).astype(int)

# Report precision, recall, F1-score
report = classification_report(y_val, y_val_pred_binary, target_names=label_binarizer.classes_)
print("Classification Report:\n", report)

Epoch 1/5


Exception ignored in: <function AtomicFunction.__del__ at 0x000001FC89F15940>
Traceback (most recent call last):
  File "C:\Users\ual-laptop\anaconda3\Lib\site-packages\tensorflow\python\eager\polymorphic_function\atomic_function.py", line 292, in __del__
KeyboardInterrupt: 


ValueError: Cannot generate a hashable key for IteratorSpec(({'input_ids': TensorSpec(shape=(None, 38), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 38), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 38), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None)),) because the _serialize() method returned an unsupproted value of type <class 'transformers.tokenization_utils_base.BatchEncoding'>