In [None]:
!pip install tensorflow transformers



In [None]:
# Mount Google Drive; the dataset and the saved model are read from / written
# to paths below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report

In [None]:
# Load the train / test splits. Both files are read with lines=True, i.e. as
# JSON Lines (one JSON record per line).
train_path = '/content/drive/MyDrive/Diagnosis/train.jsonl'
test_path = '/content/drive/MyDrive/Diagnosis/test.jsonl'

df_train = pd.read_json(train_path, lines=True)
df_test = pd.read_json(test_path, lines=True)


In [None]:
# Encode labels using LabelEncoder.
# The encoder is fitted on the training targets only; the same fitted mapping
# is then applied to both splits to produce integer class ids.
label_encoder = LabelEncoder().fit(df_train['output_text'])
for labelled_df in (df_train, df_test):
    labelled_df['encoded_labels'] = label_encoder.transform(labelled_df['output_text'])


In [None]:
# Text preprocessing function
def preprocessing(text):
    """Return `text` with tag-like spans removed, lowercased, whitespace-normalised.

    Every ``<...>`` span (a ``<``, one or more non-``>`` characters, then ``>``)
    is replaced by a single space. The result is lowercased, split on any
    whitespace and re-joined with single spaces, which also strips leading and
    trailing whitespace.
    """
    tag_free = re.sub(r'<[^>]+>', ' ', text)
    return ' '.join(tag_free.lower().split())

# Apply text preprocessing to train and test data.
# Values are cast to str first because preprocessing() feeds them to re.sub.
for text_df in (df_train, df_test):
    text_df['input_text'] = text_df['input_text'].astype(str).apply(preprocessing)


In [None]:
# Split the data into train and validation sets (90% / 10%, stratified by label).
# A fixed seed makes the split repeatable across "Restart & Run All";
# set_random_seed also seeds the Python, NumPy and TensorFlow RNGs, which
# affects the newly initialised classifier head, dropout and shuffling.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

train_df, val_df = train_test_split(
    df_train,
    test_size=0.1,
    stratify=df_train['output_text'],
    random_state=SEED,
)

# Tokenize the text data
max_length = 150
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Identical encoding options for both splits: pad, truncate to at most
# `max_length` tokens, and return TensorFlow tensors.
encode_kwargs = dict(padding=True, truncation=True, max_length=max_length, return_tensors="tf")
tokenized_train = tokenizer(train_df['input_text'].tolist(), **encode_kwargs)
tokenized_val = tokenizer(val_df['input_text'].tolist(), **encode_kwargs)

# Convert labels to one-hot encoded format.
# The width is fixed to the number of classes known to the fitted encoder, so
# both arrays have the same number of columns.
n_label_classes = len(label_encoder.classes_)
Y_train_encoded, Y_val_encoded = (
    tf.keras.utils.to_categorical(split_df['encoded_labels'], num_classes=n_label_classes)
    for split_df in (train_df, val_df)
)

# Create TensorFlow datasets
BATCH_SIZE = 64

# Keras ignores fit(shuffle=...) when it is given a tf.data.Dataset, so the
# training examples are shuffled here, before batching. The buffer covers the
# whole training set, and the order is reshuffled on every epoch by default.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(tokenized_train), Y_train_encoded))
    .shuffle(buffer_size=len(Y_train_encoded))
    .batch(BATCH_SIZE)
)

# Validation is only evaluated, so its order is left untouched.
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_val),
    Y_val_encoded
)).batch(BATCH_SIZE)

# Define the model
num_classes = len(label_encoder.classes_)

# Class id <-> class name lookups, handed to the model as id2label / label2id.
int2label = {class_id: class_name for class_id, class_name in enumerate(label_encoder.classes_)}
label2int = {class_name: class_id for class_id, class_name in int2label.items()}

# Pretrained BERT encoder with a classification head sized to `num_classes`.
model_options = dict(
    num_labels=num_classes,
    id2label=int2label,
    label2id=label2int,
    output_attentions=True,
)
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", **model_options)

# Targets are one-hot vectors and the loss is told to expect raw logits
# (from_logits=True), so no softmax is applied before the loss.
adam = keras.optimizers.Adam(learning_rate=3e-5)
cross_entropy = keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=cross_entropy, metrics=['accuracy'])

# Train the model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=20)



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate the model on the test set.
# df_test['input_text'] was already cleaned with preprocessing() and
# df_test['encoded_labels'] was already produced by label_encoder.transform()
# in the earlier cells, so both columns are reused here instead of being
# recomputed a second time.
test_x = df_test['input_text']
tokenized_test = tokenizer(list(test_x), padding=True, truncation=True, max_length=max_length, return_tensors="tf")
Y_test_encoded = tf.keras.utils.to_categorical(df_test['encoded_labels'], num_classes=num_classes)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_test),
    Y_test_encoded
)).batch(BATCH_SIZE)

# evaluate() returns the loss followed by the compiled metrics (accuracy).
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Test Loss: 0.14306169748306274, Test Accuracy: 0.9622641801834106


In [None]:
# Print the classification report for more detailed per-class metrics.
predictions = model.predict(test_dataset)
predicted_labels = np.argmax(predictions.logits, axis=1)
# Y_test_encoded is one-hot, so argmax recovers the integer class ids.
true_labels = np.argmax(Y_test_encoded, axis=1)

# `labels` is passed explicitly so the report rows stay aligned with
# `target_names` even if some class is absent from both the test targets and
# the predictions; without it scikit-learn raises a ValueError in that case.
print(classification_report(
    true_labels,
    predicted_labels,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

                                 precision    recall  f1-score   support

                        allergy       1.00      1.00      1.00        10
                      arthritis       1.00      1.00      1.00        10
               bronchial asthma       0.91      1.00      0.95        10
           cervical spondylosis       0.91      1.00      0.95        10
                    chicken pox       0.90      0.90      0.90        10
                    common cold       1.00      1.00      1.00        10
                         dengue       0.89      0.80      0.84        10
                       diabetes       0.91      1.00      0.95        10
                  drug reaction       1.00      1.00      1.00         8
               fungal infection       1.00      1.00      1.00         9
gastroesophageal reflux disease       1.00      0.90      0.95        10
                   hypertension       0.91      1.00      0.95        10
                       impetigo       1.00      1.

In [None]:
# Save the model and weights
SAVE_DIR = '/content/drive/MyDrive/Diagnosis/bert'
model.save_pretrained(SAVE_DIR)
# Also store the tokenizer files in the same directory so the checkpoint is
# self-contained: inference in a fresh session can then load the matching
# tokenizer from SAVE_DIR instead of relying on kernel state.
tokenizer.save_pretrained(SAVE_DIR)


In [None]:
# Reload the fine-tuned classifier from the Drive directory used by the save cell.
bert_checkpoint_dir = '/content/drive/MyDrive/Diagnosis/bert'
loaded_model = TFAutoModelForSequenceClassification.from_pretrained(bert_checkpoint_dir)


Some layers from the model checkpoint at /content/drive/MyDrive/Diagnosis/bert were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/Diagnosis/bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# Free-text symptom description used as the sample query for the inference cell.
input_text = (
    "I have a hard time swallowing and I feel like food is getting stuck in my throat. "
    "I belch a lot and feel bloated. "
    "I have a bad taste in my mouth all the time."
)


In [None]:
# Preprocess the input text (same cleaning as the training data).
preprocessed_input = preprocessing(input_text)

# Tokenize the input text
tokenized_input = tokenizer(preprocessed_input, padding=True, truncation=True, max_length=max_length, return_tensors="tf")

# Make predictions
predictions = loaded_model.predict(dict(tokenized_input))


# Get predicted probabilities (softmax over the class axis of the logits).
predicted_probabilities = tf.nn.softmax(predictions.logits, axis=1).numpy()

# Get the two most probable classes and their probabilities.
# Label names come from the reloaded model's own config rather than the
# training-time `int2label` global, so the predictions are decoded with the
# mapping that was saved together with the weights.
id2label = loaded_model.config.id2label
# argsort is ascending; reverse it and keep the first two indices.
top_classes_indices = np.argsort(predicted_probabilities[0])[::-1][:2]
top_classes = [id2label[int(i)] for i in top_classes_indices]
top_probabilities = [predicted_probabilities[0][i] for i in top_classes_indices]

print(f'The two most probable classes are: {top_classes}')
print(f'With corresponding probabilities: {top_probabilities}')


The two most probable classes are: ['peptic ulcer disease', 'gastroesophageal reflux disease']
With corresponding probabilities: [0.8459054, 0.08260557]
