In [1]:
!pip install tensorflow transformers



In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [5]:
!pip install huggingface_hub




In [3]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report
from huggingface_hub import notebook_login
from datasets import load_dataset



In [4]:

# Load the dataset
dataset = load_dataset("gretelai/symptom_to_diagnosis")


# Convert train_dataset to DataFrame
df_train = pd.DataFrame(dataset["train"])
df_test = pd.DataFrame(dataset["test"])

Downloading readme:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
df_train['input_text'] = df_train['input_text'].astype(str)
df_test['input_text'] = df_test['input_text'].astype(str)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
df_train['encoded_labels'] = label_encoder.fit_transform(df_train['output_text'])
df_test['encoded_labels'] = label_encoder.transform(df_test['output_text'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df_train, test_size=0.1, stratify=df_train['output_text'])

# Tokenize the text data and apply preprocessing
max_length = 150
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenized_train = tokenizer(list(train_df['input_text']), padding=True, truncation=True, max_length=max_length, return_tensors="tf")
tokenized_val = tokenizer(list(val_df['input_text']), padding=True, truncation=True, max_length=max_length, return_tensors="tf")

# Convert labels to one-hot encoded format
Y_train_encoded = tf.keras.utils.to_categorical(train_df['encoded_labels'], num_classes=len(label_encoder.classes_))
Y_val_encoded = tf.keras.utils.to_categorical(val_df['encoded_labels'], num_classes=len(label_encoder.classes_))

# Create TensorFlow datasets
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_train), Y_train_encoded)).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_val), Y_val_encoded)).batch(BATCH_SIZE)

# Define the model
num_classes = len(label_encoder.classes_)
int2label = dict(enumerate(label_encoder.classes_))
label2int = {v: k for k, v in int2label.items()}

model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_classes,
    id2label=int2label,
    label2id=label2int,
    output_attentions=True
)

model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=3e-5),
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20
)

# Evaluate the model on the test set
tokenized_test = tokenizer(list(df_test['input_text']), padding=True, truncation=True, max_length=max_length, return_tensors="tf")
Y_test_encoded = tf.keras.utils.to_categorical(df_test['encoded_labels'], num_classes=num_classes)

test_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_test), Y_test_encoded)).batch(BATCH_SIZE)

test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# Print the classification report for more detailed metrics
predictions = model.predict(test_dataset)
predicted_labels = tf.argmax(predictions.logits, axis=1)
true_labels = tf.argmax(tf.constant(Y_test_encoded), axis=1)

print(classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_))

notebook_login()
model.push_to_hub("Zabihin/Symptom_to_Diagnosis", use_auth_token="********")
tokenizer.push_to_hub("Zabihin/Symptom_to_Diagnosis", use_auth_token="****************")

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.22889424860477448, Test Accuracy: 0.9339622855186462
                                 precision    recall  f1-score   support

                        allergy       0.89      0.80      0.84        10
                      arthritis       1.00      1.00      1.00        10
               bronchial asthma       0.83      1.00      0.91        10
           cervical spondylosis       0.91      1.00      0.95        10
                    chicken pox       1.00      1.00      1.00        10
                    common cold       0.83      1.00      0.91        10
                         dengue       0.90      0.90      0.90        10
                       diabetes       1.00      0.80      0.89        10
                  drug reaction       0.86   

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



tf_model.h5:   0%|          | 0.00/434M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/Zabihin/Symptom_to_Diagnosis/commit/c629fbc8f5c37059ee33d144c0005a838f11a0bc', commit_message='Upload tokenizer', commit_description='', oid='c629fbc8f5c37059ee33d144c0005a838f11a0bc', pr_url=None, pr_revision=None, pr_num=None)