In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification, TextClassificationPipeline

In [2]:
# Load the symptom-description dataset; the relative path is resolved against
# the notebook's working directory.
DATA_FILE = 'Symptom2Disease.csv'
df = pd.read_csv(DATA_FILE)
df

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...,...
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."


In [3]:
# Build the integer <-> disease-name lookup tables and encode the label column.
# Class ids follow the order in which each disease first appears in the data.
#
# The guard keeps this cell idempotent: once `label` holds integer ids, running
# the cell a second time must not rebuild the lookups from those ids, which
# would replace the disease names stored in `int2label` with the ids themselves.
# dtype.kind is 'i'/'u' only for signed/unsigned integer columns.
if df['label'].dtype.kind not in 'iu':
    int2label = dict(enumerate(df['label'].unique()))
    label2int = {disease: idx for idx, disease in int2label.items()}
    df['label'] = df['label'].map(label2int)

In [4]:
# Hold out 10% of the samples for validation, stratified on the label so both
# splits keep the same class proportions. The fixed seed makes the split
# identical on every Restart & Run All; without it the validation set changes
# from run to run and validation scores are not comparable.
SPLIT_SEED = 42
X, y = df['text'].values, df['label'].values
train_x, val_x, train_y, val_y = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SPLIT_SEED
)

In [5]:
# Tokenize both splits with the DistilBERT vocabulary.
#
# padding=True pads each call only up to its longest sequence, whereas
# padding="max_length" (with no max_length given) pads every sample to the
# model's maximum input length, so every training step pays for padding
# tokens. The real tokens and the attention mask are the same either way.
# truncation=True still caps over-long inputs at the model maximum.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(list(train_x), padding=True, truncation=True)
val_encodings = tokenizer(list(val_x), padding=True, truncation=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
BATCH_SIZE = 8

# Training pipeline: (encodings dict, label) pairs, reshuffled every epoch and
# then batched. Keras does not shuffle a tf.data.Dataset passed to fit(), so
# without the explicit shuffle every epoch would see the same batches in the
# same order. The buffer spans the whole training set, giving a full shuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_y))
    .shuffle(buffer_size=len(train_y))
    .batch(BATCH_SIZE)
)

# Validation pipeline: order is irrelevant for evaluation, so it is only batched.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_y)).batch(BATCH_SIZE)

In [7]:
# Load pretrained DistilBERT with a freshly initialised classification head
# sized to the number of diseases.
#
# id2label / label2id are stored in the model config so that anything reading
# the config (the prediction pipeline in particular) reports disease names
# instead of the generic "LABEL_<n>" placeholders.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(int2label),
    id2label=int2label,
    label2id=label2int,
)

# Labels are integer class ids and the model emits raw logits, hence the
# sparse categorical cross-entropy with from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    metrics=['accuracy']
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Fine-tune for a fixed number of passes over the training set, evaluating on
# the validation set after each epoch. `history` keeps the per-epoch metrics.
EPOCHS = 3
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
)

# Predictions
# Wrap the fine-tuned model and its tokenizer for raw-text inference; top_k is
# set to the number of classes so that no class is cut from the results.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    top_k=len(int2label),
)

Epoch 1/3

In [None]:
# Hand-written symptom descriptions used to spot-check the classifier. Each one
# is paired with how many leading entries of its result should be printed.
samples = [
    ("I am experiencing rashes on my skin. It is itchy and is now swelling. Even my sking is starting to peel. ", 1),
    ("I have constipation and belly pain, and it's been really uncomfortable. The belly pain has been getting worse and is starting to affect my daily life. Moreover, I get chills every night, followed by a mild fever.", 2),
    ("I'm sneezing and can't stop coughing. I have a mucus overflow and my sinuses are totally closed. I can't smell anything, and my chest hurts. My muscles are quite sore as well.", 1),
    ("I've been cold and worn out, and I haven't been able to stop coughing. My chest hurts and my heart races when I do that. I'm coughing up horrible dark mucus.", 1),
    ("I've also experienced chills, vomiting, and intense itching in addition to a high fever. Along with a headache and a lot of perspiration, I've also been experiencing nausea and muscle pain.", 1),
    ("I have breathing issues and am easily out of breath. My eyes hurt, my face and body are bloated, and I have a dry, hacking cough.", 1),
    ("My whole body is shaking and trembling. I can't smell or taste anymore, and I'm exhausted. My heart sometimes races or I have palpitations.", 1),
    ("I often feel like food is getting caught in my throat and have problems swallowing. I feel bloated and frequently belch. I constantly have a bitter aftertaste.", 1),
]

# Run every description through the pipeline first, one call per text, and keep
# each result available under its own name.
pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8 = [
    pipe(sample_text) for sample_text, shown in samples
]

# Then print the leading entries of each result, in the same order.
all_preds = (pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8)
for sample_pred, (sample_text, shown) in zip(all_preds, samples):
    print(sample_pred[0][:shown])