In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv(r"C:\Users\rexjo\Downloads\symptom-disease-train-dataset.csv")  # Using raw string

# OR alternatively
# df = pd.read_csv("C:/Users/rexjo/Downloads/symptom-disease-train-dataset.csv")

# Ensure 'text' and 'label' are strings
df['text'] = df['text'].astype(str).str.lower()
df['label'] = df['label'].astype(str).str.lower()

# Encode diseases (labels) into numerical labels
label_encoder = LabelEncoder()
df['disease_encoded'] = label_encoder.fit_transform(df['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['disease_encoded'], test_size=0.2, random_state=42)


In [33]:

print(df.columns)

Index(['text', 'label'], dtype='object')


In [41]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(df['label'].unique()))

# Tokenize the data
train_encodings = tokenizer(list(X_train), padding=True, truncation=True, max_length=64, return_tensors="pt")
test_encodings = tokenizer(list(X_test), padding=True, truncation=True, max_length=64, return_tensors="pt")
train_labels = torch.tensor(y_train.tolist())
test_labels = torch.tensor(y_test.tolist())


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
class SymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SymptomDataset(train_encodings, train_labels)
test_dataset = SymptomDataset(test_encodings, test_labels)

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [56]:
import torch
print(torch.cuda.is_available())  # Returns True if a GPU is available


False


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,  # Reduced from 3 to 1 to make training faster
    per_device_train_batch_size=2,  # Reduced from 4 to 2 for faster processing
    per_device_eval_batch_size=4,  # Reduced batch size for evaluation as well
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Step,Training Loss
