## Neural Network

# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertModel
from joblib import Parallel, delayed
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch.multiprocessing as mp
from dataset import DiseaseSymptomDataset

  from .autonotebook import tqdm as notebook_tqdm


## Data retrieval

In [2]:
import pandas as pd

# Load the disease/symptom table; the file is looked up relative to the
# notebook's working directory.
df = pd.read_csv("Final_Augmented_dataset_Diseases_and_Symptoms.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


         diseases  anxiety and nervousness  depression  shortness of breath  \
0  panic disorder                        1           0                    1   
1  panic disorder                        0           0                    1   
2  panic disorder                        1           1                    1   
3  panic disorder                        1           0                    0   
4  panic disorder                        1           1                    0   

   depressive or psychotic symptoms  sharp chest pain  dizziness  insomnia  \
0                                 1                 0          0         0   
1                                 1                 0          1         1   
2                                 1                 0          1         1   
3                                 1                 0          1         1   
4                                 0                 0          0         1   

   abnormal involuntary movements  chest tightness  ... 

In [13]:
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a single linear classification head.

    Args:
        num_labels: Number of output classes (width of the linear head).
        model_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.
            Defaults to the checkpoint this notebook originally hard-coded.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_labels, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded encoder's config rather than a literal
        # 768, so a checkpoint with a different hidden width still fits.
        self.linear = nn.Linear(self.bert.config.dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class scores for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Output of the linear head applied to the encoder's hidden state at
            sequence position 0 (after dropout). No softmax is applied.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        pooled_output = outputs.last_hidden_state[:, 0]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

In [4]:
# Show the column labels of `df`, the DataFrame read from the CSV in the
# data-loading cell above.
print(df.columns)

Index(['diseases', 'anxiety and nervousness', 'depression',
       'shortness of breath', 'depressive or psychotic symptoms',
       'sharp chest pain', 'dizziness', 'insomnia',
       'abnormal involuntary movements', 'chest tightness',
       ...
       'stuttering or stammering', 'problems with orgasm', 'nose deformity',
       'lump over jaw', 'sore in nose', 'hip weakness', 'back swelling',
       'ankle stiffness or tightness', 'ankle weakness', 'neck weakness'],
      dtype='object', length=378)


In [14]:
def train_epoch(model, dataloader, optimizer, criterion, scaler, device, use_cuda):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        scaler: Gradient scaler; only touched when ``use_cuda`` is true.
        device: Device every batch tensor is moved to.
        use_cuda: When true, the forward pass runs under ``autocast`` and the
            backward/step goes through ``scaler``.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        if not use_cuda:
            # Plain full-precision step.
            loss = criterion(model(ids, mask), targets)
            loss.backward()
            optimizer.step()
        else:
            # Mixed-precision step: forward under autocast, then let the
            # scaler drive backward, the optimizer step and its own update.
            with autocast():
                loss = criterion(model(ids, mask), targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(dataloader)

In [15]:
def eval_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``; must support ``len()`` and
            expose a sized ``dataset`` attribute.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the summed per-batch loss divided by
        ``len(dataloader)``, and the number of correct predictions divided by
        ``len(dataloader.dataset)``.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, mask)
            loss_sum += criterion(logits, targets).item()

            # Predicted class = index of the largest score along dim 1.
            predicted = torch.max(logits, dim=1).indices
            hits += (predicted == targets).sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = hits / len(dataloader.dataset)
    return mean_loss, accuracy


In [16]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 from raw predictions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` may itself be a
            tuple, in which case only its first element is used; the predicted
            class is the argmax over the last axis.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    scores, references = p
    scores = scores[0] if isinstance(scores, tuple) else scores
    predicted = np.argmax(scores, axis=-1)

    # Index 0/1/2 of the result are precision/recall/F1; the 4th is unused.
    prf = precision_recall_fscore_support(references, predicted, average='weighted')
    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [17]:
def main():
    """Fine-tune ``DistilBERTClassifier`` to predict the disease from symptom text.

    Pipeline:
      1. Read the CSV and turn each row's symptom indicator columns into a
         lower-cased text string listing the symptoms that are present.
      2. Label-encode the disease column and make an 80/20 train/validation
         split (``random_state=42``).
      3. Tokenize both splits in parallel and pad each split to one length.
      4. Train for three epochs, printing train loss, validation loss and
         validation accuracy after each epoch.
    """
    df = pd.read_csv("Final_Augmented_dataset_Diseases_and_Symptoms.csv")
    disease_column = 'diseases'
    # Every column other than the target is treated as a symptom indicator.
    symptom_columns = df.columns.drop(disease_column)

    # Build the text from the NAMES of the symptoms flagged with 1. Joining the
    # cell values themselves (the previous behaviour) produced strings of
    # "0"/"1" digits that carry no symptom words for the language model.
    symptom_names = np.asarray(symptom_columns)
    present = df[symptom_columns].eq(1).to_numpy()
    df['symptom'] = [', '.join(symptom_names[row_mask]) for row_mask in present]
    # .copy() so the column assignment below writes to an independent frame.
    df = df.dropna(subset=['symptom', disease_column]).copy()
    df['text'] = df['symptom'].str.lower()

    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df[disease_column])

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42)

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def tokenize_batch(texts, tokenizer):
        """Tokenize one chunk of texts, truncated to 512 tokens, without padding."""
        # Padding is deferred: chunks are tokenized independently, so padding
        # here would leave different chunks at different sequence lengths.
        return tokenizer(texts, truncation=True, padding=False, max_length=512)

    def batch_tokenize_parallel(texts, tokenizer, n_jobs=4):
        """Tokenize ``texts`` in ``n_jobs`` chunks and pad all rows to one length.

        Returns a dict with ``'input_ids'`` and ``'attention_mask'``, each a
        list of equal-length lists in the same order as ``texts``.
        """
        texts = texts.tolist()
        if not texts:
            return {'input_ids': [], 'attention_mask': []}
        # Ceiling division, floored at 1: a chunk size of 0 (fewer texts than
        # jobs) would make range() raise on a zero step.
        chunk_size = max(1, -(-len(texts) // n_jobs))
        results = Parallel(n_jobs=n_jobs)(
            delayed(tokenize_batch)(texts[i:i + chunk_size], tokenizer)
            for i in range(0, len(texts), chunk_size))
        input_ids = [list(item) for chunk in results for item in chunk['input_ids']]
        attention_mask = [list(item) for chunk in results for item in chunk['attention_mask']]

        # Right-pad every sequence to the longest one in this split so all rows
        # can later be stacked into fixed-shape batches; padded positions get
        # attention-mask value 0.
        max_len = max(len(ids) for ids in input_ids)
        pad_id = tokenizer.pad_token_id
        input_ids = [ids + [pad_id] * (max_len - len(ids)) for ids in input_ids]
        attention_mask = [mask + [0] * (max_len - len(mask)) for mask in attention_mask]
        return {'input_ids': input_ids, 'attention_mask': attention_mask}

    train_encodings = batch_tokenize_parallel(train_texts, tokenizer)
    val_encodings = batch_tokenize_parallel(val_texts, tokenizer)

    train_encodings['labels'] = train_labels.tolist()
    val_encodings['labels'] = val_labels.tolist()

    print("Tokenization completed successfully.")

    # NOTE(review): DiseaseSymptomDataset comes from the local `dataset` module
    # and is assumed to accept a dict of equal-length lists -- confirm.
    train_dataset = DiseaseSymptomDataset(train_encodings)
    val_dataset = DiseaseSymptomDataset(val_encodings)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Pinned host memory only helps host-to-GPU copies, so enable it only when
    # a GPU is actually in use.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8, pin_memory=use_cuda)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=8, pin_memory=use_cuda)

    num_labels = len(label_encoder.classes_)
    model = DistilBERTClassifier(num_labels)
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)
    criterion = nn.CrossEntropyLoss()

    # The scaler is a no-op when disabled, so one object serves both devices.
    scaler = GradScaler(enabled=use_cuda)

    num_epochs = 3
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, device, use_cuda)
        val_loss, val_accuracy = eval_epoch(model, val_loader, criterion, device)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")

In [18]:
# Entry point: configure the multiprocessing start method, then run training.
if __name__ == '__main__':
    # Request the 'spawn' start method before main() creates its DataLoaders,
    # which use worker processes (num_workers=8).
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        # Best effort: if the start method cannot be set, continue with
        # whatever method is already in effect.
        pass
    main()


Tokenization completed successfully.
