In [None]:
!pip install pandas numpy scikit-learn nltk joblib
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
import nltk
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Download NLTK resources: the stop-word corpus read by
# stopwords.words('english') and the WordNet corpus used by WordNetLemmatizer
# in the preprocessing cell.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load dataset
# Later cells read the 'text' and 'label' columns of this frame.
# NOTE(review): hard-coded absolute path (presumably a Colab upload location);
# adjust it when running elsewhere.
file_path = '/content/Train_data.csv'
data = pd.read_csv(file_path)

In [None]:
# Data Preprocessing and Cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one free-text symptom description for TF-IDF / BERT input.

    Steps, in order: lower-case; drop ``[...]`` spans; drop every token that
    contains a digit; drop URLs; drop ``<...>`` tags; strip punctuation;
    collapse all whitespace (including newlines) to single spaces; remove
    English stop words and lemmatise what is left.

    Args:
        text: The raw description. Missing values (None / NaN / pd.NA) are
            treated as an empty string; other non-string values are
            converted with ``str()``.

    Returns:
        The cleaned, space-joined string (possibly empty).
    """
    # A CSV column can hold NaN for empty cells; .lower() would raise on it.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Newlines are whitespace, not nothing: deleting them outright would glue
    # the words on either side together ("fever\ncough" -> "fevercough").
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

data['cleaned_symptom_description'] = data['text'].apply(clean_text)

In [None]:
# Feature Engineering with TF-IDF
# Vocabulary is capped at the 5000 highest-ranked terms; the sparse result is
# densified because the downstream cells work on a plain ndarray.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
cleaned_corpus = data['cleaned_symptom_description']
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_corpus).toarray()
y = data['label']

In [None]:
# Train-test split
# Split the cleaned text first and fit TF-IDF on the training rows only, so the
# vocabulary and IDF weights never see the held-out descriptions (fitting on
# the whole corpus leaks test-set statistics into the features).
# The same random_state on the same number of rows selects the same
# train/test rows as splitting the precomputed matrix would.
train_text, test_text, y_train, y_test = train_test_split(
    data['cleaned_symptom_description'], y, test_size=0.2, random_state=42)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_text).toarray()

In [None]:
# Model Training and Hyperparameter Tuning
# Both searches use 5-fold cross-validation on the training split and pick the
# configuration with the best mean accuracy.

# Logistic regression: tune the inverse regularisation strength C.
log_reg = LogisticRegression(max_iter=1000)
param_grid_log_reg = {'C': [0.1, 1, 10, 100]}
grid_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv=5, scoring='accuracy')
grid_log_reg.fit(X_train_tfidf, y_train)
best_log_reg = grid_log_reg.best_estimator_

# Random forest: tune forest size and tree depth. The forest is seeded so that
# the selected hyper-parameters and the reported scores are reproducible
# across reruns of the notebook.
rf_clf = RandomForestClassifier(random_state=42)
param_grid_rf_clf = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, 30]}
grid_rf_clf = GridSearchCV(rf_clf, param_grid_rf_clf, cv=5, scoring='accuracy')
grid_rf_clf.fit(X_train_tfidf, y_train)
best_rf_clf = grid_rf_clf.best_estimator_


In [None]:
# Define the evaluate_model function
def _macro_roc_auc(model, X_test, y_test, y_pred):
    """Return the macro-averaged one-vs-rest ROC AUC for `model` on the test set.

    Uses class probabilities when the model exposes `predict_proba`; otherwise
    falls back to one-hot encoded hard predictions built over a shared class
    list so both indicator matrices always have the same columns.
    """
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)
        if scores.shape[1] == 2:
            # Binary problems expect the score of the positive class only.
            return roc_auc_score(y_test, scores[:, 1])
        return roc_auc_score(
            y_test, scores, multi_class='ovr', average='macro', labels=model.classes_
        )

    classes = sorted(set(y_test) | set(y_pred))
    true_onehot = pd.get_dummies(pd.Categorical(y_test, categories=classes)).to_numpy(dtype=int)
    pred_onehot = pd.get_dummies(pd.Categorical(y_pred, categories=classes)).to_numpy(dtype=int)
    return roc_auc_score(true_onehot, pred_onehot, average='macro')


def evaluate_model(model, X_test, y_test):
    """Print a classification report, accuracy, ROC AUC and confusion matrix.

    Args:
        model: A fitted classifier exposing `predict` (and ideally
            `predict_proba` / `classes_`).
        X_test: Feature matrix of the held-out samples.
        y_test: True labels of the held-out samples.

    Returns:
        None. All metrics are printed.
    """
    y_pred = model.predict(X_test)
    print(f"Classification Report for {model.__class__.__name__}:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    try:
        print(f"ROC AUC: {_macro_roc_auc(model, X_test, y_test, y_pred)}")
    except ValueError as exc:
        # roc_auc_score raises e.g. when a class has no positive sample in
        # y_test; report that instead of aborting the remaining metrics.
        print(f"ROC AUC: undefined ({exc})")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Assuming X_test_tfidf and y_test are defined
# Evaluate Logistic Regression
evaluate_model(best_log_reg, X_test_tfidf, y_test)

# Evaluate Random Forest Classifier
evaluate_model(best_rf_clf, X_test_tfidf, y_test)

Classification Report for LogisticRegression:
                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       1.00      1.00      1.00         5
                    Chicken pox       0.92      1.00      0.96        12
                    Common Cold       1.00      1.00      1.00        12
                         Dengue       1.00      0.92      0.96        12
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
               Fungal infection       1.00      1.00      1.00        13
                   Hypertension       1.00      1.00      1.00        12
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       1.00      1.00      1.00        11
    

In [None]:
# Persist the tuned Random Forest and the fitted TF-IDF vectorizer with joblib
# so both can be reloaded together for inference.
# NOTE(review): the Random Forest is saved unconditionally; nothing in this
# cell compares it against best_log_reg -- confirm it really is the better
# model before relying on this artifact.
joblib.dump(best_rf_clf, 'best_rf_clf_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
# Advanced NLP Techniques with BERT
class SymptomDataset(Dataset):
    """Torch dataset that tokenises symptom descriptions on access.

    Args:
        texts: Indexable collection of descriptions.
        labels: Indexable collection of class ids, aligned with `texts`.
            Each label is passed to `torch.tensor(..., dtype=torch.long)`, so
            it must already be an integer.
        tokenizer: Object exposing a Hugging Face style `encode_plus`.
        max_len: Fixed sequence length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label of sample `idx`."""
        # Non-string cells (e.g. NaN read from a CSV) would crash the tokenizer.
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Without truncation, texts longer than max_len yield longer
            # tensors and the default batch collation fails on ragged shapes.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class BERTDiseasePredictor(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits produced per sample.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooler_output is regularised with dropout, then
        # projected onto the class logits.
        pooled = self.drop(encoded.pooler_output)
        logits = self.out(pooled)
        return logits


In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
    """Run one optimisation pass over `data_loader`.

    Each batch is a mapping with "input_ids", "attention_mask" and "label"
    tensors. Gradients are clipped to an L2 norm of 1.0 before every optimiser
    step, and the scheduler is stepped once per batch.

    Returns:
        (accuracy, mean_loss): correct predictions divided by `n_examples`
        as a double tensor, and the mean of the per-batch loss values.
    """
    model.train()

    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        token_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        logits = model(input_ids=token_ids, attention_mask=mask)
        batch_loss = loss_fn(logits, targets)

        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(logits, dim=1).indices
        n_correct += (predicted == targets).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples=None):
    """Evaluate `model` on `data_loader` without tracking gradients.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        data_loader: Iterable of batches, each a mapping with "input_ids",
            "attention_mask" and "label" tensors.
        loss_fn: Callable `loss_fn(outputs, labels)` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy. When omitted, the size of
            `data_loader.dataset` is used.

    Returns:
        (accuracy, mean_loss): accuracy as a double tensor and the mean of the
        per-batch losses (NaN when the loader yields no batches).
    """
    model = model.eval()

    if n_examples is None:
        n_examples = len(data_loader.dataset)

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    # `correct_predictions` is still a plain int when the loader was empty;
    # as_tensor makes .double() valid in that case too.
    accuracy = torch.as_tensor(correct_predictions).double() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss


In [None]:
# Set up device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
from collections import defaultdict

# Define SymptomDataset class
class SymptomDataset(Dataset):
    """Torch dataset that tokenises symptom texts and encodes their labels.

    Args:
        texts: Indexable collection of descriptions.
        labels: Indexable collection of class labels, aligned with `texts`.
        tokenizer: Object exposing a Hugging Face style `encode_plus`.
        max_len: Fixed sequence length every sample is padded/truncated to.
        label_map: Optional mapping from label to integer class id. Pass the
            same mapping to every split so a label always gets the same id.
            When omitted, the map is built from the sorted unique values of
            `labels`, which is only consistent across splits if each split
            contains every class.
    """

    def __init__(self, texts, labels, tokenizer, max_len, label_map=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_map is None:
            # Create a label map to convert string labels to integers
            label_map = {label: i for i, label in enumerate(np.unique(labels))}
        self.label_map = dict(label_map)

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return token ids, attention mask and integer label of sample `idx`."""
        text = str(self.texts[idx])
        label = self.label_map[self.labels[idx]]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }



# Constants and configurations
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 10
SEED = 42

# Seed torch so weight initialisation, dropout and batch shuffling are
# repeatable across reruns.
torch.manual_seed(SEED)

# Tokenizer initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Data splitting
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['cleaned_symptom_description'], data['label'], test_size=0.2, random_state=SEED)

# A single label -> class-id map built from the full label column. Each dataset
# otherwise derives its own map from the labels it receives, so a class that is
# absent from one split would shift the ids in that split.
label_map = {label: idx for idx, label in enumerate(np.unique(data['label']))}

# Dataset creation
train_dataset = SymptomDataset(
    texts=train_texts.to_numpy(),
    labels=train_labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_dataset = SymptomDataset(
    texts=test_texts.to_numpy(),
    labels=test_labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Make both splits use the shared mapping.
train_dataset.label_map = label_map
test_dataset.label_map = label_map

# DataLoader creation
train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

# Model initialization
model = BERTDiseasePredictor(n_classes=len(label_map))
model = model.to(device)

# Optimizer and scheduler initialization
optimizer = optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
# NOTE(review): with step_size == total_steps the learning rate only decays
# after the final optimisation step, so it is effectively constant throughout.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)
criterion = nn.CrossEntropyLoss().to(device)

# History tracking
history = defaultdict(list)
best_accuracy = 0

# Training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    model.train()
    epoch_losses = []

    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        scheduler.step()
        epoch_losses.append(loss.item())

    # Evaluation on test set after each epoch. eval_model switches the model to
    # eval mode and disables gradients itself; its signature is
    # (model, data_loader, loss_fn, device, n_examples), so `device` must be
    # passed before the example count.
    test_acc, test_loss = eval_model(
        model, test_data_loader, criterion, device, len(test_dataset)
    )
    # Store a plain float rather than a (possibly GPU-resident) tensor.
    test_acc = float(test_acc)

    # Save best model based on test accuracy
    if test_acc > best_accuracy:
        best_accuracy = test_acc
        torch.save(model.state_dict(), 'best_model.pth')

    # Record history: mean training loss over the epoch, not just the loss of
    # the last batch.
    history['train_loss'].append(np.mean(epoch_losses))
    history['test_loss'].append(test_loss)
    history['test_accuracy'].append(test_acc)

# Final message
print("Training completed. Your model is now ready for inference.")
