In [47]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

In [48]:
# Read the processed PubMed MeSH multi-label CSV from the Hugging Face Hub and
# keep a fixed-seed random subset of 5000 rows (for testing purposes).
dataset_url = "hf://datasets/owaiskha9654/PubMed_MultiLabel_Text_Classification_Dataset_MeSH/PubMed Multi Label Text Classification Dataset Processed.csv"
df = pd.read_csv(dataset_url).sample(n=5000, random_state=42)

In [49]:
# Preview the first five sampled rows.
df.head(n=5)

Unnamed: 0,Title,abstractText,meshMajor,pmid,meshid,meshroot,A,B,C,D,E,F,G,H,I,J,L,M,N,Z
33553,Vertical trauma: injuries to patients who fall...,We reviewed the patterns of injuries sustained...,"['Accidental Falls', 'Accidents', 'Adolescent'...",2916780,"[['N06.850.135.122'], ['N06.850.135'], ['M01.0...","['Health Care [N]', 'Named Groups [M]', 'Organ...",0,1,1,0,0,1,0,0,1,0,0,1,1,0
9427,The influence of bilateral sagittal split ramu...,The effect of orthodontic-surgical treatment o...,"['Adolescent', 'Adult', 'Chin', 'Esthetics, De...",24946129,"[['M01.060.057'], ['M01.060.116'], ['A01.456.5...","['Named Groups [M]', 'Anatomy [A]', 'Analytica...",1,1,0,0,1,0,0,0,0,0,0,1,1,0
199,Altered metabolic incorporation of fucose and ...,Sciatic nerves of 25-week-old genetically diab...,"['Animals', 'Carbon Radioisotopes', 'Diabetes ...",6888648,"[['B01.050'], ['D01.268.150.075.328', 'D01.496...","['Organisms [B]', 'Chemicals and Drugs [D]', '...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
12447,Higher temperatures increase developmental rat...,Effects of temperature on development of Raja ...,"['Adaptation, Biological', 'Analysis of Varian...",31049955,"[['G16.012'], ['E05.318.740.150', 'N05.715.360...","['Phenomena and Processes [G]', 'Analytical, D...",0,1,0,0,1,0,1,0,0,0,0,0,1,1
39489,MR imaging of the flexed knee: comparison to t...,The aim of this study was to obtain MR images ...,"['Adolescent', 'Adult', 'Arthroscopy', 'Female...",11097414,"[['M01.060.057'], ['M01.060.116'], ['E01.370.3...","['Named Groups [M]', 'Analytical, Diagnostic a...",1,1,1,0,1,0,1,0,0,0,0,1,1,0


In [50]:
# Keep an untouched deep copy of the sampled frame so it can be restored later.
old_df = df.copy(deep=True)

In [22]:
# Restore the working frame from the saved snapshot.
df = old_df.copy(deep=True)

In [51]:
# ---------------------------------------------------------------------------
# Build the modelling frame.
#   label   : the indicator column of the target root category (`curU`)
#   feature : [token_ids, <indicator of every other root category>...]
# ---------------------------------------------------------------------------
all_categories = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
curU = 'A'  # category to predict
# Every root category except the target becomes an input feature.
categories = [c for c in all_categories if c != curU]

# Keep only the columns that are used and drop incomplete rows *before*
# tokenizing: a missing Title/abstractText would otherwise turn the
# concatenated text into NaN and make `tokenizer.encode` fail.
# `.copy()` detaches the result from the original frame.
df = df[['Title', 'abstractText', curU] + categories].dropna().copy()

# Condense Title and abstractText into a single text per row.
texts = (df['Title'] + ' ' + df['abstractText']).tolist()

# Tokenize. No truncation is applied, so the tokenizer may warn about
# sequences longer than 512 tokens; the ids are only fed to the nn.Embedding /
# Conv1d models defined in this notebook. NOTE(review): truncate here if these
# ids are ever passed to an actual BERT encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
token_ids = [tokenizer.encode(text, add_special_tokens=True) for text in texts]

# Assemble each feature row without a slow row-wise DataFrame.apply.
category_rows = df[categories].values.tolist()
features = [[ids] + flags for ids, flags in zip(token_ids, category_rows)]

# Final frame: fresh 0..n-1 index, columns `label` and `feature`.
df = pd.DataFrame({'label': df[curU].to_numpy(), 'feature': features})

# Train / validation / test split (70% / 15% / 15%) with fixed seeds.
train_size = 0.7
val_size = 0.15

train_df = df.sample(frac=train_size, random_state=200)
remaining_df = df.drop(train_df.index)
# val_size is expressed relative to the full frame, so rescale it to the
# fraction of the rows that remain after the training split.
val_df = remaining_df.sample(frac=val_size / (1 - train_size), random_state=200)
test_df = remaining_df.drop(val_df.index)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# The three splits must partition the frame exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)


Token indices sequence length is longer than the specified maximum sequence length for this model (749 > 512). Running this sequence through the model will result in indexing errors


In [None]:
class TextDataset(Dataset):
    """Dataset over a frame that has a ``feature`` and a ``label`` column.

    Each ``feature`` entry is a sequence whose first element is the list of
    token ids and whose remaining elements are the numeric category values.
    """

    def __init__(self, dataframe):
        self.features = dataframe['feature'].values
        self.labels = dataframe['label'].values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Split the row into its token ids and the trailing category values.
        token_ids, *category_values = self.features[idx]
        return (
            torch.tensor(token_ids, dtype=torch.long),         # tokenized text
            torch.tensor(category_values, dtype=torch.float),  # other categories
            torch.tensor(self.labels[idx], dtype=torch.float), # binary label
        )

def collate_fn(batch):
    """Merge ``(text, categories, label)`` samples into batch tensors.

    Returns a tuple ``(texts_padded, categories, labels)``: the token sequences
    right-padded with 0 to the longest one in the batch, the stacked category
    vectors, and the stacked labels with a trailing dimension of size 1.
    """
    token_seqs = [sample[0] for sample in batch]
    category_vecs = [sample[1] for sample in batch]
    targets = [sample[2] for sample in batch]

    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    category_matrix = torch.stack(category_vecs)
    target_column = torch.stack(targets).unsqueeze(1)
    return padded_tokens, category_matrix, target_column


# Wrap the train/validation frames in datasets ...
train_dataset, val_dataset = TextDataset(train_df), TextDataset(val_df)

# ... and batch them; only the training loader is shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

In [None]:
class TextCNN(nn.Module):
    """CNN text classifier that also consumes a vector of category features.

    Token ids are embedded, passed through one Conv1d per kernel size with
    ReLU and max-over-time pooling, concatenated with a ReLU-activated linear
    projection of the category vector, and mapped to one sigmoid output.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, kernel_sizes, dense_dim, category_dim):
        super().__init__()
        # Text branch: embedding followed by one convolution per kernel width.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        conv_layers = []
        for width in kernel_sizes:
            conv_layers.append(nn.Conv1d(embedding_dim, num_filters, width))
        self.convs = nn.ModuleList(conv_layers)
        # Category branch.
        self.category_fc = nn.Linear(category_dim, dense_dim)
        # Classifier over the concatenated text and category features.
        pooled_dim = num_filters * len(kernel_sizes)
        self.fc1 = nn.Linear(pooled_dim + dense_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, categories):
        # Conv1d expects channels first: [batch_size, embedding_dim, seq_len].
        channels_first = self.embedding(text).transpose(1, 2)

        pooled = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            pooled.append(activated.max(dim=2).values)  # max over the sequence axis
        text_features = torch.cat(pooled, dim=1)

        category_features = torch.relu(self.category_fc(categories))

        merged = torch.cat((text_features, category_features), dim=1)
        return self.sigmoid(self.fc1(merged))

# Hyperparameters for the TextCNN defined in this cell
vocab_size = tokenizer.vocab_size
embedding_dim, num_filters, dense_dim = 128, 100, 32
kernel_sizes = [3, 4, 5]
epochs = 5
# Everything after the leading token-id list in a feature row is a category feature
category_dim = len(df.loc[0, 'feature']) - 1

model = TextCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_filters=num_filters,
    kernel_sizes=kernel_sizes,
    dense_dim=dense_dim,
    category_dim=category_dim,
)

# Binary cross-entropy loss, optimized with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [None]:
class TextCNN(nn.Module):
    """Regularized CNN text classifier with an auxiliary category-feature branch.

    Compared with a plain embedding -> conv -> max-pool stack, this variant adds
    dropout on the embeddings, BatchNorm + LeakyReLU + dropout after every
    convolution, dropout on the merged features and a hidden dense layer before
    the single sigmoid output.

    Args:
        vocab_size: number of embedding rows (ignored when
            ``pretrained_embeddings`` is given).
        embedding_dim: embedding width (ignored when ``pretrained_embeddings``
            is given; the width of the supplied matrix is used instead).
        num_filters: output channels of every convolution.
        kernel_sizes: iterable of convolution kernel widths.
        dense_dim: width of the category projection and of the hidden layer.
        category_dim: length of the category feature vector.
        pretrained_embeddings: optional 2-D tensor used to initialise the
            embedding layer; it stays trainable (``freeze=False``).
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, kernel_sizes, dense_dim, category_dim, pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        else:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.embedding_dropout = nn.Dropout(0.3)

        # Take the channel count from the embedding layer itself so the
        # convolutions still match when a pretrained matrix has a different
        # width than the `embedding_dim` argument.
        conv_in_channels = self.embedding.embedding_dim

        self.convs = nn.ModuleList([
            nn.Sequential(
                # padding='same' keeps the sequence length for every kernel size
                nn.Conv1d(in_channels=conv_in_channels, out_channels=num_filters, kernel_size=k, padding='same'),
                nn.BatchNorm1d(num_filters),
                nn.LeakyReLU(),
                nn.Dropout(0.4)
            )
            for k in kernel_sizes
        ])

        self.category_fc = nn.Linear(category_dim, dense_dim)
        self.fc1 = nn.Linear(num_filters * len(kernel_sizes) + dense_dim, dense_dim)
        self.fc2 = nn.Linear(dense_dim, 1)
        self.dropout = nn.Dropout(0.6)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, categories):
        """Return one sigmoid output per sample, shape ``[batch_size, 1]``."""
        # Conv1d expects [batch_size, channels, seq_len]
        embedded = self.embedding_dropout(self.embedding(text)).permute(0, 2, 1)
        # Max over the sequence axis for each convolution block
        conv_outs = [conv(embedded).max(dim=2)[0] for conv in self.convs]
        text_features = torch.cat(conv_outs, dim=1)

        category_features = F.leaky_relu(self.category_fc(categories))

        combined_features = torch.cat([text_features, category_features], dim=1)
        combined_features = self.dropout(combined_features)
        output = self.sigmoid(self.fc2(F.leaky_relu(self.fc1(combined_features))))
        return output

# Model Parameters
vocab_size = tokenizer.vocab_size
embedding_dim = 100  # Reduced from 128
num_filters = 128  # Reduced from 200
kernel_sizes = [3, 4, 5]
dense_dim = 64
epochs = 20  # Increased from 15
category_dim = len(df['feature'][0]) - 1  # feature row = [token_ids, *category values]

# Initialize model
model = TextCNN(vocab_size, embedding_dim, num_filters, kernel_sizes, dense_dim, category_dim)

# Loss: defined here too so this cell does not depend on `criterion` having
# been created by a different cell. The model ends in a sigmoid, hence BCELoss.
criterion = nn.BCELoss()

# AdamW applies decoupled weight decay (not an L2 term added to the loss)
weight_decay = 1e-5
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=weight_decay)

# Learning rate scheduler.
# NOTE: ReduceLROnPlateau only acts when `scheduler.step(<validation loss>)`
# is called, typically once per epoch; creating it here changes nothing by
# itself. `verbose` is deprecated in recent PyTorch releases, so it is omitted.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

# Gradient clipping.
# Calling clip_grad_norm_ once at setup time is a no-op because no gradients
# exist yet; it has to run after every backward pass. Registering it as a
# pre-step hook clips right before each optimizer.step() (needs PyTorch >= 2.0).
clip_value = 1.0


def _clip_gradients(hooked_optimizer, _args, _kwargs):
    """Clip the global gradient norm of the optimizer's parameters to `clip_value`."""
    params = [p for group in hooked_optimizer.param_groups for p in group['params']]
    torch.nn.utils.clip_grad_norm_(params, clip_value)


# Keep the handle so the hook can be removed with clip_hook_handle.remove()
clip_hook_handle = optimizer.register_step_pre_hook(_clip_gradients)


In [None]:
import matplotlib.pyplot as plt

def _run_epoch(model, dataloader, criterion, optimizer=None, clip_value=None):
    """Run one pass over ``dataloader`` and return ``(mean_batch_loss, accuracy)``.

    Trains (backward pass + optimizer step) when ``optimizer`` is given;
    otherwise evaluates with gradients disabled. Both values are ``nan`` when
    the loader yields no batches.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()
    loss_total, n_batches, n_correct, n_samples = 0.0, 0, 0, 0

    with torch.set_grad_enabled(is_training):
        for texts, categories, labels in dataloader:
            if is_training:
                optimizer.zero_grad()
            outputs = model(texts, categories)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                # Clipping must sit between backward() and step() to have any effect.
                if clip_value is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
                optimizer.step()

            loss_total += loss.item()
            n_batches += 1
            # Threshold the model outputs at 0.5 to get hard predictions.
            preds = (outputs >= 0.5).float()
            n_correct += (preds == labels).sum().item()
            n_samples += labels.size(0)

    mean_loss = loss_total / n_batches if n_batches else float('nan')
    accuracy = n_correct / n_samples if n_samples else float('nan')
    return mean_loss, accuracy


def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=15,
                scheduler=None, clip_value=None):
    """Train ``model`` and report loss/accuracy on both splits after every epoch.

    Args:
        model: module called as ``model(texts, categories)``; its outputs are
            thresholded at 0.5 for the accuracy computation.
        train_dataloader: iterable of ``(texts, categories, labels)`` batches
            used for optimization.
        val_dataloader: iterable of ``(texts, categories, labels)`` batches
            used for evaluation only.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        epochs: number of passes over ``train_dataloader``.
        scheduler: optional learning-rate scheduler stepped once per epoch.
            A ``ReduceLROnPlateau`` receives the epoch's validation loss; any
            other scheduler is stepped without arguments.
        clip_value: optional maximum gradient norm, applied after every
            backward pass. ``None`` (default) disables clipping.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, four
        lists with one entry per epoch. Losses are the mean of the per-batch
        losses.
    """
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    for epoch in range(epochs):
        avg_train_loss, train_accuracy = _run_epoch(
            model, train_dataloader, criterion, optimizer=optimizer, clip_value=clip_value
        )
        avg_val_loss, val_accuracy = _run_epoch(model, val_dataloader, criterion)

        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)

        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(avg_val_loss)  # plateau detection needs the metric
            else:
                scheduler.step()

        # Print epoch results
        print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}")

    return train_losses, val_losses, train_accuracies, val_accuracies

epochs = 15

# Fit the model; every returned list holds one value per epoch
history = train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=epochs)
train_losses, val_losses, train_accuracies, val_accuracies = history

# One figure per metric: training curve vs. validation curve
curve_specs = (
    (train_losses, 'Training Loss', val_losses, 'Validation Loss', 'Loss', 'Training and Validation Loss'),
    (train_accuracies, 'Training Accuracy', val_accuracies, 'Validation Accuracy', 'Accuracy', 'Training and Validation Accuracy'),
)
for train_curve, train_label, val_curve, val_label, y_label, title in curve_specs:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(train_curve, label=train_label)
    ax.plot(val_curve, label=val_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    plt.show()


In [71]:
# Save the trained model's weights (state_dict only) to disk.
MODEL_PATH = 'text_cnn_model.pth'
torch.save(model.state_dict(), MODEL_PATH)

# Offer the file as a browser download when running on Google Colab. The
# `google.colab` package only exists there, so skip the download elsewhere
# instead of failing the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"Model saved to {MODEL_PATH} (not running in Colab; skipping download).")
else:
    files.download(MODEL_PATH)

In [None]:
def evaluate_on_test(model, test_dataloader):
    """Evaluate ``model`` on a test loader and collect per-sample results.

    Args:
        model: module called as ``model(texts, categories)``; its outputs are
            used as confidence scores and thresholded at 0.5 for predictions.
        test_dataloader: iterable of ``(texts, categories, labels)`` batches.

    Returns:
        ``(all_preds, all_labels, all_confidences)``: three flat Python lists
        with one float per sample, in loader order. The accuracy is printed.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_confidences = []

    with torch.no_grad():
        for texts, categories, labels in test_dataloader:
            # reshape(-1) always yields a 1-D tensor. squeeze() would collapse
            # a single-sample batch to a 0-d tensor whose tolist() is a bare
            # float, and list.extend() would then raise TypeError.
            outputs = model(texts, categories).reshape(-1)
            preds = (outputs >= 0.5).float()

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.reshape(-1).tolist())
            all_confidences.extend(outputs.tolist())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")

    return all_preds, all_labels, all_confidences

# Wrap the held-out frame in a dataset and an unshuffled loader
test_dataset = TextDataset(test_df)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

# Score the test set: per-sample predictions, labels and confidence scores
test_results = evaluate_on_test(model, test_dataloader)
preds, labels, confidences = test_results
