### 1. Loading preprocessed Clinical Data

In [17]:
import os
import glob
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
np.random.seed(123)

In [18]:
label_encoder = LabelEncoder()

In [19]:
training = pd.read_csv('train_data_resamples.csv')
testing = pd.read_csv('test_data_resamples.csv')

In [20]:
train_id = training['Case ID'].tolist()
test_id = testing['Case ID'].tolist()

scaler = StandardScaler()
training_scaled = scaler.fit_transform(training.drop(['Case ID','Histology'], axis = 1))
testing_scaled = scaler.transform(testing.drop(['Case ID','Histology'], axis = 1))

# replace the scaled columns
training_scaled_pd = pd.DataFrame(training_scaled)
training_scaled_pd.columns = training.drop(['Case ID','Histology'], axis = 1).columns

testing_scaled_pd = pd.DataFrame(testing_scaled)
testing_scaled_pd.columns = testing.drop(['Case ID','Histology'], axis = 1).columns

#replace column in training with the scaled columns
training_scaled_pd['Case ID'] = train_id
training_scaled_pd['Histology'] = training['Histology']

testing_scaled_pd['Case ID'] = test_id
testing_scaled_pd['Histology'] = testing['Histology']

training = training_scaled_pd
testing = testing_scaled_pd

In [21]:
PID = { 'R01-005','R01-012','R01-013','R01-014','R01-017','R01-021','R01-026','R01-027','R01-028','R01-029',
        'R01-038','R01-043','R01-046','R01-048','R01-049','R01-051','R01-052','R01-054','R01-055','R01-056',
        'R01-057','R01-059','R01-060','R01-061','R01-062','R01-063','R01-064','R01-065','R01-066','R01-067',
        'R01-068','R01-069','R01-071','R01-072','R01-073','R01-076','R01-078','R01-080','R01-081','R01-083',
        'R01-084','R01-089','R01-091','R01-093','R01-094','R01-096','R01-097','R01-098','R01-100','R01-101',
        'R01-102','R01-103','R01-104','R01-105','R01-106','R01-107','R01-108','R01-109','R01-110','R01-111',
        'R01-112','R01-113','R01-114','R01-115','R01-116','R01-117','R01-118','R01-119','R01-120','R01-121',
        'R01-122','R01-123','R01-124','R01-125','R01-126','R01-127','R01-128','R01-129','R01-130','R01-131',
        'R01-132','R01-133','R01-134','R01-135','R01-136','R01-138','R01-139','R01-140','R01-141','R01-142',
        'R01-144','R01-145','R01-146','R01-147','R01-148','R01-149','R01-151','R01-152','R01-154','R01-156',
        'R01-157','R01-158','R01-159','R01-160','LUNG-002','LUNG-004','LUNG-006','LUNG-009','LUNG-011',
        'LUNG-012','LUNG-018','LUNG-022','LUNG-030','LUNG-042','LUNG-045','LUNG-046','LUNG-047','LUNG-053',
        'LUNG-054','LUNG-061','LUNG-063','LUNG-068','LUNG-073','LUNG-078','LUNG-082','LUNG-086','LUNG-093',
        'LUNG-098','LUNG-099','LUNG-101','LUNG-104','LUNG-105','LUNG-116','LUNG-122','LUNG-135','LUNG-150',
        'LUNG-151','LUNG-173','LUNG-177','LUNG-193','LUNG-201','LUNG-202','LUNG-206','LUNG-208','LUNG-210'}

# Base directory
#CT Only
base_dir = './Lung Mask/'

# Store patient ID and corresponding image paths
image_paths_per_pid = {}

# Loop through each patient ID
for pid in PID:
    # Use glob to find all images starting with the patient ID
    image_paths = glob.glob(os.path.join(base_dir, f'{pid}*'))
    
    # Concatenate image paths with ";" and store in the dictionary
    image_paths_per_pid[pid] = ";".join(image_paths)

training['Images'] = training['Case ID'].map(image_paths_per_pid)
testing['Images'] = testing['Case ID'].map(image_paths_per_pid)

In [22]:
training['Images'] = training['Images'].str.split(';')
testing['Images'] = testing['Images'].str.split(';')
exploded_df_train = training.explode('Images')
exploded_df_test = testing.explode('Images')

In [13]:
train_data = exploded_df_train
test_data = exploded_df_test
train_data['Histology'] = exploded_df_train['Histology']
train_data['Images'] = exploded_df_train['Images']
test_data['Histology'] = exploded_df_test['Histology']
test_data['Images'] = exploded_df_test['Images']
train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=73)

In [14]:
train_data

Unnamed: 0,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,%GG,Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),...,VIM,LMO2,EGR2,BGN,COL4A1,COL5A1,COL5A2,Case ID,Histology,Images
84,0.142478,-0.520045,0.632456,0.238176,0.271694,-0.865494,-0.216051,0.848115,0.441726,-2.263846,...,-1.245929,0.051644,-1.315155,-0.914313,-0.894344,-0.720289,-0.829919,LUNG-105,1,./Lung Mask/LUNG-105_93.jpg
43,-0.399769,-0.390782,0.632456,0.238176,0.271694,0.529459,-0.442795,0.848115,0.441726,0.441726,...,-0.250353,-0.505755,-0.494601,-0.519134,-0.459968,0.332892,-0.040866,R01-115,0,./Lung Mask/R01-115_CT_78.jpg
71,1.091412,0.520355,0.632456,0.238176,0.271694,-0.151018,-0.442795,0.848115,0.441726,0.441726,...,-0.645176,0.347385,-1.164960,-0.440282,-0.639265,-0.270667,-0.085214,LUNG-054,1,./Lung Mask/LUNG-054_80.jpg
78,0.142478,2.262414,0.632456,0.238176,-1.392433,1.298577,-0.442795,0.848115,-2.263846,-2.263846,...,-0.295248,-0.480444,0.615903,-0.350723,0.695070,0.677702,0.172543,LUNG-086,1,./Lung Mask/LUNG-086_77.jpg
36,1.091412,-0.148482,0.632456,0.238176,0.271694,-0.636630,0.119566,0.848115,0.441726,0.441726,...,2.944803,1.827347,0.612770,-0.309901,2.111564,0.775107,-0.792705,R01-113,0,./Lung Mask/R01-113_CT_91.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,1.362535,-0.633082,0.632456,0.238176,0.271694,-0.247934,-0.442795,-1.179086,0.441726,0.441726,...,2.893910,-0.520695,0.012946,0.051088,0.575107,0.107287,-0.131911,R01-103,0,./Lung Mask/R01-103_CT_85.jpg
59,-0.535331,0.638638,-1.581139,0.238176,0.271694,1.050609,-0.442795,-1.179086,0.441726,0.441726,...,-0.192031,-0.572238,-0.084814,0.106664,-0.083595,0.316285,-0.147813,LUNG-006,1,./Lung Mask/LUNG-006_74.jpg
17,0.549164,-1.925348,-1.581139,0.238176,0.271694,-0.636630,2.369009,-1.179086,0.441726,0.441726,...,0.561548,-0.113635,-0.982738,-0.124799,1.083835,-0.057580,0.046347,R01-100,0,./Lung Mask/R01-100_CT_72.jpg
5,-0.942017,0.201910,0.632456,0.238176,0.271694,-0.247934,-0.442795,0.848115,0.441726,0.441726,...,1.994819,0.116045,-0.384157,-0.214552,0.312325,-0.307193,0.131372,R01-068,0,./Lung Mask/R01-068_CT_91.jpg


### 2. MedClip Model

In [15]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

# Load the MedCLIP model and processor
# Note: Replace the model and processor names with actual MedCLIP model names when available.
model = CLIPModel.from_pretrained("flaviagiammarino/pubmed-clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("flaviagiammarino/pubmed-clip-vit-base-patch32")


train_data["Histopathological Grade"] = label_encoder.fit_transform(train_data["Histopathological Grade"])

# Example: Extracting a row of features and label
example_row = train_data.iloc[0]  # Extract the first row of the dataframe
#features = torch.tensor([example_row[['Age at Histological Diagnosis', 'Smoking status', 'Pathological T stage', 'Pathological N stage', 'Pathological M stage', 'EGFR mutation status', 'Histopathological Grade', 'Recurrence','KRAS mutation status']].values])  # Replace with your actual feature column names
label = example_row['Histology']

# Extract features and ensure they are in a suitable numeric format
features_np = example_row.drop(['Case ID', 'Histology', 'Images']).values


# Ensure data is in a suitable format (e.g., float)
features_np = features_np.astype(np.float32)

# Convert to PyTorch tensor
features = torch.tensor([features_np])

# Example text and image
text = ["A patient with a history of lung cancer"]
image_path = "./Fused Lung 2 copy/R01-149_74_Fused.jpg"  # Replace with the path to your CT image
image = Image.open(image_path)

# Preprocess the text and image
inputs = processor(
    text=text, 
    images=image, 
    return_tensors="pt", 
    padding=True
)

# Forward pass
outputs = model(**inputs)

# Extract image and text features
image_features = outputs.image_embeds

# Concatenate image features and tabular data
combined_features = torch.cat((image_features, features), dim=1)


import torch
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self, input_dim):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # First hidden layer
        self.fc2 = nn.Linear(256, 128)        # Second hidden layer
        self.fc3 = nn.Linear(128, 64)         # Third hidden layer
        self.fc4 = nn.Linear(64, 32)          # Fourth hidden layer
        self.fc5 = nn.Linear(32, 1)           # Output layer
        
        # Dropout for regularization (can adjust rate as needed)
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation for first hidden layer
        x = self.dropout(x)      # Dropout for first hidden layer
        x = F.relu(self.fc2(x))  # Activation for second hidden layer
        x = self.dropout(x)      # Dropout for second hidden layer
        x = F.relu(self.fc3(x))  # Activation for third hidden layer
        x = self.dropout(x)      # Dropout for third hidden layer
        x = F.relu(self.fc4(x))  # Activation for fourth hidden layer
        x = self.dropout(x)      # Dropout for fourth hidden layer
        x = torch.sigmoid(self.fc5(x))  # Sigmoid activation for output layer
        return x


# Initialize the classifier
input_dim = combined_features.size(1)
classifier = Classifier(input_dim)

# Forward pass through the classifier
predictions = classifier(combined_features)

# Applying a threshold to get class labels
threshold = 0.5
predicted_labels = (predictions >= threshold).int()

# Convert labels to numpy array or list for further usage
predicted_labels_np = predicted_labels.detach().cpu().numpy()

# Compute accuracy by comparing predictions to actual labels
accuracy = accuracy_score([label], predicted_labels_np)
print("Accuracy:", accuracy)



Accuracy: 1.0


In [None]:
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torchvision.io import read_image


transform = transforms.Compose([
    transforms.RandomRotation(degrees=10),  # Random rotation between -10 and 10 degrees
    transforms.RandomHorizontalFlip(p=0.5),  # Horizontal flipping with probability 0.5
    transforms.RandomVerticalFlip(p=0.5),  # Vertical flipping with probability 0.5
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),  # Random cropping and resizing
    transforms.ToTensor(),  # Convert PIL image to PyTorch tensor
    #transforms.Normalize(mean=[mean], std=[std]),
])

class NSLCDataset(Dataset):
    def __init__(self, dataframe, processor, transform=None):
        self.dataframe = dataframe
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        cancer_type = 'Adenocarcinoma' if row['Histology'] == 0 else 'Squamous cell carcinoma'
        text = ["A photo of " + cancer_type + " cancer"]
        # Assuming 'Images' column contains lists of image paths
        image_paths = row['Images'].split(';')
        images = [Image.open(img_path) for img_path in image_paths]
        
        # Apply transformations if any
        if self.transform:
            images = [self.transform(image) for image in images]
        
        # Process each image and text separately and stack image tensors
        inputs_list = [self.processor(text=text, images=image, return_tensors="pt", padding='max_length', max_length=32) for image in images]

        image_tensors = torch.stack([inp['pixel_values'].squeeze(0) for inp in inputs_list], dim=0)
        # Average or sum the image features if you're using multiple images
        inputs = {
            'input_ids': inputs_list[0]['input_ids'],  # Using the text input from the first item
            'pixel_values': image_tensors.mean(dim=0).unsqueeze(0)  # Averaging the image inputs and adding batch dimension
        }
        

        features_np = row.drop(['Case ID', 'Histology', 'Images']).values.astype(np.float32)
        
        features = torch.tensor([features_np])
        label = torch.tensor(row['Histology']).float()
        return inputs, features, label


# Creating Datasets
train_dataset = NSLCDataset(train_data, processor)

valid_dataset = NSLCDataset(valid_data, processor)
test_dataset = NSLCDataset(test_data, processor)

# Creating DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training Loop
num_epochs = 5

# Use in loss function
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)


for epoch in range(num_epochs):
    classifier.train()
    running_loss = 0.0
    all_labels = []
    all_preds = []
    for inputs, features, labels in train_loader:
        optimizer.zero_grad()
        inputs['pixel_values'] = inputs['pixel_values'].squeeze(1)
        inputs['input_ids'] = inputs['input_ids'].squeeze(1)
        outputs = model(**inputs)
        image_features = outputs.image_embeds
        features = features.squeeze(1)
        combined_features = torch.cat((image_features, features), dim=1)
        predictions = classifier(combined_features)
        if (predictions.squeeze().shape != labels.shape):
            continue
        # print(predictions.squeeze().shape)
        # print(labels.shape)
        loss = criterion(predictions.squeeze(), labels)
        loss.backward()
        optimizer.step()
        all_labels.append(labels.cpu().numpy())
        all_preds.append(torch.round(predictions).cpu().detach().numpy())

        running_loss += loss.item()

    all_labels = np.concatenate(all_labels)
    all_preds = np.concatenate(all_preds)
    accuracy = accuracy_score(all_labels, all_preds)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}, Accuracy: {accuracy*100:.2f}%")

# Validation Loop
classifier.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, features, labels in valid_loader:
        inputs['pixel_values'] = inputs['pixel_values'].squeeze(1)
        inputs['input_ids'] = inputs['input_ids'].squeeze(1)
        outputs = model(**inputs)
        image_features = outputs.image_embeds
        features = features.squeeze(1)
        combined_features = torch.cat((image_features, features), dim=1)
        predictions = classifier(combined_features)
        all_preds.append(predictions.squeeze().cpu().numpy())
        all_labels.append(labels.cpu().numpy())
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
accuracy = accuracy_score(all_labels, all_preds >= 0.5)
print(f"Validation Accuracy: {accuracy*100:.2f}%")


classifier.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, features, labels in test_loader:
        inputs['pixel_values'] = inputs['pixel_values'].squeeze(1)
        inputs['input_ids'] = inputs['input_ids'].squeeze(1)
        outputs = model(**inputs)
        image_features = outputs.image_embeds
        features = features.squeeze(1)
        combined_features = torch.cat((image_features, features), dim=1)
        predictions = classifier(combined_features)
        all_preds.append(predictions.squeeze().cpu().numpy())
        all_labels.append(labels.cpu().numpy())
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
accuracy = accuracy_score(all_labels, all_preds >= 0.5)
print(f"Testing Accuracy: {accuracy*100:.2f}%")

In [None]:
classifier.eval()
all_preds = []
all_labels = []
count = 0
with torch.no_grad():
    for inputs, features, labels in test_loader:
        inputs['pixel_values'] = inputs['pixel_values'].squeeze(1)
        inputs['input_ids'] = inputs['input_ids'].squeeze(1)
        outputs = model(**inputs)
        image_features = outputs.image_embeds
        features = features.squeeze(1)
        combined_features = torch.cat((image_features, features), dim=1)
        predictions = classifier(combined_features)
        predictions_numpy = predictions.squeeze().cpu().numpy()
        labels_numpy = labels.cpu().numpy()
        all_preds.append(predictions.squeeze().cpu().numpy())
        all_labels.append(labels.cpu().numpy())
        count += 1
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
accuracy = accuracy_score(all_labels, all_preds >= 0.5)
print(f"Testing Accuracy: {accuracy*100:.2f}%")

In [None]:
from sklearn.metrics import classification_report
binary_predictions = (all_preds >= 0.5).astype(int)


print(classification_report(all_labels, binary_predictions, target_names=['Class 0', 'Class 1']))

#Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
cm = confusion_matrix(all_labels, binary_predictions)
sns.heatmap(cm, annot=True, fmt='d')

In [25]:
torch.save(model, './models/MedClip_CT_2D.pth')
torch.save(model.state_dict(), './models/MedClip_CT_2D_weights.pth')