In [1]:
!pip install tensorflow matplotlib numpy




In [None]:
#Typical Libraries

import pandas as pd #Pandas for DataFrame's
import numpy as np #Numpy for math
from PIL import Image #Pillow for image processing
from pathlib import Path #Pathlib as an OS replacement for paths
import matplotlib.pyplot as plt #Matplotlib for plotting data

#Tracking progress and time

import time
from datetime import datetime #To get the current time for a timestamp
from zoneinfo import ZoneInfo #To set my timezone
from tqdm import tqdm #tdqm for progress bars

#Possible Viz Enhancements

import seaborn as sns #Seaborn for advanced plotting (tbd)
from tabulate import tabulate #Tabulate for pretty tables (tbd)

#Machine Learning with TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split #Scikit-learn for the train/test split

In [None]:
#CNN Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator #For data augmentation

#Constants for the model
# IMAGE_SIZE is the default `size` that load_and_match_images passes to Image.resize.
IMAGE_SIZE = (128, 128)
# Root folder that process_large_folders searches for "<dummy_id>_ct_images" sub-folders.
DATASET_PATH = Path("../Data/data_images")
# CSV file read into `df` below.
CSV_PATH = Path("../Data/data_sheet.csv")

# Load the CSV, report its (rows, columns) shape and preview the first rows.
df = pd.read_csv(CSV_PATH)
print("CSV shape:", df.shape)
df.head()

: 

In [None]:
import gc
import psutil
import os
from PIL import Image

def load_and_match_images(image_path, size=IMAGE_SIZE):
    """Load one image file as a normalized RGB array.

    Args:
        image_path: Path of the image file to open.
        size: Target size handed to ``Image.resize``; defaults to ``IMAGE_SIZE``.

    Returns:
        NumPy array of the resized RGB image (three channels) with every pixel
        value divided by 255.0, i.e. scaled to [0, 1].
    """
    # The context manager closes the file handle as soon as the pixels have been
    # read, instead of leaving that to garbage collection while many files are
    # processed in a row. convert() returns an independent in-memory image, so it
    # stays usable after the source file is closed.
    with Image.open(image_path) as source:
        resized = source.convert("RGB").resize(size)
    return np.array(resized) / 255.0  # Normalize to [0, 1]


def get_memory_usage():
    """Return the resident set size of this process as text, e.g. ``"188.97 MB"``."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return "{:.2f} MB".format(rss_bytes / 1024**2)

# Adaptive garbage collection if memory exceeds threshold
def adaptive_gc(threshold_gb=35):
    """Run ``gc.collect()`` when this process's resident memory exceeds ``threshold_gb``.

    Memory is measured as RSS divided by 1024**3. Nothing happens (and nothing
    is printed) while usage is at or below the threshold.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem_gb = rss_bytes / 1024**3
    if not mem_gb > threshold_gb:
        return
    print(f"RAM at {mem_gb:.2f} GB — triggering garbage collection")
    gc.collect()

def _load_folder_images(folder_path, memory_threshold_gb):
    """Load every ``*.jpg`` directly inside ``folder_path``, in sorted path order."""
    images = []
    for position, img_path in enumerate(sorted(folder_path.glob("*.jpg")), start=1):
        images.append(load_and_match_images(img_path))
        # Inside one very large folder, check memory pressure every 1000 images.
        if position % 1000 == 0:
            adaptive_gc(threshold_gb=memory_threshold_gb)
    return images


def process_large_folders(df, dataset_path, batch_size=1000, memory_threshold_gb=35):
    """Load the image folder that belongs to each row of ``df``, batch by batch.

    For every row the folder ``dataset_path / (str(row['dummy_id']) + "_ct_images")``
    is looked up and all of its ``*.jpg`` files are loaded with
    ``load_and_match_images``. Progress and memory usage are printed per batch.

    Args:
        df: DataFrame with a ``dummy_id`` column; one row per image folder.
        dataset_path: ``Path`` of the directory that contains the image folders.
        batch_size: Number of rows handled between two progress reports.
        memory_threshold_gb: Threshold forwarded to ``adaptive_gc``.

    Returns:
        List of ``(row, images)`` tuples, one per row whose folder exists and
        contains at least one ``.jpg``. ``row`` is the Series yielded by
        ``DataFrame.iterrows`` and ``images`` is a list of image arrays.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    matched_data = []
    missing_folders = 0
    empty_folders = 0
    total_rows = len(df)
    start = time.time()
    total_batches = (total_rows - 1) // batch_size + 1  # 0 when df is empty

    print("🚀 Starting Processing")
    print(f"Total rows: {total_rows} | Batch size: {batch_size} | Total batches: {total_batches}")
    print(f"Initial memory usage: {get_memory_usage()}")
    print("-" * 60)

    for batch_start in range(0, total_rows, batch_size):
        batch_end = min(batch_start + batch_size, total_rows)
        batch_df = df.iloc[batch_start:batch_end]

        batch_num = batch_start // batch_size + 1
        print(f"\n Batch {batch_num}/{total_batches} (Rows {batch_start}-{batch_end-1})")
        print(f"Memory before batch: {get_memory_usage()}")

        # NOTE(review): iterrows() builds one Series per row, so a numeric dummy_id
        # can be upcast (e.g. 12 -> 12.0) when every column is numeric — confirm
        # the folder names still match for this CSV.
        rows = tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing Batch {batch_num}")
        for idx, (_, row) in enumerate(rows):
            folder_path = dataset_path / (str(row['dummy_id']) + "_ct_images")

            if not folder_path.exists():
                missing_folders += 1
                # Only the first five rows of a batch are reported to keep output short.
                if idx < 5:
                    print(f"  ⚠️ Missing folder: {folder_path}")
                continue

            images = _load_folder_images(folder_path, memory_threshold_gb)
            if images:
                # The list is stored directly: no defensive copy and no per-folder
                # gc.collect() are needed, one collection per batch is enough.
                matched_data.append((row, images))
            else:
                empty_folders += 1

        gc.collect()

        print(f"Batch {batch_num} complete | Memory now: {get_memory_usage()} | Total matched: {len(matched_data)}")

    elapsed = time.time() - start
    print("\nPROCESSING COMPLETE")
    # max(..., 1) avoids a ZeroDivisionError when df has no rows.
    print(f"Total time: {elapsed:.2f}s | Avg per batch: {elapsed / max(total_batches, 1):.2f}s")
    print(f"Final memory usage: {get_memory_usage()} | Total matched folders: {len(matched_data)}")
    if missing_folders or empty_folders:
        print(f"Skipped rows: {missing_folders} missing folder(s), {empty_folders} folder(s) without .jpg files")
    print("=" * 60)

    return matched_data
# Run the loader over every row of df, 50 rows per batch. matched_data receives
# one (row, images) tuple for each row whose image folder contained .jpg files.
matched_data = process_large_folders(df, DATASET_PATH, batch_size=50)


🚀 Starting Processing
Total rows: 335 | Batch size: 50 | Total batches: 7
Initial memory usage: 188.97 MB
------------------------------------------------------------

 Batch 1/7 (Rows 0-49)
Memory before batch: 185.22 MB


Processing Batch 1: 100%|██████████| 50/50 [01:07<00:00,  1.36s/it]


Batch 1 complete | Memory now: 297.70 MB | Total matched: 50

 Batch 2/7 (Rows 50-99)
Memory before batch: 297.83 MB


Processing Batch 2: 100%|██████████| 50/50 [01:23<00:00,  1.66s/it]


Batch 2 complete | Memory now: 288.38 MB | Total matched: 100

 Batch 3/7 (Rows 100-149)
Memory before batch: 288.52 MB


Processing Batch 3: 100%|██████████| 50/50 [01:13<00:00,  1.47s/it]


Batch 3 complete | Memory now: 353.41 MB | Total matched: 150

 Batch 4/7 (Rows 150-199)
Memory before batch: 353.55 MB


Processing Batch 4: 100%|██████████| 50/50 [01:04<00:00,  1.28s/it]


Batch 4 complete | Memory now: 419.27 MB | Total matched: 200

 Batch 5/7 (Rows 200-249)
Memory before batch: 419.39 MB


Processing Batch 5: 100%|██████████| 50/50 [01:12<00:00,  1.46s/it]


Batch 5 complete | Memory now: 285.55 MB | Total matched: 250

 Batch 6/7 (Rows 250-299)
Memory before batch: 285.09 MB


Processing Batch 6: 100%|██████████| 50/50 [01:33<00:00,  1.86s/it]


Batch 6 complete | Memory now: 299.70 MB | Total matched: 300

 Batch 7/7 (Rows 300-334)
Memory before batch: 299.84 MB


Processing Batch 7:  31%|███▏      | 11/35 [00:35<02:10,  5.45s/it]

In [None]:
# Filter only L/R labels and map to binary
# The labels are normalised first (whitespace trimmed, upper-cased) so that entries
# such as " l" or "R " are not silently dropped by the filter. df itself is not
# modified: assign() returns a new frame.
laterality_normalized = df.assign(
    **{"Tumor laterality": df["Tumor laterality"].str.strip().str.upper()}
)
binary_df = laterality_normalized[
    laterality_normalized["Tumor laterality"].isin(["L", "R"])
].copy()
binary_df["Binary Label"] = binary_df["Tumor laterality"].map({"L": 0, "R": 1})

print("Preview of binary_df:")
print(binary_df[["dummy_id", "Tumor laterality", "Binary Label"]].head())

In [None]:
# Build one averaged image (X) and one binary label (y) per matched folder
X = []
y = []

# dummy_id -> Binary Label lookup. A plain dict leaves binary_df untouched, so this
# cell can be re-run (an in-place set_index fails on the second run because the
# "dummy_id" column is gone by then).
label_by_id = dict(zip(binary_df["dummy_id"], binary_df["Binary Label"]))

for row, images in matched_data:
    dummy_id = row["dummy_id"]

    # Skip any samples not labeled as L/R (they are absent from binary_df)
    if dummy_id not in label_by_id:
        continue

    # You can change this to any image reduction strategy you prefer.
    # Each image is (H, W, 3) because load_and_match_images converts to RGB,
    # so the mean over all images of a folder is (H, W, 3) as well.
    X.append(np.mean(images, axis=0))
    y.append(label_by_id[dummy_id])

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

if len(X) == 0:
    raise ValueError("No labelled image folders were matched; cannot build a training set.")

# Optional: Expand dims if grayscale
if X.ndim == 3:  # (samples, height, width)
    X = X[..., np.newaxis]  # → (samples, height, width, 1)

# No further "/ 255.0" here: load_and_match_images already scaled the pixels to
# [0, 1], and dividing a second time would squash every value below 0.004.

#Train-Test Splitting
# A fixed random_state makes the split (and every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Reload the CSV and print its shape plus the count of each laterality label
df = pd.read_csv(CSV_PATH)
rule = "-" * 60

print("", rule, sep="\n")
print("Dataframe shape:", df.shape)
print(rule, "Selected Parameter:", "", sep="\n")

# Normalise the labels (trim whitespace, upper-case) before counting; NaN is counted too
df["Tumor laterality"] = df["Tumor laterality"].str.strip().str.upper()
laterality_counts = df["Tumor laterality"].value_counts(dropna=False)
print(laterality_counts.to_string(), "", rule, "", sep="\n")


In [None]:
#CNN Model Defined
# The input shape is read from the training array rather than hard-coded:
# load_and_match_images produces RGB images, so X_train is (samples, H, W, 3) and a
# fixed (128, 128, 1) input would be rejected by model.fit.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train.shape[1:]),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(2)  # one raw logit per class (0 = L, 1 = R)
])

#Model Compilation with ADAM
# from_logits=True matches the final Dense(2), which has no softmax.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

#Model Summary Print
model.summary()

#How to Train Your Dragon (or model)
# The split cell defines X_train / X_test (capital X); the lowercase names used here
# before did not exist and raised NameError.
# NOTE(review): validation_data is the test split, so the "validation" curve and the
# test accuracy below are measured on the same samples.
history = model.fit(X_train, y_train, epochs=15,
                    validation_data=(X_test, y_test))

# Evaluate The Model w/ Test Data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"\nTest accuracy: {test_accuracy*100:.2f}%")

# Plot training & validation
plt.figure(figsize=(8,6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.savefig('cnn_accuracy_plot.png')  # Save the figure
plt.show()

model.save('HNSCC_CNN_Model.h5')


## Full Code

In [None]:
#Typical Libraries

import pandas as pd #Pandas for DataFrame's
import numpy as np #Numpy for math
from PIL import Image #Pillow for image processing
from pathlib import Path #Pathlib as an OS replacement for paths
import matplotlib.pyplot as plt #Matplotlib for plotting data

#Tracking progress and time

import time
from datetime import datetime #To get the current time for a timestamp
from zoneinfo import ZoneInfo #To set my timezone
from tqdm import tqdm #tdqm for progress bars

#Possible Viz Enhancements

import seaborn as sns #Seaborn for advanced plotting (tbd)
from tabulate import tabulate #Tabulate for pretty tables (tbd)

#Machine Learning with Sci-Kit Learn, TensorFlow, and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
#CNN Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator #For data augmentation

#Constants for the model
# IMAGE_SIZE is the default `size` that load_and_match_images passes to Image.resize.
IMAGE_SIZE = (128, 128)
# Root folder that process_large_folders searches for "<dummy_id>_ct_images" sub-folders.
DATASET_PATH = Path("../Data/data_images")
# CSV file read into `df` below.
CSV_PATH = Path("../Data/data_sheet.csv")

# Load the CSV, report its (rows, columns) shape and preview the first rows.
df = pd.read_csv(CSV_PATH)
print("CSV shape:", df.shape)
df.head()

import gc
import psutil
import os
from PIL import Image

def load_and_match_images(image_path, size=IMAGE_SIZE):
    """Load one image file as a normalized RGB array.

    Args:
        image_path: Path of the image file to open.
        size: Target size handed to ``Image.resize``; defaults to ``IMAGE_SIZE``.

    Returns:
        NumPy array of the resized RGB image (three channels) with every pixel
        value divided by 255.0, i.e. scaled to [0, 1].
    """
    # The context manager closes the file handle as soon as the pixels have been
    # read, instead of leaving that to garbage collection while many files are
    # processed in a row. convert() returns an independent in-memory image, so it
    # stays usable after the source file is closed.
    with Image.open(image_path) as source:
        resized = source.convert("RGB").resize(size)
    return np.array(resized) / 255.0  # Normalize to [0, 1]


def get_memory_usage():
    """Return the resident set size of this process as text, e.g. ``"188.97 MB"``."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return "{:.2f} MB".format(rss_bytes / 1024**2)

# Adaptive garbage collection if memory exceeds threshold
def adaptive_gc(threshold_gb=35):
    """Run ``gc.collect()`` when this process's resident memory exceeds ``threshold_gb``.

    Memory is measured as RSS divided by 1024**3. Nothing happens (and nothing
    is printed) while usage is at or below the threshold.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem_gb = rss_bytes / 1024**3
    if not mem_gb > threshold_gb:
        return
    print(f"RAM at {mem_gb:.2f} GB — triggering garbage collection")
    gc.collect()

def _load_folder_images(folder_path, memory_threshold_gb):
    """Load every ``*.jpg`` directly inside ``folder_path``, in sorted path order."""
    images = []
    for position, img_path in enumerate(sorted(folder_path.glob("*.jpg")), start=1):
        images.append(load_and_match_images(img_path))
        # Inside one very large folder, check memory pressure every 1000 images.
        if position % 1000 == 0:
            adaptive_gc(threshold_gb=memory_threshold_gb)
    return images


def process_large_folders(df, dataset_path, batch_size=1000, memory_threshold_gb=35):
    """Load the image folder that belongs to each row of ``df``, batch by batch.

    For every row the folder ``dataset_path / (str(row['dummy_id']) + "_ct_images")``
    is looked up and all of its ``*.jpg`` files are loaded with
    ``load_and_match_images``. Progress and memory usage are printed per batch.

    Args:
        df: DataFrame with a ``dummy_id`` column; one row per image folder.
        dataset_path: ``Path`` of the directory that contains the image folders.
        batch_size: Number of rows handled between two progress reports.
        memory_threshold_gb: Threshold forwarded to ``adaptive_gc``.

    Returns:
        List of ``(row, images)`` tuples, one per row whose folder exists and
        contains at least one ``.jpg``. ``row`` is the Series yielded by
        ``DataFrame.iterrows`` and ``images`` is a list of image arrays.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    matched_data = []
    missing_folders = 0
    empty_folders = 0
    total_rows = len(df)
    start = time.time()
    total_batches = (total_rows - 1) // batch_size + 1  # 0 when df is empty

    print("🚀 Starting Processing")
    print(f"Total rows: {total_rows} | Batch size: {batch_size} | Total batches: {total_batches}")
    print(f"Initial memory usage: {get_memory_usage()}")
    print("-" * 60)

    for batch_start in range(0, total_rows, batch_size):
        batch_end = min(batch_start + batch_size, total_rows)
        batch_df = df.iloc[batch_start:batch_end]

        batch_num = batch_start // batch_size + 1
        print(f"\n Batch {batch_num}/{total_batches} (Rows {batch_start}-{batch_end-1})")
        print(f"Memory before batch: {get_memory_usage()}")

        # NOTE(review): iterrows() builds one Series per row, so a numeric dummy_id
        # can be upcast (e.g. 12 -> 12.0) when every column is numeric — confirm
        # the folder names still match for this CSV.
        rows = tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing Batch {batch_num}")
        for idx, (_, row) in enumerate(rows):
            folder_path = dataset_path / (str(row['dummy_id']) + "_ct_images")

            if not folder_path.exists():
                missing_folders += 1
                # Only the first five rows of a batch are reported to keep output short.
                if idx < 5:
                    print(f"  ⚠️ Missing folder: {folder_path}")
                continue

            images = _load_folder_images(folder_path, memory_threshold_gb)
            if images:
                # The list is stored directly: no defensive copy and no per-folder
                # gc.collect() are needed, one collection per batch is enough.
                matched_data.append((row, images))
            else:
                empty_folders += 1

        gc.collect()

        print(f"Batch {batch_num} complete | Memory now: {get_memory_usage()} | Total matched: {len(matched_data)}")

    elapsed = time.time() - start
    print("\nPROCESSING COMPLETE")
    # max(..., 1) avoids a ZeroDivisionError when df has no rows.
    print(f"Total time: {elapsed:.2f}s | Avg per batch: {elapsed / max(total_batches, 1):.2f}s")
    print(f"Final memory usage: {get_memory_usage()} | Total matched folders: {len(matched_data)}")
    if missing_folders or empty_folders:
        print(f"Skipped rows: {missing_folders} missing folder(s), {empty_folders} folder(s) without .jpg files")
    print("=" * 60)

    return matched_data
# Run the loader over every row of df, 50 rows per batch. matched_data receives
# one (row, images) tuple for each row whose image folder contained .jpg files.
matched_data = process_large_folders(df, DATASET_PATH, batch_size=50)


# Filter only L/R labels and map to binary
# The labels are normalised first (whitespace trimmed, upper-cased) so that entries
# such as " l" or "R " are not silently dropped by the filter. df itself is not
# modified: assign() returns a new frame.
laterality_normalized = df.assign(
    **{"Tumor laterality": df["Tumor laterality"].str.strip().str.upper()}
)
binary_df = laterality_normalized[
    laterality_normalized["Tumor laterality"].isin(["L", "R"])
].copy()
binary_df["Binary Label"] = binary_df["Tumor laterality"].map({"L": 0, "R": 1})

print("Preview of binary_df:")
print(binary_df[["dummy_id", "Tumor laterality", "Binary Label"]].head())

# Build one averaged image (X) and one binary label (y) per matched folder
X = []
y = []

# dummy_id -> Binary Label lookup. A plain dict leaves binary_df untouched, so this
# code can be re-run (an in-place set_index fails on the second run because the
# "dummy_id" column is gone by then).
label_by_id = dict(zip(binary_df["dummy_id"], binary_df["Binary Label"]))

for row, images in matched_data:
    dummy_id = row["dummy_id"]

    # Skip any samples not labeled as L/R (they are absent from binary_df)
    if dummy_id not in label_by_id:
        continue

    # You can change this to any image reduction strategy you prefer.
    # Each image is (H, W, 3) because load_and_match_images converts to RGB,
    # so the mean over all images of a folder is (H, W, 3) as well.
    X.append(np.mean(images, axis=0))
    y.append(label_by_id[dummy_id])

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

if len(X) == 0:
    raise ValueError("No labelled image folders were matched; cannot build a training set.")

# Optional: Expand dims if grayscale
if X.ndim == 3:  # (samples, height, width)
    X = X[..., np.newaxis]  # → (samples, height, width, 1)

#Train-Test Splitting
# A fixed random_state makes the split (and every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Data Augmentation for CT Scans
# horizontal_flip stays off on purpose: the label is tumor laterality (L vs R), and
# mirroring an image left-to-right would swap the tumor side while the label stays
# the same, i.e. it would generate wrongly labelled samples.
# NOTE(review): datagen is not passed to any model.fit call visible in this notebook.

datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=False,
    zoom_range=0.1,
    fill_mode='nearest'
)

#Reading CSV to Print DataSet
# Reload the CSV and print its shape plus the count of each laterality label
df = pd.read_csv(CSV_PATH)
rule = "-" * 60

print("", rule, sep="\n")
print("Dataframe shape:", df.shape)
print(rule, "Selected Parameter:", "", sep="\n")

# Normalise the labels (trim whitespace, upper-case) before counting; NaN is counted too
df["Tumor laterality"] = df["Tumor laterality"].str.strip().str.upper()
laterality_counts = df["Tumor laterality"].value_counts(dropna=False)
print(laterality_counts.to_string(), "", rule, "", sep="\n")

#CNN Model Defined
# The input shape is read from the training array rather than hard-coded:
# load_and_match_images produces RGB images, so X_train is (samples, H, W, 3) and a
# fixed (128, 128, 1) input would be rejected by model.fit.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train.shape[1:]),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(2)  # one raw logit per class (0 = L, 1 = R)
])

#Model Compilation with ADAM
# from_logits=True matches the final Dense(2), which has no softmax.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

#Model Summary Print
model.summary()

#How to Train Your Dragon (or model)
# NOTE(review): validation_data is the test split, so the "validation" curve and the
# test accuracy below are measured on the same samples.
history = model.fit(X_train, y_train, epochs=15,
                    validation_data=(X_test, y_test))


# Evaluate The Model w/ Test Data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"\nTest accuracy: {test_accuracy*100:.2f}%")

# Plot training & validation
plt.figure(figsize=(8,6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.savefig('cnn_accuracy_plot.png')  # Save the figure
plt.show()

model.save('HNSCC_CNN_Model.h5')



## Updated Claude Version of the CNN

In [None]:
# Improved CNN Model with proper regularization and architecture

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Enhanced CNN Model
def create_improved_cnn_model(input_shape=(128, 128, 1)):
    """Build the (uncompiled) regularized CNN for binary classification.

    The network stacks three Conv-BatchNorm-Conv-MaxPool-Dropout blocks with 32, 64
    and 128 filters, a final 256-filter convolution reduced by global average
    pooling, and a dense head that ends in a single sigmoid unit.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D layer.

    Returns:
        A ``models.Sequential`` instance.
    """

    def double_conv_block(filters, **first_conv_kwargs):
        # Conv -> BatchNorm -> Conv -> 2x2 max-pool -> 25% dropout
        return [
            layers.Conv2D(filters, (3, 3), activation='relu', **first_conv_kwargs),
            layers.BatchNormalization(),
            layers.Conv2D(filters, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ]

    stack = []

    # Three feature-extraction blocks; only the first one declares the input shape
    stack += double_conv_block(32, input_shape=input_shape)
    stack += double_conv_block(64)
    stack += double_conv_block(128)

    # Fourth block: a single convolution collapsed by global average pooling
    stack += [
        layers.Conv2D(256, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
    ]

    # Classification head with one sigmoid output for the binary label
    stack += [
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ]

    return models.Sequential(stack)

# Create the improved model
# The input shape comes from the training array: load_and_match_images produces RGB
# images, so the function's default (128, 128, 1) would not match X_train.
model = create_improved_cnn_model(input_shape=X_train.shape[1:])

# Better compilation for binary classification
# Precision and Recall are passed as metric objects with explicit names: the bare
# strings 'precision' / 'recall' are not resolved by every Keras version, and the
# fixed names keep the history keys ('precision', 'val_recall', ...) stable when
# this code is re-run.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',  # Better for binary classification
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
)

# Add callbacks for better training
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

callbacks = [
    # Stop once val_loss has not improved for 10 epochs and roll back to the best weights
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    # Halve the learning rate after 5 epochs without val_loss improvement
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    # Keep the model file with the highest validation accuracy seen so far
    ModelCheckpoint(
        'best_hnscc_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
]

# Train with callbacks and validation split
# The callbacks pick the best epoch from the validation metrics, so validation data
# is held out of the training set (last 20% of X_train) instead of reusing
# X_test / y_test. Selecting on the test split would leak it into the model choice
# and make the test metrics reported afterwards optimistic.
history = model.fit(
    X_train, y_train,
    epochs=50,  # More epochs with early stopping
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

# Enhanced evaluation
# evaluate() returns the loss followed by the compiled metrics in order.
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=2)

# F1 is the harmonic mean of precision and recall. Both are 0 when the model never
# predicts the positive class, so the division is guarded and F1 reported as 0.
precision_recall_sum = test_precision + test_recall
if precision_recall_sum > 0:
    f1_score = 2 * (test_precision * test_recall) / precision_recall_sum
else:
    f1_score = 0.0

print("\nTest Results:")
print(f"Accuracy: {test_accuracy*100:.2f}%")
print(f"Precision: {test_precision*100:.2f}%")
print(f"Recall: {test_recall*100:.2f}%")
print(f"F1-Score: {f1_score*100:.2f}%")

# Enhanced plotting
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One panel per tracked quantity, filled row by row: (history key, display name)
metric_panels = [
    ('accuracy', 'Accuracy'),
    ('loss', 'Loss'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
]

for panel_ax, (metric_key, metric_name) in zip(axes.flat, metric_panels):
    # Training curve first, then the matching "val_" curve from the same history
    panel_ax.plot(history.history[metric_key], label=f'Training {metric_name}')
    panel_ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    panel_ax.set_title(f'Model {metric_name}')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(metric_name)
    panel_ax.legend()
    panel_ax.grid(True)

plt.tight_layout()
plt.savefig('enhanced_cnn_training_plots.png', dpi=300, bbox_inches='tight')
plt.show()