In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

# Import Keras utilities for image handling and model building
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout

# Import ResNet50 for Transfer Learning
from tensorflow.keras.applications import ResNet50

# CRITICAL: Import ResNet-specific preprocessing for ImageNet normalization
from tensorflow.keras.applications.resnet50 import preprocess_input 

# Callbacks for advanced training control
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [3]:
# 1. CONFIGURATION AND PATHS ---
IMAGE_SIZE = (384, 384)  # High resolution input for better feature extraction (Higher AUC potential)
BATCH_SIZE = 16          # Reduced batch size accommodates higher image resolution
MAX_EPOCHS = 30 
MODEL_NAME = 'best_tb_resnet50_auc.h5' # File to save the best model weights

In [4]:
# Set your correct absolute path here based on file structure
BASE_DIR = 'C:/Users/Lakshya Gupta/Downloads/amalgogem/' 
TRAIN_DIR = os.path.join(BASE_DIR, 'train')
VAL_DIR = os.path.join(BASE_DIR, 'val')
TEST_DIR = os.path.join(BASE_DIR, 'test')
TRAIN_CSV = os.path.join(BASE_DIR, 'train_labels.csv')
VAL_CSV = os.path.join(BASE_DIR, 'val_labels.csv')

In [5]:
# 2. DATA LOADING & PREPARATION
try:
    train_df = pd.read_csv(TRAIN_CSV)
    val_df = pd.read_csv(VAL_CSV)
except FileNotFoundError as e:
    print(f"FATAL ERROR: Could not find CSV files. Path: {e}")
    exit()

# Rename columns to match the flow_from_dataframe expected arguments
train_df.rename(columns={'image_id': 'filename', 'label': 'class'}, inplace=True)
val_df.rename(columns={'image_id': 'filename', 'label': 'class'}, inplace=True)

In [6]:
# Function to ensure filenames have the correct extension (CRITICAL for matching files on disk)
def add_extension(df):
    if not df['filename'].iloc[0].lower().endswith(('.png', '.jpg', '.jpeg')):
        df['filename'] = df['filename'].astype(str) + '.png' # Assuming PNG based on previous observation
    return df
train_df = add_extension(train_df)
val_df = add_extension(val_df)

In [7]:
# 3. CLASS WEIGHTS (Mitigating Data Imbalance) ---
# FIX: Uses numerical labels (0 and 1) confirmed from CSV content
total_samples = len(train_df)
tb_count = len(train_df[train_df['class'] == 1]) # Count TB cases (Minority Class)
normal_count = len(train_df[train_df['class'] == 0]) # Count Normal cases (Majority Class)

if tb_count == 0 or normal_count == 0:
    # Fallback weights if data is corrupted
    class_weights = {0: 1.0, 1: 1.0}
else:
    # Weights penalize errors on the minority class (TB) more heavily
    weight_for_normal = (1 / normal_count) * (total_samples / 2.0)
    weight_for_tb = (1 / tb_count) * (total_samples / 2.0)
    class_weights = {0: weight_for_normal, 1: weight_for_tb}

print(f"Calculated Class Weights: {class_weights}")

Calculated Class Weights: {0: 0.6, 1: 3.0}


In [8]:
# 4. IMAGE GENERATORS (Forced RGB and ImageNet Preprocessing) ---

# FIX: Removed simple rescaling (1./255). Using ResNet's dedicated preprocessing function.

train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input, # 🔑 CRITICAL: Normalizes pixels using ImageNet mean/std
    rotation_range=15, width_shift_range=0.1,
    height_shift_range=0.1, horizontal_flip=True, fill_mode='nearest'
)

In [9]:
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input) 

In [10]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df, directory=TRAIN_DIR, x_col='filename', y_col='class',
    target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='raw', 
    color_mode='rgb' # FIX: Ensures 3-channel input for ResNet50 weights
)
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df, directory=VAL_DIR, x_col='filename', y_col='class',
    target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='raw', color_mode='rgb'
)

Found 2688 validated image filenames.
Found 672 validated image filenames.


In [11]:
# Setup Test DF

test_files = [f for f in os.listdir(TEST_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
test_df = pd.DataFrame({'filename': test_files})
test_df = add_extension(test_df)

In [12]:
# 5. MODEL CREATION (ResNet50 Transfer Learning) 
def create_resnet_model(input_shape):
    # Load ResNet50 pre-trained on ImageNet weights
    base_model = ResNet50(
        weights='imagenet', # 🔑 Using pre-trained weights for feature extraction
        include_top=False, # Discard the original classification head
        input_shape=input_shape # Explicitly set to (H, W, 3)
    )
    # Phase 1: Freeze base layers (only the new layers will train quickly)
    base_model.trainable = False

    model = Sequential([
        base_model,
        GlobalAveragePooling2D(), # Reduces spatial complexity
        Dense(512, activation='relu'),
        Dropout(0.5), # Regularization to prevent overfitting
        Dense(1, activation='sigmoid') # Final output: Probability of TB (0 to 1)
    ])
    return model

In [13]:
model = create_resnet_model(input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 0us/step


In [14]:
# 6. CALLBACKS (Focus on AUC) 

# Goal: Monitor 'val_auc' (competition metric) and maximize it (mode='max')

checkpoint = ModelCheckpoint(filepath=MODEL_NAME, monitor='val_auc', mode='max', save_best_only=True, verbose=1)
early_stop = EarlyStopping(monitor='val_auc', patience=7, mode='max', restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_auc', patience=4, factor=0.5, verbose=1, mode='max', min_lr=1e-6)
callbacks_list = [checkpoint, early_stop, reduce_lr]

In [15]:
#  7. MODEL TRAINING (Two Phases for Optimal Performance)

# Phase 1: Train the Head (Initial high Learning Rate)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
print("\n--- Phase 1: Training Head (Frozen Backbone) ---")
history = model.fit(train_generator, validation_data=val_generator, epochs=5, class_weight=class_weights, callbacks=callbacks_list, verbose=1)

# Phase 2: Fine-Tuning (Unfreeze and Train Entire Model)

model.trainable = True # Unlock base layers
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), # 🔑 Very low LR prevents destroying pre-trained weights
              loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
print("\n--- Phase 2: Fine-Tuning Entire Model ---")
model.fit(train_generator, validation_data=val_generator, epochs=MAX_EPOCHS, initial_epoch=history.epoch[-1], class_weight=class_weights, callbacks=callbacks_list, verbose=1)

# Load the best model weights based on validation AUC from the saved .h5 file

model.load_weights(MODEL_NAME)
print(f"\nLoaded best model weights from {MODEL_NAME}")


--- Phase 1: Training Head (Frozen Backbone) ---


  self._warn_if_super_not_called()


Epoch 1/5
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - auc: 0.8976 - loss: 0.5804
Epoch 1: val_auc improved from None to 0.99910, saving model to best_tb_resnet50_auc.h5




[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 2s/step - auc: 0.9645 - loss: 0.2637 - val_auc: 0.9991 - val_loss: 0.0329 - learning_rate: 0.0010
Epoch 2/5
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - auc: 0.9874 - loss: 0.1417
Epoch 2: val_auc improved from 0.99910 to 0.99937, saving model to best_tb_resnet50_auc.h5




[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 2s/step - auc: 0.9887 - loss: 0.1325 - val_auc: 0.9994 - val_loss: 0.0279 - learning_rate: 0.0010
Epoch 3/5
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - auc: 0.9980 - loss: 0.0554
Epoch 3: val_auc improved from 0.99937 to 0.99971, saving model to best_tb_resnet50_auc.h5




[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 2s/step - auc: 0.9970 - loss: 0.0616 - val_auc: 0.9997 - val_loss: 0.0274 - learning_rate: 0.0010
Epoch 4/5
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 906ms/step - auc: 0.9960 - loss: 0.0809
Epoch 4: val_auc did not improve from 0.99971
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 1s/step - auc: 0.9972 - loss: 0.0618 - val_auc: 0.9996 - val_loss: 0.0202 - learning_rate: 0.0010
Epoch 5/5
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890ms/step - auc: 0.9991 - loss: 0.0323
Epoch 5: val_auc did not improve from 0.99971
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 1s/step - auc: 0.9988 - loss: 0.0428 - val_auc: 0.9997 - val_loss: 0.0161 - learning_rate: 0.0010

--- Phase 2: Fine-Tuning Entire Model ---
Epoch 5/30
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884ms/step - auc: 0.9985 - loss: 0.0626
Epoch 5: val_auc i



[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 1s/step - auc: 0.9984 - loss: 0.0551 - val_auc: 0.9997 - val_loss: 0.0185 - learning_rate: 1.0000e-05
Epoch 6/30
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881ms/step - auc: 0.9991 - loss: 0.0364
Epoch 6: val_auc did not improve from 0.99972
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 1s/step - auc: 0.9977 - loss: 0.0427 - val_auc: 0.9997 - val_loss: 0.0192 - learning_rate: 1.0000e-05
Epoch 7/30
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881ms/step - auc: 0.9993 - loss: 0.0364
Epoch 7: val_auc did not improve from 0.99972
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 1s/step - auc: 0.9993 - loss: 0.0355 - val_auc: 0.9997 - val_loss: 0.0190 - learning_rate: 1.0000e-05
Epoch 8/30
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885ms/step - auc: 0.9990 - loss: 0.0351
Epoch 8: val_auc did not improve from 0.99972



In [16]:
# 8. PREDICTION WITH TEST-TIME AUGMENTATION (TTA) ---

def predict_with_tta(model, test_df, test_dir, n_tta):
    """
    TTA is a technique to boost accuracy by averaging predictions 
    from N augmented versions of each test image.
    """
    tta_preds = []
    
    # TTA Generator uses simple augmentations
    tta_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input, 
        rotation_range=15, 
        horizontal_flip=True, 
        fill_mode='nearest'
    )

    # Run predictions N times
    for i in range(n_tta):
        print(f"Generating TTA run {i+1}/{n_tta}...")
        tta_generator = tta_datagen.flow_from_dataframe(
            dataframe=test_df, directory=test_dir, x_col='filename', y_col=None, 
            target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode=None, 
            color_mode='rgb', shuffle=False
        )
        tta_generator.reset() 
        preds = model.predict(tta_generator, verbose=1).flatten()
        tta_preds.append(preds)
    
    # Final probability is the mean of all TTA runs
    avg_preds = np.mean(tta_preds, axis=0)
    return avg_preds

N_TTA = 8 # Recommended number of augmentations
print("\n--- Starting Predictions with TTA (N=8) ---")
final_probabilities = predict_with_tta(model, test_df, TEST_DIR, N_TTA)


--- Starting Predictions with TTA (N=8) ---
Generating TTA run 1/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 885ms/step
Generating TTA run 2/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 885ms/step
Generating TTA run 3/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 901ms/step
Generating TTA run 4/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 865ms/step
Generating TTA run 5/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 878ms/step
Generating TTA run 6/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 851ms/step
Generating TTA run 7/8...
Found 840 validated image filenames.
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4

In [22]:
# --- 9. SUBMISSION FILE GENERATION (Final Format Fix) ---
submission_df = pd.DataFrame({
    # CRITICAL FIX: Strips extension and 'test' prefix, then converts to required integer ID.
    'image_id': (
        test_df['filename']
        .str.replace(r'\.(png|jpg|jpeg)$', '', regex=True) # Strip extension
        .str.replace('test', '', regex=False)              # Strip 'test' prefix (e.g., 'test1' -> '1')
        .astype(int)                                       # Convert to integer ID (e.g., 1, 2, 100)
    ),
    'label': final_probabilities
})

SUBMISSION_FILE = os.path.join(BASE_DIR, 'submission_final_ready_3.csv') 
submission_df.to_csv(SUBMISSION_FILE, index=False)

print(f"\n✅ Final submission file '{SUBMISSION_FILE}' generated successfully. The image IDs are now numerical (e.g., 1, 2, 3).")


✅ Final submission file 'C:/Users/Lakshya Gupta/Downloads/amalgogem/submission_final_ready_3.csv' generated successfully. The image IDs are now numerical (e.g., 1, 2, 3).


In [23]:
# --- 9. SUBMISSION FILE GENERATION (MATCHING VISUAL EXAMPLE) ---

# CRITICAL STEP: Convert the predicted probabilities into hard binary labels (0 or 1)
# This uses the standard 0.5 threshold.
final_binary_labels = np.where(final_probabilities > 0.5, 1, 0)


submission_df = pd.DataFrame({
    # Use full filename, matching the submission_example.csv image
    'image_id': test_df['filename'], 
    # Use the new binary labels (0 or 1) to match the visual example
    'label': final_binary_labels
})

SUBMISSION_FILE = os.path.join(BASE_DIR, 'submission_binary_match.csv') 
submission_df.to_csv(SUBMISSION_FILE, index=False)

print(f"\n✅ Final submission file '{SUBMISSION_FILE}' generated successfully. The labels are now binary (0 or 1).")


✅ Final submission file 'C:/Users/Lakshya Gupta/Downloads/amalgogem/submission_binary_match.csv' generated successfully. The labels are now binary (0 or 1).
