## Tumor Detection (CNN)

In [1]:
import zipfile
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow.keras.backend as K
K.clear_session()

In [2]:
from tensorflow.keras import layers, models

In [3]:
import tensorflow as tf

### Step1: Data preperation

In [4]:
# Read Excel
labels_df = pd.read_excel("app_data.xlsx")

#### 1.1 Found empty label data rows, clean them.

In [5]:

# Clean the Diagnosis column
labels_df['Diagnosis'] = labels_df['Diagnosis'].str.strip().str.lower()
labels_df['label'] = labels_df['Diagnosis'].map({
    'appendicitis': 1,
    'not appendicitis': 0
})

# Drop rows missing US_Number or Diagnosis
labels_df = labels_df.dropna(subset=['US_Number', 'Diagnosis'])

# Clean US_Number to string (remove decimals, spaces)
labels_df['US_Number'] = labels_df['US_Number'].astype(str).str.strip().str.split('.').str[0]

# DEBUG: Check unique values
print("🧠 Unique patient IDs in Excel (cleaned):", labels_df['US_Number'].nunique())

# Create label_dict
label_dict = dict(zip(labels_df['US_Number'], labels_df['label']))

🧠 Unique patient IDs in Excel (cleaned): 759


#### 1.2 Found unmatching images, ignore them

In [6]:
image_folder = "B_mode_ultrasound"
image_data = []
unmatched = []

for img_name in os.listdir(image_folder):
    if img_name.endswith(('.png', '.bmp')):
        patient_id = img_name.split('.')[0].strip()  # '123' from '123.1 blah.bmp'
        label = label_dict.get(patient_id)

        if label is None:
            unmatched.append(patient_id)  # Collect for review

        image_data.append({
            "image_path": os.path.join(image_folder, img_name),
            "patient_id": patient_id,
            "label": label
        })

image_df = pd.DataFrame(image_data)

# Save and review unmatched
pd.Series(unmatched).value_counts().to_csv("unmatched_patient_ids.csv")
print(f"🚨 Total unmatched: {len([l for l in image_df['label'] if pd.isna(l)])}")


🚨 Total unmatched: 756


In [7]:
print(image_df[image_df['label'].isna()])

                                       image_path patient_id  label
5                 B_mode_ultrasound/585.2 App.bmp        585    NaN
6              B_mode_ultrasound/3.3 Appendix.png          3    NaN
8                 B_mode_ultrasound/747.2 App.bmp        747    NaN
11                   B_mode_ultrasound/102.11.bmp        102    NaN
12              B_mode_ultrasound/412.1 App_M.bmp        412    NaN
...                                           ...        ...    ...
2081              B_mode_ultrasound/839.1 RLQ.bmp        839    NaN
2083              B_mode_ultrasound/368.2 App.bmp        368    NaN
2088              B_mode_ultrasound/824.5 App.bmp        824    NaN
2091              B_mode_ultrasound/841.6 App.bmp        841    NaN
2092  B_mode_ultrasound/736.1 RLQ app not vis.bmp        736    NaN

[756 rows x 3 columns]


In [8]:
matched_df = image_df[image_df['label'].notna()]
matched_df.to_csv("final_labeled_ultrasound_images.csv", index=False)


In [9]:
matched_df

Unnamed: 0,image_path,patient_id,label
0,B_mode_ultrasound/114.2 App_M.bmp,114,1.0
1,B_mode_ultrasound/189.6 App_M.bmp,189,1.0
2,B_mode_ultrasound/69.1 App T.bmp,69,1.0
3,B_mode_ultrasound/86.2 App T.bmp,86,1.0
4,B_mode_ultrasound/168.5 App.bmp,168,1.0
...,...,...,...
2087,B_mode_ultrasound/111.4 App_M.bmp,111,1.0
2089,B_mode_ultrasound/927.5 App_M.bmp,927,1.0
2090,B_mode_ultrasound/9.2 Appendix.bmp,9,1.0
2093,B_mode_ultrasound/488.3 App_M.bmp,488,1.0


#### 1.3 Train test split

In [10]:
from sklearn.model_selection import train_test_split

unique_patients = matched_df['patient_id'].unique()
train_patients, test_patients = train_test_split(unique_patients, test_size=0.2, random_state=42)

train_df = matched_df[matched_df['patient_id'].isin(train_patients)]
val_df = matched_df[matched_df['patient_id'].isin(test_patients)]

In [11]:
assert set(train_patients).isdisjoint(set(test_patients)), "Train/test patient sets overlap."


In [12]:
train_df

Unnamed: 0,image_path,patient_id,label
1,B_mode_ultrasound/189.6 App_M.bmp,189,1.0
2,B_mode_ultrasound/69.1 App T.bmp,69,1.0
4,B_mode_ultrasound/168.5 App.bmp,168,1.0
9,B_mode_ultrasound/947.4 App_M.bmp,947,1.0
10,B_mode_ultrasound/65.4 App_Color doppler_incre...,65,1.0
...,...,...,...
2087,B_mode_ultrasound/111.4 App_M.bmp,111,1.0
2089,B_mode_ultrasound/927.5 App_M.bmp,927,1.0
2090,B_mode_ultrasound/9.2 Appendix.bmp,9,1.0
2093,B_mode_ultrasound/488.3 App_M.bmp,488,1.0


### Step 2: Set Up Data Generators (TensorFlow/Keras)

In [13]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

IMG_SIZE = (224, 224)  # or (128, 128) depending on resolution
BATCH_SIZE = 32

def preprocess_image(file_path, label):
    image = tf.io.read_file(file_path)

    # Decode based on file extension
    file_ext = tf.strings.split(file_path, '.')[-1]
    image = tf.case([
        (tf.equal(file_ext, 'bmp'), lambda: tf.image.decode_bmp(image, channels=3)),
        (tf.equal(file_ext, 'png'), lambda: tf.image.decode_png(image, channels=3)),
    ], default=lambda: tf.image.decode_jpeg(image, channels=3))  # fallback

    image = tf.image.resize(image, (224, 224))
    image = image / 255.0
    return image, label



def df_to_dataset(df, shuffle=True):
    file_paths = df['image_path'].values
    labels = df['label'].values.astype('float32')
    ds = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    ds = ds.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    return ds.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Create datasets
train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df, shuffle=False)


### Step 3: Define and Compile a Simple CNN 

In [14]:

model = models.Sequential([
    layers.Input(shape=(224, 224, 3)),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Binary classification
])


### Step 4: Train a model

In [15]:

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_ds, validation_data=val_ds, epochs=5)


Train for 34 steps, validate for 9 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Step 5: Evaluate and Save

In [16]:
model.evaluate(val_ds)
#model.save("appendicitis_cnn_model.h5")



[0.0, 1.0]