## Tumor Detection (CNN)

In [2]:
import os
import zipfile

import pandas as pd
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Step 1: Data Preparation

In [3]:
# Source archive holding the sampled images, and the directory it is unpacked into.
zip_path = "train take out 1.zip"
extract_folder = "sampled_images"

# Ensure the destination exists (no error if it already does), then extract
# every member of the archive into it.
os.makedirs(extract_folder, exist_ok=True)
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_folder)

print(f"Unzipped to {extract_folder}")

Unzipped to sampled_images


In [8]:
# List the sampled image files. Only ".tif" files are kept so that stray
# directory entries (hidden files, sub-folders, ...) are never turned into
# image ids; the path-building cell further down appends ".tif" to every id.
# Sorting makes the listing order deterministic across runs and platforms.
# NOTE(review): the archive is extracted into "sampled_images", but this cell
# reads 'train take out 1' — confirm that folder exists on a fresh kernel run.
sampled_filenames = sorted(
    f for f in os.listdir('train take out 1') if f.endswith(".tif")
)
# Remove file extensions from sampled image names
sampled_ids = [os.path.splitext(f)[0] for f in sampled_filenames]
print(f"Total sampled images: {len(sampled_filenames)}")

Total sampled images: 1000


In [10]:
# Read the complete label table and normalise the ids by trimming any
# surrounding whitespace.
labels_df = pd.read_csv("train_labels.csv").assign(
    id=lambda frame: frame['id'].str.strip()
)

# Keep only the rows whose id belongs to one of the sampled images.
is_sampled = labels_df['id'].isin(sampled_ids)
filtered_labels = labels_df.loc[is_sampled]

print(f"Labels matched: {len(filtered_labels)}")


Labels matched: 1000


In [11]:
# Write the filtered label table to disk; the row index is left out of the file.
filtered_labels.to_csv(path_or_buf="sampled_labels.csv", index=False)
print("Saved sampled_labels.csv!")

Saved sampled_labels.csv!


In [13]:
# Reload the saved sample labels and show how many images fall in each class.
df = pd.read_csv(filepath_or_buffer='sampled_labels.csv')
df.loc[:, 'label'].value_counts()

0    596
1    404
Name: label, dtype: int64

In [15]:
# Folder the sampled image files are read from (same folder that was listed
# when the sampled ids were collected).
image_dir = "train take out 1"

# Image files are named "<id>.tif".
df['filename'] = df['id'] + ".tif"

# Add the full image path
df['image_path'] = df['filename'].map(lambda name: os.path.join(image_dir, name))

# Check it worked. The label column is referenced directly: a fallback to
# df.columns[-1] would pick 'image_path' here (it is now the last column),
# and the class-count cell above already requires a 'label' column.
print(df[['id', 'label', 'image_path']].head())


                                         id  label  \
0  c363267f3311039b685f0fec2311060e5f7f93f6      0   
1  00b25ec6689474e5d5a471e50094ccf529e4173d      0   
2  573fbded852b0ba1fb7361247814ccf3cff73f38      0   
3  ffe632addae0b0846dcc4f60d715cecefdca0983      1   
4  ffcacb05e57c4a333fccb2744298b9275a9f79c4      1   

                                          image_path  
0  train take out 1/c363267f3311039b685f0fec23110...  
1  train take out 1/00b25ec6689474e5d5a471e50094c...  
2  train take out 1/573fbded852b0ba1fb7361247814c...  
3  train take out 1/ffe632addae0b0846dcc4f60d715c...  
4  train take out 1/ffcacb05e57c4a333fccb2744298b...  


### Step 2: Set Up Data Generators (TensorFlow/Keras)

In [17]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42  # fixes the shuffling order of the training batches
label_col = 'label'  # change if yours is different (e.g. 'diagnosis')

# flow_from_dataframe with class_mode='binary' expects string class labels,
# so convert the configured label column (not a hard-coded 'label').
df[label_col] = df[label_col].astype(str)

# Scale pixel values to [0, 1] and reserve 20% of the rows for validation.
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Arguments shared by the training and validation generators, kept in one
# place so the two subsets cannot drift apart.
common_flow_args = dict(
    dataframe=df,
    x_col='image_path',
    y_col=label_col,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',   # or 'categorical' if more than 2 classes
)

# Training batches are shuffled, with a fixed seed for reproducible runs.
train_gen = datagen.flow_from_dataframe(
    subset='training',
    shuffle=True,
    seed=SEED,
    **common_flow_args
)

# Validation batches keep the dataframe order so that evaluation is
# deterministic and predictions line up with the generator's labels.
val_gen = datagen.flow_from_dataframe(
    subset='validation',
    shuffle=False,
    **common_flow_args
)


Found 800 validated image filenames belonging to 2 classes.
Found 200 validated image filenames belonging to 2 classes.


### Step 3: Define and Compile a Simple CNN 

In [20]:
# Small CNN: two convolution/max-pooling stages followed by a dense
# classifier head with a single sigmoid unit.
model = models.Sequential([
    # Input size follows IMG_SIZE (the generators' target_size) so the two
    # cannot drift apart; 3 channels assumes the generators' default RGB mode.
    layers.Input(shape=(*IMG_SIZE, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),  # binary output
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 222, 222, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 111, 111, 32)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 109, 109, 64)      18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 54, 54, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 186624)            0         
_________________________________________________________________
dense (Dense)                (None, 64)                11944000  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        

### Step 4: Train the Model

In [24]:
# Fit the model on the training generator for three epochs, passing the
# validation generator as validation data; the returned History is kept.
history = model.fit(
    x=train_gen,
    epochs=3,
    validation_data=val_gen,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 25 steps, validate for 7 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


### Step 5: Evaluate and Save

In [25]:
# Score the trained model on the validation generator and report accuracy.
loss, acc = model.evaluate(x=val_gen)
print(f"Validation Accuracy: {acc:.4f}")

# Persist the trained model to an HDF5 file.
model.save(filepath="tumor_cnn_model.h5")

  ...
    to  
  ['...']
Validation Accuracy: 0.6550
