In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Paths to the directories and CSV file.
# Each location can be overridden with an environment variable so the notebook
# also runs on machines where the data lives elsewhere; when the variables are
# unset the original hard-coded locations are used unchanged.
segmented_masks_path = os.environ.get("CANCER_MASKS_DIR", "D:\\CANCER-DETECTION\\masks")
csv_file_path = os.environ.get("CANCER_CSV_PATH", "D:\\CANCER-DETECTION\\output.csv")

# Load the CSV file (later code reads its 'File Name' and 'Disease' columns)
patient_data = pd.read_csv(csv_file_path)

# Function to load segmented mask images
def load_segmented_masks(segmented_masks_path, target_size=(256, 256)):
    """Load every ``*_mask.png`` image in a directory as a scaled array.

    Files are processed in sorted name order so the returned data (and any
    seeded train/test split built from it) is the same on every run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        segmented_masks_path: Directory that contains the mask images.
        target_size: Size forwarded to ``load_img`` as ``target_size``.

    Returns:
        A tuple ``(masks, filenames)``. ``masks`` is a NumPy array stacking
        the loaded grayscale masks, each divided by 255.0. ``filenames`` is a
        list with one entry per mask: the mask's file name with its trailing
        ``_mask.png`` swapped for ``.txt``.
    """
    mask_suffix = '_mask.png'
    masks = []
    filenames = []

    mask_files = sorted(
        f for f in os.listdir(segmented_masks_path) if f.endswith(mask_suffix)
    )

    for mask_file in mask_files:
        mask_path = os.path.join(segmented_masks_path, mask_file)
        mask = load_img(mask_path, color_mode='grayscale', target_size=target_size)
        mask = img_to_array(mask)

        # Normalize
        mask = mask / 255.0

        masks.append(mask)
        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier occurrence of '_mask.png' inside the name.
        filenames.append(mask_file[:-len(mask_suffix)] + '.txt')

    return np.array(masks), filenames

# Load every mask together with the CSV-style name derived from its file name
masks, mask_filenames = load_segmented_masks(segmented_masks_path)

# Create a DataFrame for the mask filenames
mask_df = pd.DataFrame(mask_filenames, columns=['File Name'])

# Ensure the 'File Name' column in patient_data matches the format of mask_filenames.
# A literal (non-regex) vectorised replace is used; unlike apply/lambda it
# does not fail on missing values.
patient_data['File Name'] = patient_data['File Name'].str.replace('.png', '', regex=False)

# Debugging: Find mask filenames without corresponding entries in the CSV
mask_filenames_set = set(mask_filenames)
csv_filenames_set = set(patient_data['File Name'])

missing_in_csv = mask_filenames_set - csv_filenames_set
missing_in_masks = csv_filenames_set - mask_filenames_set

print(f"Number of mask files missing in CSV: {len(missing_in_csv)}")
print(f"Number of CSV entries missing in masks: {len(missing_in_masks)}")

if len(missing_in_csv) > 0:
    print("Mask files missing in CSV:")
    print(missing_in_csv)

if len(missing_in_masks) > 0:
    print("CSV entries missing in masks:")
    print(missing_in_masks)

# Remove inconsistencies.
# The image array must be filtered with the same row selection as mask_df;
# otherwise masks and labels no longer line up once any mask lacks a CSV row.
has_csv_entry = mask_df['File Name'].isin(csv_filenames_set)
mask_df = mask_df[has_csv_entry]
masks = masks[has_csv_entry.to_numpy()]
patient_data = patient_data[patient_data['File Name'].isin(mask_filenames_set)]

# Merge the mask DataFrame with the patient data CSV on the file name.
# An inner merge keeps the order of the left frame's keys, so as long as each
# file name occurs once in the CSV, row i of merged_data describes masks[i].
merged_data = pd.merge(mask_df, patient_data, on='File Name', how='inner')

# Turn the disease names into integer class ids
from sklearn.preprocessing import LabelEncoder

labels = merged_data['Disease']
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Every mask needs exactly one label before the data is split
assert len(labels_encoded) == len(masks), "Mismatch in the number of masks and labels"

# Hold out 20% of the samples for testing, with a fixed seed for the split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    masks,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

# One-hot encode both label vectors using the same number of classes
y_train, y_test = (
    tf.keras.utils.to_categorical(y, num_classes=len(label_encoder.classes_))
    for y in (y_train, y_test)
)

# Define a simple CNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

def build_cnn(input_shape, num_classes):
    """Build an uncompiled Sequential CNN classifier.

    The network is three Conv2D (3x3, ReLU) + 2x2 MaxPooling2D stages with
    32, 64 and 128 filters, followed by Flatten, a 128-unit ReLU Dense layer
    and a softmax Dense layer with one unit per class.

    Args:
        input_shape: Shape of a single input sample, without the batch axis.
        num_classes: Number of units in the final softmax layer.

    Returns:
        The assembled ``Sequential`` model; the caller compiles it.
    """
    model = Sequential()
    # Declare the input with an explicit Input object. Passing input_shape to
    # the first layer of a Sequential model is deprecated in Keras 3 and
    # emits a UserWarning.
    model.add(tf.keras.Input(shape=input_shape))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))

    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))

    model.add(Dense(num_classes, activation='softmax'))
    return model

# Seed Python, NumPy and TensorFlow before the model is created so weight
# initialisation and batch shuffling are repeatable, in line with the fixed
# random_state already used for the train/test split.
tf.keras.utils.set_random_seed(42)

# Take the per-sample shape from the data instead of hard-coding it, so it
# always agrees with the size the masks were loaded at.
input_shape = X_train.shape[1:]
num_classes = len(label_encoder.classes_)
model = build_cnn(input_shape, num_classes)
# NOTE(review): Precision() and Recall() are computed over all one-hot output
# entries with their default threshold; the recorded run reports them equal to
# accuracy, so they add no per-class information here.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
# NOTE(review): the test split is also passed as validation data, so the
# evaluate() call below re-scores data that was already monitored during
# training rather than a separate hold-out set.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


# evaluate() returns the loss followed by the compiled metrics, in order
results = model.evaluate(X_test, y_test)
test_loss, test_accuracy, test_precision, test_recall = results[:4]

print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Test Precision: {test_precision:.4f}')
print(f'Test Recall: {test_recall:.4f}')

Number of mask files missing in CSV: 0
Number of CSV entries missing in masks: 0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 932ms/step - accuracy: 0.5978 - loss: 1.9296 - precision_1: 0.5978 - recall_1: 0.5978 - val_accuracy: 0.5439 - val_loss: 0.6858 - val_precision_1: 0.5439 - val_recall_1: 0.5439
Epoch 2/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 454ms/step - accuracy: 0.6668 - loss: 0.6582 - precision_1: 0.6668 - recall_1: 0.6668 - val_accuracy: 0.6140 - val_loss: 0.7080 - val_precision_1: 0.6140 - val_recall_1: 0.6140
Epoch 3/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 527ms/step - accuracy: 0.6137 - loss: 0.6733 - precision_1: 0.6137 - recall_1: 0.6137 - val_accuracy: 0.6754 - val_loss: 0.5980 - val_precision_1: 0.6754 - val_recall_1: 0.6754
Epoch 4/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 488ms/step - accuracy: 0.7295 - loss: 0.5626 - precision_1: 0.7295 - recall_1: 0.7295 - val_accuracy: 0.6404 - val_loss: 0.6372 - val_precision_1: 0.6404 - val_recall_

In [4]:
# Show the raw evaluate() output: [loss, accuracy, precision, recall]
results

[1.4804033041000366,
 0.7280701994895935,
 0.7280701994895935,
 0.7280701994895935]