In [1]:
import os
import numpy as np
import cv2
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical
import albumentations as A

In [4]:
##Put 0,1,2,3 into benign and rest in malignant..
import pandas as pd 
#dataset 
data = {
    "Category": [
        "Likely Benign", "Unknown", "2 - Malignant (High Risk)",
        "2 - Malignant (Very High Risk)", "1 - Benign", "2 - Malignant (Confirmed Cancer)"
    ],
    "Count": [99, 92, 75, 56, 38, 30]
}

df = pd.DataFrame(data)

# Define category mapping
benign_labels = ["Likely Benign", "Unknown", "1 - Benign"]  # Categories for "Benign"
malignant_labels = ["2 - Malignant (High Risk)", "2 - Malignant (Very High Risk)", "2 - Malignant (Confirmed Cancer)"]  # Categories for "Malignant"

# Create new column "Diagnosis"
df["Diagnosis"] = df["Category"].apply(lambda x: "Benign" if x in benign_labels else "Malignant")

# Display updated DataFrame
print(df)

                           Category  Count  Diagnosis
0                     Likely Benign     99     Benign
1                           Unknown     92     Benign
2         2 - Malignant (High Risk)     75  Malignant
3    2 - Malignant (Very High Risk)     56  Malignant
4                        1 - Benign     38     Benign
5  2 - Malignant (Confirmed Cancer)     30  Malignant


In [5]:
def load_images_from_csv(df, folder, target_size=(128, 128)):
    images, labels = [], []
    for _, row in df.iterrows():
        img_path = os.path.join(folder, row['image_Filename'])
        if os.path.exists(img_path):
            img = cv2.imread(img_path)
            img = cv2.resize(img, target_size)
            images.append(img)
            labels.append(row['Category'])
    return np.array(images), np.array(labels)

In [6]:
def augment_image(image):
    augmentations = [
        A.HorizontalFlip(p=1), 
        A.RandomBrightnessContrast(p=1),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=1),
        A.GaussNoise(var_limit=(10.0, 50.0), p=1),
    ]
    aug = random.choice(augmentations)
    augmented = aug(image=image)['image']
    return augmented

In [7]:

def extract_features(images):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    model = Model(inputs=base_model.input, outputs=base_model.output)
    features = model.predict(images)
    return features.reshape(features.shape[0], -1)

In [8]:
csv_path = 'output_final.csv'
image_folder = 'thyroid (1)'

import pandas as pd
df = pd.read_csv(csv_path)

In [9]:
df 

Unnamed: 0,Filename,Category
0,1.xml,Unknown
1,10.xml,2 - Malignant (High Risk)
2,100.xml,Likely Benign
3,101.xml,2 - Malignant (Confirmed Cancer)
4,102.xml,2 - Malignant (High Risk)
...,...,...
385,95.xml,Likely Benign
386,96.xml,Unknown
387,97.xml,Unknown
388,98.xml,Likely Benign


In [10]:
df['image_Filename'] = df['Filename'].str.replace('.xml', '').apply(lambda x: x + '_1.jpg')

In [11]:
df

Unnamed: 0,Filename,Category,image_Filename
0,1.xml,Unknown,1_1.jpg
1,10.xml,2 - Malignant (High Risk),10_1.jpg
2,100.xml,Likely Benign,100_1.jpg
3,101.xml,2 - Malignant (Confirmed Cancer),101_1.jpg
4,102.xml,2 - Malignant (High Risk),102_1.jpg
...,...,...,...
385,95.xml,Likely Benign,95_1.jpg
386,96.xml,Unknown,96_1.jpg
387,97.xml,Unknown,97_1.jpg
388,98.xml,Likely Benign,98_1.jpg


In [12]:
x, y = load_images_from_csv(df, image_folder)

In [13]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

augment_images, augment_labels = [], []
for img, label in zip(x, y_encoded):
    for _ in range(3):
        augment_images.append(augment_image(img))
        augment_labels.append(label)

  original_init(self, **validated_kwargs)
  A.GaussNoise(var_limit=(10.0, 50.0), p=1),


In [14]:
x.shape

(62, 128, 128, 3)

In [15]:
x_aug = np.array(augment_images)
y_aug = np.array(augment_labels)

In [16]:
x_combined = np.vstack((x, x_aug))
y_combined = np.hstack((y_encoded, y_aug))


In [17]:
x_combined.shape


(248, 128, 128, 3)

In [18]:
x_features = extract_features(x_combined)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 0us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 664ms/step


In [19]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

x_resampled, y_resampled = smote.fit_resample(x_features, y_combined)

In [20]:
y_final = to_categorical(y_resampled)

In [21]:
y_final

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [22]:
df['name'] = df['Filename'].str.replace('.xml', '')
df

Unnamed: 0,Filename,Category,image_Filename,name
0,1.xml,Unknown,1_1.jpg,1
1,10.xml,2 - Malignant (High Risk),10_1.jpg,10
2,100.xml,Likely Benign,100_1.jpg,100
3,101.xml,2 - Malignant (Confirmed Cancer),101_1.jpg,101
4,102.xml,2 - Malignant (High Risk),102_1.jpg,102
...,...,...,...,...
385,95.xml,Likely Benign,95_1.jpg,95
386,96.xml,Unknown,96_1.jpg,96
387,97.xml,Unknown,97_1.jpg,97
388,98.xml,Likely Benign,98_1.jpg,98


In [23]:
df['name'] = df['name'].astype(int)
df.sort_values(by='name')

Unnamed: 0,Filename,Category,image_Filename,name
0,1.xml,Unknown,1_1.jpg,1
110,2.xml,1 - Benign,2_1.jpg,2
220,3.xml,Likely Benign,3_1.jpg,3
323,4.xml,Likely Benign,4_1.jpg,4
335,5.xml,2 - Malignant (Confirmed Cancer),5_1.jpg,5
...,...,...,...,...
319,396.xml,Unknown,396_1.jpg,396
320,397.xml,1 - Benign,397_1.jpg,397
321,398.xml,2 - Malignant (High Risk),398_1.jpg,398
322,399.xml,2 - Malignant (High Risk),399_1.jpg,399


In [24]:
# Count how many 0,1,2,3,4a,4b,....
category_counts = df["Category"].value_counts()

In [25]:
category_counts

Category
Likely Benign                       99
Unknown                             92
2 - Malignant (High Risk)           75
2 - Malignant (Very High Risk)      56
1 - Benign                          38
2 - Malignant (Confirmed Cancer)    30
Name: count, dtype: int64

In [26]:
##----MODEL----##
##82.35% accuracy...
import tensorflow as tf 
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

In [27]:


X_resized = tf.image.resize(x_aug, (224, 224))  # Resizing using TensorFlow
X_resized = X_resized.numpy()  # Convert to NumPy array if needed

print(X_resized.shape)  # Should be (num_samples, 224, 224, 3)


(186, 224, 224, 3)


In [28]:
##1.-->

X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_final, test_size=0.2, random_state=42)

In [29]:
##2.--->
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.3278 - loss: 5.4302 - val_accuracy: 0.6630 - val_loss: 1.8891
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7883 - loss: 0.9195 - val_accuracy: 0.8043 - val_loss: 1.2021
Epoch 3/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9290 - loss: 0.2842 - val_accuracy: 0.8261 - val_loss: 1.1215
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9719 - loss: 0.1125 - val_accuracy: 0.8478 - val_loss: 1.1718
Epoch 5/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9808 - loss: 0.0479 - val_accuracy: 0.8696 - val_loss: 0.9688
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9982 - loss: 0.0121 - val_accuracy: 0.8587 - val_loss: 1.0415
Epoch 7/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━

In [34]:
model.save('thyroid_classifier.h5')
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')



['label_encoder.pkl']

In [31]:
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model = Model(inputs=base_model.input, outputs=base_model.output)


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


In [32]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D

model = Sequential([
    Conv2D(64, (3,3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2,2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')
])


  super().__init__(


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Define the improved model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),
    
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    
    Dense(y_train.shape[1], activation='softmax')  # Output layer
])

# Compile the model
optimizer = Adam(learning_rate=0.001)  # Adjust learning rate if needed
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Add callbacks to prevent overfitting
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
]

# Train the model
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), batch_size=32, callbacks=callbacks)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 78ms/step - accuracy: 0.3550 - loss: 2.0572 - val_accuracy: 0.2935 - val_loss: 7.3570 - learning_rate: 0.0010
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.6425 - loss: 0.9856 - val_accuracy: 0.4239 - val_loss: 3.2777 - learning_rate: 0.0010
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.7283 - loss: 0.7368 - val_accuracy: 0.6522 - val_loss: 1.0716 - learning_rate: 0.0010
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.8066 - loss: 0.6071 - val_accuracy: 0.6957 - val_loss: 0.9286 - learning_rate: 0.0010
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.8336 - loss: 0.4544 - val_accuracy: 0.7935 - val_loss: 0.5479 - learning_rate: 0.0010
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 