In [2]:
#DATA SPLITTING

In [None]:
#data is split into train, test and validation sets
import pandas as pd
import numpy as np
import os
import random
import shutil

# Split configuration.
# NOTE: the folder paths are empty placeholders here and must be filled in before running.
input_folder = ""
output_folder = ""
train_folder = ""
test_folder = ""
validation_folder = ""
train_ratio = 0.7   #70% of the data is used for training
validation_ratio = 0.15 #15% of the data is used for validation
test_ratio = 0.15  #15% of the data is used for testing (implicitly: whatever is left over)

# Dedicated, seeded generator: re-running this cell sends every image to the same
# split again, so a file cannot end up in both train and test after a re-run.
split_seed = 42
split_rng = random.Random(split_seed)

# shutil.copy2 does not create missing directories, so create the destinations up front.
# Empty paths are skipped: os.path.join("", name) resolves to the current directory.
for split_folder in (train_folder, validation_folder, test_folder):
    if split_folder:
        os.makedirs(split_folder, exist_ok=True)

# Iterate over the folders in the input folder (assuming each folder represents a tumor type).
# sorted() fixes the visiting order so the seeded draws line up between runs.
for tumor_folder in sorted(os.listdir(input_folder)):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    # Iterate over the files of THIS tumor folder. The loop has to be nested here;
    # placed after the outer loop it would only ever see the last directory entry.
    for filename in sorted(os.listdir(tumor_folder_path)):
        if not filename.endswith((".jpg", ".png")):
            continue
        image_path = os.path.join(tumor_folder_path, filename)

        # Randomly select whether the image goes to the train, validation or test folder
        random_number = split_rng.random()
        if random_number < train_ratio:
            dest_folder = train_folder
        elif random_number < train_ratio + validation_ratio:
            dest_folder = validation_folder
        else:
            dest_folder = test_folder

        # Copy the image to the destination folder.
        # NOTE(review): the destination is flat (no per-tumor subfolder), so files with the
        # same name in different tumor folders overwrite each other - confirm this is intended.
        shutil.copy2(image_path, os.path.join(dest_folder, filename))


In [1]:
#Model training - using machine learning approach - Random Forest Classifier
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Train a Random Forest on a random 80/20 split of one feature file and report test accuracy.

# Load the feature data and labels
# NOTE(review): presumably data is (n_samples, n_features) and labels is (n_samples,) - confirm
data = np.load('path/to/feature_data.npy')
labels = np.load('path/to/labels.npy')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier.
# The forest is seeded like the split above; without random_state the bootstrap samples
# and feature subsets differ on every run, so the printed accuracy would not be reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a Random Forest on a pre-split training file and report accuracy on a separate test file.

# Load the feature data and labels for the training set
training_data = np.load('path/to/training_data.npy')
training_labels = np.load('path/to/training_labels.npy')

# Initialize the Random Forest classifier.
# A fixed random_state makes the fitted forest, and therefore the printed accuracy,
# identical on every run of this cell.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(training_data, training_labels)

# Load the feature data and labels for the test set
test_data = np.load('path/to/test_data.npy')
test_labels = np.load('path/to/test_labels.npy')

# Predict the labels for the test set
test_predictions = clf.predict(test_data)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy: {accuracy:.2f}')

In [2]:
#Model training - using deep learning approach - CNN
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical

# Build, train and save a small two-block CNN classifier.

# Load the training data and labels
training_data = np.load('path/to/training_data.npy')
training_labels = np.load('path/to/training_labels.npy')

# Preprocess the data
# ...
# NOTE(review): the CNN evaluation cell divides X_test by 255.0. If the saved arrays hold
# raw 0-255 pixel values, the same scaling has to be applied here - confirm.

# Take the input dimensions from the data itself; Conv2D needs (height, width, channels)
# and these names are not defined anywhere else in the notebook.
if training_data.ndim != 4:
    raise ValueError(
        "training_data must have shape (n_samples, height, width, channels), "
        f"got {training_data.shape}"
    )
image_height, image_width, channels = training_data.shape[1:]

# Convert labels to one-hot encoding
# NOTE(review): assumes labels are integers 0..num_classes-1 - TODO confirm
num_classes = len(np.unique(training_labels))
training_labels = to_categorical(training_labels, num_classes)

# Define the CNN model: two conv/max-pool stages followed by a dense classifier head
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(image_height, image_width, channels)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model (validation_split=0.2 is passed, so part of the training data is used for validation)
model.fit(training_data, training_labels, batch_size=32, epochs=10, validation_split=0.2)

# Save the trained model
model.save('path/to/saved_model.h5')

In [3]:
#model evaluation - accuracy, error rate, confusion matrix, classification report with validation set
#Random Forest Classifier evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Evaluate the trained Random Forest (`clf`) on the testing set.

# Load the testing set features and labels
# TODO: replace the `...` placeholders with the real test features and labels
X_test = ...
y_test = ...

# Make predictions using the trained random forest model.
# `clf` is the fitted RandomForestClassifier; `model` is the Keras CNN and must not be used here.
y_pred = clf.predict(X_test)

# Calculate evaluation metrics (precision/recall/F1 are macro-averaged over the classes)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_mat)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
#CNN evaluation
from tensorflow import keras
import numpy as np

# Evaluate the saved CNN on the testing set.

# Imported here so this cell does not depend on the Random Forest evaluation cell having run.
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Load the testing set features and labels
# TODO: replace the `...` placeholders with the real test images and labels
X_test = ...
y_test = ...

# Normalize the pixel values of the testing set
X_test = X_test / 255.0

# Load the trained CNN model (same path the training cell saves to)
model = keras.models.load_model('path/to/saved_model.h5')

# Make predictions using the trained model
y_pred = model.predict(X_test)
# Collapse the per-class scores to the index of the highest-scoring class
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate evaluation metrics (precision/recall/F1 are macro-averaged over the classes)
# NOTE(review): assumes y_test holds integer class indices with shape (n_samples,),
# not one-hot rows - TODO confirm
accuracy = np.mean(y_pred_classes == y_test)
confusion_mat = confusion_matrix(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='macro')
recall = recall_score(y_test, y_pred_classes, average='macro')
f1 = f1_score(y_test, y_pred_classes, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_mat)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
