In [2]:
#DATA SPLITTING

In [14]:
import os
import random
import shutil

input_folder = "D:\\master_thesis\\datasets\\segmented"
train_folder = "D:\\master_thesis\\datasets\\data_split\\training_set"
test_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
validation_folder = "D:\\master_thesis\\datasets\\data_split\\validation_set"
train_ratio = 0.7   # 70% of the data is used for training
validation_ratio = 0.15  # 15% of the data is used for validation
test_ratio = 0.15  # 15% of the data is used for testing

# Create the destination folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)
os.makedirs(validation_folder, exist_ok=True)

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Get the list of image files in the tumor folder
    image_files = [filename for filename in os.listdir(tumor_folder_path) if
                   filename.endswith(".jpg") or filename.endswith(".png")]

    # Shuffle the image files randomly
    random.shuffle(image_files)

    # Split the data into train, test, and validation sets based on the given ratios
    train_size = int(len(image_files) * train_ratio)
    validation_size = int(len(image_files) * validation_ratio)

    train_files = image_files[:train_size]
    validation_files = image_files[train_size:train_size + validation_size]
    test_files = image_files[train_size + validation_size:]

    print("Copying images to train folder...")
    # Copy the image files to the train folder
    for filename in train_files:
        src_path = os.path.join(tumor_folder_path, filename)
        dest_path = os.path.join(train_folder, tumor_folder, filename)
        shutil.copy2(src_path, dest_path)

    print("Copying images to validation folder...")
    # Copy the image files to the validation folder
    for filename in validation_files:
        src_path = os.path.join(tumor_folder_path, filename)
        dest_path = os.path.join(validation_folder, tumor_folder, filename)
        shutil.copy2(src_path, dest_path)

    print("Copying images to test folder...")
    # Copy the image files to the test folder
    for filename in test_files:
        src_path = os.path.join(tumor_folder_path, filename)
        dest_path = os.path.join(test_folder, tumor_folder, filename)
        shutil.copy2(src_path, dest_path)

    print("Completed processing tumor folder:", tumor_folder)
    print()

print("Data splitting completed.")


Processing tumor folder: glioma
Copying images to train folder...
Copying images to validation folder...
Copying images to test folder...
Completed processing tumor folder: glioma

Processing tumor folder: menin
Copying images to train folder...
Copying images to validation folder...
Copying images to test folder...
Completed processing tumor folder: menin

Processing tumor folder: normal
Copying images to train folder...
Copying images to validation folder...
Copying images to test folder...
Completed processing tumor folder: normal

Processing tumor folder: pituitary
Copying images to train folder...
Copying images to validation folder...
Copying images to test folder...
Completed processing tumor folder: pituitary

Data splitting completed.


In [12]:
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import cv2

input_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Load the features from the .npy file
    feature_path = "D:\\master_thesis\\datasets\\feature_extracted2\\glioma"
    feature_folder = os.path.join(tumor_folder_path, feature_path)
    features = []
    labels = []

    # Iterate over the files in the feature folder
    for filename in os.listdir(feature_folder):
        if filename.endswith(".jpg"):
            feature_file_path = os.path.join(feature_folder, filename)
            print("Processing feature file:", feature_file_path)
            feature = cv2.imread(feature_file_path)
            features.append(feature)
            labels.append(tumor_folder)
            
    # Convert features and labels to numpy arrays
    features = np.array(features)
    labels = np.array(labels)
    print(labels)
    # Check if features and labels arrays are empty
    if features.size == 0 or labels.size == 0:
        print("No feature files found for tumor folder:", tumor_folder)
        continue

    # Reshape the feature data if needed (uncomment the line below and modify the shape)
    #features = features.reshape(features.shape[0], -1)

    # Train a Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    # rf = RandomForestClassifier(n_estimators=100,max_depth=6,class_weight='balanced',warm_start= True, n_jobs=-1)
    rf_classifier.fit(features, labels)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)

print("Model training completed.")


Processing tumor folder: glioma
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(19)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(23)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(30)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(48)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(49)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(51)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(53)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(54)_denoised_equilized.jpg
Processing feature file: D:\master_thesis\datasets\feature_extracted2\glioma\image(56)_d

  features = np.array(features)


ValueError: setting an array element with a sequence.

In [9]:
# Check if features and labels arrays are empty
    if features.size == 0 or labels.size == 0:
        print("No feature files found for tumor folder:", tumor_folder)
        continue

    # Reshape the feature data if needed (uncomment the line below and modify the shape)
    features = features.reshape(features.shape[0], 2)

    # Train a Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(features, labels)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)

print("Model training completed.")

IndentationError: unexpected indent (313788011.py, line 2)

In [4]:
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

input_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Load the features from the .npy file
    feature_path = "D:\\master_thesis\\datasets\\feature_extracted"
    feature_folder = os.path.join(tumor_folder_path, feature_path)
    features = []
    labels = []

    # Iterate over the files in the feature folder
    for filename in os.listdir(feature_folder):
        if filename.endswith(".npy"):
            feature_file_path = os.path.join(feature_folder, filename)
            feature = np.load(feature_file_path)
            features.append(feature)
            labels.append(tumor_folder)

    # Convert features and labels to numpy arrays
    features = np.array(features)
    labels = np.array(labels)

    # Reshape the feature data if needed (uncomment the line below and modify the shape)
    features = features.reshape(1, -1)
    # features = features.reshape(features.shape[0], features.shape[1], features.shape[2], features.shape[3])
    # Train a Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(features, labels)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)

print("Model training completed.")


Processing tumor folder: glioma


ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by RandomForestClassifier.

In [7]:
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

input_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Load the features from the .npy file
    feature_path = "D:\\master_thesis\\datasets\\feature_extracted"
    feature_folder = os.path.join(tumor_folder_path, feature_path)
    features = []
    labels = []

    # Iterate over the files in the feature folder
    for filename in os.listdir(feature_folder):
        if filename.endswith(".npy"):
            feature_file_path = os.path.join(feature_folder, filename)
            print("Processing feature file:", feature_file_path)
            feature = np.load(feature_file_path)
            features.append(feature)
            labels.append(tumor_folder)

    # Convert features and labels to numpy arrays
    features = np.array(features)
    labels = np.array(labels)

    # Train a Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(features, labels)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)

print("Model training completed.")


Processing tumor folder: glioma


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [5]:
#model training - test set
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

image_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
feature_folder = "D:\\master_thesis\\datasets\\feature_extracted"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Initialize lists to store the features and labels
features = []
labels = []

# Iterate over the subfolders in the feature folder (each subfolder represents a tumor type)
for tumor_folder in os.listdir(feature_folder):
    feature_tumor_folder_path = os.path.join(feature_folder, tumor_folder)
    if not os.path.isdir(feature_tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Get the corresponding image folder path
    image_tumor_folder_path = os.path.join(image_folder, tumor_folder)

    # Iterate over the files in the feature tumor folder
    for filename in os.listdir(feature_tumor_folder_path):
        if filename.endswith(".npy"):
            # Load the features from the .npy file
            feature_file = os.path.join(feature_tumor_folder_path, filename)
            feature = np.load(feature_file)

            # Append the features to the list
            features.append(feature)

            # Append the label based on the tumor folder name
            labels.append(tumor_folder)

print("Total features:", len(features))

# Convert the features and labels lists to numpy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create an instance of the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier using the training data
rf_classifier.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Save the trained model
model_file_path = os.path.join(output_folder, "random_forest.joblib")
joblib.dump(rf_classifier, model_file_path)

print("Model training completed.")


Processing tumor folder: glioma
Processing tumor folder: menin
Processing tumor folder: normal
Processing tumor folder: pituitary
Total features: 28498


  features = np.array(features)


ValueError: setting an array element with a sequence.

In [4]:
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

input_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Load the features from the .npy file
    feature_path = "D:\\master_thesis\\datasets\\feature_extracted"
    feature_folder = os.path.join(tumor_folder_path, feature_path)
    features = []
    labels = []

    # Iterate over the files in the feature folder
    for filename in os.listdir(feature_folder):
        if filename.endswith(".npy"):
            feature_file_path = os.path.join(feature_folder, filename)
            feature = np.load(feature_file_path)
            features.append(feature)
            labels.append(tumor_folder)

    # Convert features and labels to numpy arrays
    features = np.array(features)
    labels = np.array(labels)

    # Train a Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(features, labels)

    # Predict labels for the training data
    train_predictions = rf_classifier.predict(features)

    # Calculate the accuracy of the model on the training data
    train_accuracy = accuracy_score(labels, train_predictions)
    print("Training Accuracy:", train_accuracy)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)

print("Model training completed.")


Processing tumor folder: test_set


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [2]:
#model training - test set
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

input_folder = "D:\\master_thesis\\datasets\\data_split\\test_set"
output_folder = "D:\\master_thesis\\datasets\\test_model"

# Iterate over the folders in the input folder (assuming each folder represents a tumor type)
for tumor_folder in os.listdir(input_folder):
    tumor_folder_path = os.path.join(input_folder, tumor_folder)
    if not os.path.isdir(tumor_folder_path):
        continue

    print("Processing tumor folder:", tumor_folder)

    # Load the features from the .npy file
    #feature = "D:\master_thesis\datasets\feature_extracted"
    features = np.load(os.path.join(tumor_folder_path, "D:\\master_thesis\\datasets\\feature_extracted"))

    # Assign labels based on the folder name
    labels = np.full((features.shape[0],), tumor_folder)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Create an instance of the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the Random Forest classifier using the training data
    rf_classifier.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = rf_classifier.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Save the trained model
    model_file_path = os.path.join(output_folder, tumor_folder + ".joblib")
    joblib.dump(rf_classifier, model_file_path)

    print("Completed processing tumor folder:", tumor_folder)
    

print("Model training completed.")


Processing tumor folder: glioma


PermissionError: [Errno 13] Permission denied: 'D:\\master_thesis\\datasets\\feature_extracted'

In [1]:
#Model training - using machine learning approach - Random Forest Classifier
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the feature data and labels
data = np.load('path/to/feature_data.npy')
labels = np.load('path/to/labels.npy')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100)

# Train the classifier
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the feature data and labels for the training set
training_data = np.load('path/to/training_data.npy')
training_labels = np.load('path/to/training_labels.npy')

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100)

# Train the classifier
clf.fit(training_data, training_labels)

# Load the feature data and labels for the test set
test_data = np.load('path/to/test_data.npy')
test_labels = np.load('path/to/test_labels.npy')

# Predict the labels for the test set
test_predictions = clf.predict(test_data)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy: {accuracy:.2f}')

In [2]:
#Model training - using deep learning approach - CNN
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical

# Load the training data and labels
training_data = np.load('path/to/training_data.npy')
training_labels = np.load('path/to/training_labels.npy')

# Preprocess the data
# ...

# Convert labels to one-hot encoding
num_classes = len(np.unique(training_labels))
training_labels = to_categorical(training_labels, num_classes)

# Define the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(image_height, image_width, channels)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(training_data, training_labels, batch_size=32, epochs=10, validation_split=0.2)

# Save the trained model
model.save('path/to/saved_model.h5')

In [3]:
#model evaluation - accuracy, error rate, confusion matrix, classification report with validation set
#Random Forest Classifier evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load the testing set features and labels
X_test = ...
y_test = ...

# Make predictions using the trained random forest model
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_mat)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
#Random Forest Classifier evaluation
from tensorflow import keras
import numpy as np

# Load the testing set features and labels
X_test = ...
y_test = ...

# Normalize the pixel values of the testing set
X_test = X_test / 255.0

# Load the trained CNN model
model = keras.models.load_model('path/to/model.h5')

# Make predictions using the trained model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate evaluation metrics
accuracy = np.mean(y_pred_classes == y_test)
confusion_mat = confusion_matrix(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='macro')
recall = recall_score(y_test, y_pred_classes, average='macro')
f1 = f1_score(y_test, y_pred_classes, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_mat)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
