In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.cm as cm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import torch
import seaborn as sns
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def balance_dataset(input_file_path, file_name, random_state=None):
    """Downsample every class of a labelled CSV to the size of the rarest class.

    Reads ``<input_file_path>/<file_name>.csv``, drops its first column and
    treats the first remaining column as the class label. Every class with
    more rows than the smallest class is randomly downsampled to that size.
    The per-class counts of the result are printed and the balanced table is
    written to ``<input_file_path>/<file_name>_balanced.csv`` without the index.

    Parameters
    ----------
    input_file_path : str
        Directory that contains the input CSV; the output is written there too.
    file_name : str
        Name of the input file without the ``.csv`` extension.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. ``None`` (the default) keeps
        the sampling unseeded; pass an integer for a reproducible selection.

    Raises
    ------
    ValueError
        If the file contains no row with a non-missing class label.
    """
    file_path = os.path.join(input_file_path, f'{file_name}.csv')

    # Load the CSV and ignore its first column.
    df = pd.read_csv(file_path, header=0).iloc[:, 1:]

    # The first remaining column holds the class label.
    class_labels = df.iloc[:, 0]

    # One frame per class. Missing labels are skipped: NaN never compares equal
    # to itself, so a NaN "class" would yield an empty frame, force
    # min_samples to 0 and wipe out every other class.
    class_dataframes = {
        class_name: df[class_labels == class_name]
        for class_name in class_labels.dropna().unique()
    }
    if not class_dataframes:
        raise ValueError(f"No labelled rows found in {file_path!r}; nothing to balance.")

    # Size of the rarest class: the target size for all classes.
    min_samples = min(len(class_df) for class_df in class_dataframes.values())

    # Downsample only the over-represented classes; the others are kept as-is.
    balanced_parts = [
        class_df.sample(min_samples, random_state=random_state)
        if len(class_df) > min_samples
        else class_df
        for class_df in class_dataframes.values()
    ]

    # Concatenate the per-class frames back into a single dataframe.
    balanced_df = pd.concat(balanced_parts)

    print(balanced_df.iloc[:, 0].value_counts())

    output_file_path = os.path.join(input_file_path, f'{file_name}_balanced.csv')

    # Save the balanced dataset to a new CSV file.
    balanced_df.to_csv(output_file_path, index=False)

In [3]:
# Balance the annotated dataset; the result is written next to the input
# as "<file_name>_balanced.csv".
input_file_path = r"C:\Users\jenni\OneDrive - Queen's University\DESI project\DESI TXT colon\Annotated Dataset"
file_name = "2021 03 30 colon 0413337-2 Analyte 6_dataset"

balance_dataset(input_file_path=input_file_path, file_name=file_name)

adenocarcinoma    457
benign mucosa     457
smooth muscle     457
serosa            457
submucosa         457
Name: Class, dtype: int64


In [79]:
def get_test_train(output, csv_path, patch_size, dim_y, dim_x, test_size=0.2, random_state=42):
    """Build a train/test split of per-position features and majority-vote labels.

    For each grid position ``(y, x)`` the sample consists of
    ``output_patches_flattened[y, x, :]`` plus one extra column: the mean of
    all CSV values from column position 3 onwards over the annotated
    coordinates inside the ``patch_size`` x ``patch_size`` window whose corner
    is ``(x, y)``. The label is the most frequent ``Class`` in that window
    (ties resolve to the smallest label in sort order). Positions whose window
    contains no CSV row are skipped.

    Parameters
    ----------
    output : torch.Tensor
        Tensor that is unfolded along dims 1 and 2 and then reshaped to
        ``(dim_y, dim_x, -1)``.
        NOTE(review): the recorded run prints ``[53915, 56, 1, 5]`` after
        unfolding, which suggests a 2-D (pixels, channels) input whose windows
        slide over channels rather than over space. Confirm this is intended.
    csv_path : str
        CSV with at least the columns ``Class``, ``X`` and ``Y``.
    patch_size : int
        Window length used for unfolding and for the CSV neighbourhood.
    dim_y, dim_x : int
        Grid dimensions used to reshape the unfolded tensor.
    test_size : float, optional
        Fraction of samples held out for testing (default 0.2).
    random_state : int, optional
        Seed for ``train_test_split`` (default 42).

    Returns
    -------
    features_train, features_test, labels_train, labels_test
        Arrays from ``train_test_split``; labels are integer-encoded with a
        ``LabelEncoder`` whose mapping is printed.

    Raises
    ------
    ValueError
        If no window contains an annotated coordinate.
    """
    # Sliding windows of length patch_size (stride 1) along dims 1 and 2.
    output_patches = output.unfold(1, patch_size, 1).unfold(2, patch_size, 1)
    print(output_patches.size())

    # One flattened feature vector per (y, x) grid position.
    output_patches_flattened = output_patches.reshape(dim_y, dim_x, -1)
    print(output_patches_flattened.shape)

    # Load the CSV file; missing values become 0.
    df = pd.read_csv(csv_path)
    df.fillna(0, inplace=True)

    unique_classes = df['Class'].unique()
    print(f"Unique classes: {unique_classes}")

    # Map (X, Y) coordinates to the full CSV row at that coordinate.
    csv_dict = {(row['X'], row['Y']): row for _, row in df.iterrows()}

    labels = []
    features = []
    mz_values = []

    # Offsets of every cell in a patch_size x patch_size window (loop-invariant).
    window_offsets = [(dx, dy) for dx in range(patch_size) for dy in range(patch_size)]

    for y in range(0, output_patches_flattened.shape[0] - patch_size + 1):
        for x in range(0, output_patches_flattened.shape[1] - patch_size + 1):
            # Look every coordinate up once and keep only the annotated ones.
            patch_rows = [
                row
                for row in (csv_dict.get((x + dx, y + dy)) for dx, dy in window_offsets)
                if row is not None
            ]
            if not patch_rows:
                continue

            patch_labels = [row['Class'] for row in patch_rows]
            # Values from column position 3 onwards; presumably the m/z
            # columns following Class, X, Y. Verify against the CSV layout.
            patch_mz_values = [row.iloc[3:] for row in patch_rows]

            # Majority vote. np.unique returns sorted values, so argmax picks
            # the smallest label among ties. This works for string labels on
            # every SciPy/NumPy version, unlike stats.mode(...)[0][0].
            class_names, class_counts = np.unique(patch_labels, return_counts=True)
            most_common_label = class_names[np.argmax(class_counts)]

            features.append(np.asarray(output_patches_flattened[y, x, :]))
            labels.append(most_common_label)
            # Single scalar: mean over every value of every row in the window.
            mz_values.append(np.mean(patch_mz_values))

    if not labels:
        raise ValueError("No annotated coordinates fall inside any patch; cannot build a dataset.")

    labels = np.array(labels)
    # Keep one row per sample. reshape(-1, 1) would flatten multi-value feature
    # vectors into a single column and break alignment with labels/mz_values.
    features = np.stack(features).reshape(len(labels), -1)
    mz_values = np.array(mz_values).reshape(-1, 1)

    # Append the mean value as the last feature column.
    features = np.hstack((features, mz_values))

    # Handle NaN values
    features = np.nan_to_num(features, nan=0.0)

    # Encode the class labels as integers and show the mapping.
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    for i, class_label in enumerate(le.classes_):
        print(f"{i}: {class_label}")

    # Split the features and labels into a training set and a test set.
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    return features_train, features_test, labels_train, labels_test

In [54]:
# def get_test_train(output, csv_path, patch_size):
#     # Extract patches from the output feature maps
#     output_patches = output.unfold(1, patch_size, 1).unfold(2, patch_size, 1)
#     print(output_patches.size())

#     # Flatten the patches
#     output_patches_flattened = output_patches.reshape(output_patches.shape[0], -1)
#     print(output_patches_flattened.shape)

#     # Load the CSV file
#     df = pd.read_csv(csv_path)
#     df.fillna(0, inplace=True)

#     unique_classes = df['Class'].unique()
#     print(f"Unique classes: {unique_classes}")

#     # Create a dictionary where the keys are the (X, Y) coordinates and the values are the class labels
#     csv_dict = {(row['X'], row['Y']): row for _, row in df.iterrows()}

#     # Initialize the features and labels
#     features = []
#     labels = []

#     print(output.shape)

#     #205 263

#     # For each patch
#     for y in range(0, output.shape[0] - patch_size + 1):
#         for x in range(0, output.shape[1] - patch_size + 1):
#             # Get the class labels for the corresponding region in the CSV file
#             patch_labels = [csv_dict.get((x + dx, y + dy))['Class'] for dx in range(patch_size) for dy in range(patch_size) if csv_dict.get((x + dx, y + dy)) is not None]

#             patch = output[x:x+patch_size, y:y+patch_size]

#             # Print the patch
#             #print(f"Patch at ({x}, {y}):")
#             #print(patch)

#             # If there are any class labels for the corresponding region in the CSV file
#             if patch_labels:
#                 # Get the most common class label in the region
#                 most_common_label = stats.mode(patch_labels)[0][0]

#                 # features.append(np.concatenate([output_patches_flattened[y * output.shape[1] + x, :], df.loc[
#                 #                                     (df['Y'] == x) & (df['X'] == y), df.columns[3:]].mean().values]))

#                 features.append(np.concatenate([output_patches_flattened[y * output.shape[1] + x, :], df.loc[
#                                                     (df['X'] == x) & (df['Y'] == y), df.columns[3:]].mean().values]))

#                 print(f"Most common class label: {most_common_label}")

#                 # Add the most common class label to the labels
#                 labels.append(most_common_label)

#     unique_labels = np.unique(labels)
#     print(f"Unique labels: {unique_labels}")

#     # Handle NaN values
#     features = np.nan_to_num(features, nan=0.0)

#     # Encode the class labels as integers
#     le = LabelEncoder()
#     labels = le.fit_transform(labels)

#     for i, class_label in enumerate(le.classes_):
#         print(f"{i}: {class_label}")

#     # Split the features and labels into a training set and a test set
#     features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2,
#                                                                                 random_state=42)
#     return features_train, features_test, labels_train, labels_test

In [80]:
patch_size = 5
csv_path = r"C:\Users\jenni\OneDrive - Queen's University\DESI project\DESI TXT colon\Annotated Dataset\2021 03 30 colon 0413337-2 Analyte 6_dataset_balanced.csv"

# Load the saved array and convert it to a torch tensor in one step.
# (earlier note) torch.Size([51980, 30]) torch.Size([51980, 30])
output = torch.from_numpy(
    np.load(r"C:\Users\jenni\OneDrive - Queen's University\DESI project\DESI TXT colon\dc-DeepMSI outputs\w5 nc60\2021 03 30 colon 0413337-2 Analyte 6 array.npy")
)

# Grid size passed to get_test_train: dim_y=205, dim_x=263.
features_train, features_test, labels_train, labels_test = get_test_train(
    output, csv_path, patch_size, dim_y=205, dim_x=263
)

torch.Size([53915, 56, 1, 5])
torch.Size([205, 263, 280])
Unique classes: ['adenocarcinoma' 'benign mucosa' 'smooth muscle' 'serosa' 'submucosa']
0: adenocarcinoma
1: benign mucosa
2: serosa
3: smooth muscle
4: submucosa




In [None]:
def logisticRegression(features_train, features_test, labels_train, labels_test):
    """Fit a logistic regression on standardized features and report test metrics.

    The scaler is fitted on the training features only and then applied to the
    test features. Prints the accuracy, the classification report and the
    confusion matrix, and shows the confusion matrix as a heatmap.

    Parameters
    ----------
    features_train, features_test : array-like
        Feature matrices for training and evaluation.
    labels_train, labels_test : array-like
        Class labels matching the feature matrices.
    """
    # Standardize using training statistics only, so the test set stays unseen.
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    print("Training")
    clf = LogisticRegression(max_iter=1000, solver='newton-cg')
    clf.fit(features_train, labels_train)

    print("Predicting")
    predicted_labels = clf.predict(features_test)

    # Accuracy
    accuracy = accuracy_score(labels_test, predicted_labels)
    print("Accuracy: " + str(accuracy))

    # Classification report
    print("Classification Report")
    print(classification_report(labels_test, predicted_labels))

    # Confusion matrix: computed once and reused for printing and plotting.
    c_matrix = confusion_matrix(labels_test, predicted_labels)
    print("Confusion Matrix: " + str(c_matrix))

    # Rows of the matrix are true labels, columns are predictions.
    plt.figure(figsize=(10, 7))
    sns.heatmap(c_matrix, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.show()

In [None]:
def logisticRegression_grid(features_train, features_test, labels_train, labels_test):
    """Grid-search logistic regression hyper-parameters and report test metrics.

    Features are standardized with a scaler fitted on the training set, as in
    ``logisticRegression``. A 5-fold ``GridSearchCV`` is run over ``C``,
    ``penalty`` and ``solver``; the best parameters, the test accuracy and the
    confusion matrix are printed, and the confusion matrix is shown as a heatmap.

    Parameters
    ----------
    features_train, features_test : array-like
        Feature matrices for training and evaluation.
    labels_train, labels_test : array-like
        Class labels matching the feature matrices.
    """
    # Standardize using training statistics only; sag/saga in particular
    # converge poorly on unscaled features.
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Only solver/penalty pairs that LogisticRegression supports are listed:
    # newton-cg, lbfgs and sag cannot fit an l1 penalty, so a full cross
    # product would spend fits on combinations that always fail.
    param_grid = [
        {
            'C': c_values,
            'penalty': ['l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        },
        {
            'C': c_values,
            'penalty': ['l1'],
            'solver': ['liblinear', 'saga'],
        },
    ]

    # Create a GridSearchCV object
    grid_search = GridSearchCV(LogisticRegression(max_iter=10000), param_grid, cv=5)

    # Fit the GridSearchCV object to the data
    grid_search.fit(features_train, labels_train)

    # Print the best parameters
    print("Best Parameters: ", grid_search.best_params_)

    # Use the best model to make predictions
    clf = grid_search.best_estimator_
    predicted_labels = clf.predict(features_test)

    accuracy = accuracy_score(labels_test, predicted_labels)
    print("Accuracy: " + str(accuracy))

    # Confusion matrix: computed once and reused for printing and plotting.
    c_matrix = confusion_matrix(labels_test, predicted_labels)
    print("Confusion Matrix: " + str(c_matrix))

    # Rows of the matrix are true labels, columns are predictions.
    plt.figure(figsize=(10, 7))
    sns.heatmap(c_matrix, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.show()

In [None]:
# Train and evaluate the logistic regression on the split created above.
logisticRegression(
    features_train=features_train,
    features_test=features_test,
    labels_train=labels_train,
    labels_test=labels_test,
)

In [None]:
def runRF(features_train, features_test, labels_train, labels_test, random_state=42):
    """Fit a random forest with default hyper-parameters and report test metrics.

    Prints the accuracy, the confusion matrix and the classification report,
    and shows the confusion matrix as a heatmap.

    Parameters
    ----------
    features_train, features_test : array-like
        Feature matrices for training and evaluation.
    labels_train, labels_test : array-like
        Class labels matching the feature matrices.
    random_state : int or None, optional
        Seed for the forest (default 42) so repeated runs give the same model;
        pass ``None`` for an unseeded forest.
    """
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(features_train, labels_train)
    y_test_preds = rf_model.predict(features_test)

    accuracy = accuracy_score(labels_test, y_test_preds)
    print("Accuracy: " + str(accuracy))

    # Print the confusion matrix
    c_matrix = confusion_matrix(labels_test, y_test_preds)
    print("Confusion Matrix: ")
    print(c_matrix)

    # Print the classification report
    print("Classification Report: ")
    print(classification_report(labels_test, y_test_preds))

    # Visualize the confusion matrix using a heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(c_matrix, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.show()

In [None]:
def runrf_gridsearch(X_train, y_train, random_state=42):
    """Grid-search random forest hyper-parameters with 3-fold cross-validation.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    random_state : int or None, optional
        Seed for the base forest (default 42). Without a seed the scores of
        the candidates vary between runs, and so can the selected parameters.

    Returns
    -------
    RandomForestClassifier
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # 5 * 6 * 3 * 3 * 2 = 540 candidates, each fitted on 3 folds.
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Create a seeded base model
    rf = RandomForestClassifier(random_state=random_state)

    # Instantiate the grid search model; n_jobs=-1 uses all available cores.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Print the best parameters
    print(f"Best parameters: {grid_search.best_params_}")

    return grid_search.best_estimator_

In [None]:
# runrf_gridsearch takes only the training split (X_train, y_train); passing
# the test arrays as well raises TypeError.
runrf_gridsearch(features_train, labels_train)

In [None]:
def run_decisiontree(features_train, features_test, labels_train, labels_test, random_state=42):
    """Fit a decision tree with default hyper-parameters and report test metrics.

    Prints the accuracy, the confusion matrix and the classification report,
    and shows the confusion matrix as a heatmap.

    Parameters
    ----------
    features_train, features_test : array-like
        Feature matrices for training and evaluation.
    labels_train, labels_test : array-like
        Class labels matching the feature matrices.
    random_state : int or None, optional
        Seed for the tree (default 42) so repeated runs give the same model;
        pass ``None`` for an unseeded tree.
    """
    # Create and train a seeded decision tree classifier
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(features_train, labels_train)

    # Make predictions
    predicted_labels = clf.predict(features_test)

    # Print the accuracy
    accuracy = accuracy_score(labels_test, predicted_labels)
    print("Accuracy: " + str(accuracy))

    # Print the confusion matrix
    c_matrix = confusion_matrix(labels_test, predicted_labels)
    print("Confusion Matrix: ")
    print(c_matrix)

    # Print the classification report
    print("Classification Report: ")
    print(classification_report(labels_test, predicted_labels))

    # Visualize the confusion matrix using a heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(c_matrix, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.show()

In [None]:
# NOTE(review): `run_decisiontree_gridsearch` is not defined in any visible cell
# of this notebook (only `run_decisiontree` is). Confirm it is defined
# elsewhere before running; otherwise this call raises NameError.
run_decisiontree_gridsearch(features_train, features_test, labels_train, labels_test)