In [1]:
from pathlib import Path
import os, sys
# Locate the repository root: the closest directory, starting from the current
# working directory and walking upwards, that contains a '.gitignore' file.
repo_path = Path.cwd().resolve()
while not (repo_path / '.gitignore').exists():
    if repo_path.parent == repo_path:
        # The filesystem root is its own parent; without this guard the loop
        # would never terminate when the notebook is run outside the repo.
        raise FileNotFoundError(
            f"No '.gitignore' found in {Path.cwd().resolve()} or any of its parents; "
            "run this notebook from inside the repository."
        )
    repo_path = repo_path.parent  # go up one level
# Make the repository importable (needed for the `notebooks.*` imports below),
# without adding a duplicate entry when the cell is re-run.
if str(repo_path) not in sys.path:
    sys.path.insert(0, str(repo_path))

import pandas as pd
import numpy as np
#Import paths and patients classes
from notebooks.info import path_label, patient
import notebooks.utils as utils

In [2]:
def get_ex_included(budget_path: Path, threshold: float = 1):
    """Split feature names into excluded and included using the budget CV value.

    Args:
        budget_path (Path): path to the Excel file with the budget. Its first
            column is used as the index (feature names) and it must contain
            exactly one other column, which holds the budget value.
        threshold (float, optional): features whose budget value is strictly
            greater than this are excluded. Defaults to 1.

    Returns:
        sequences: excluded and included features, as two pandas Index objects
        (excluded first). Features whose budget value is NaN appear in
        neither, because both comparisons evaluate to False for NaN.
    """
    # read the budget; the index holds the feature names
    budget = pd.read_excel(budget_path, index_col=0)
    # give the single value column a fixed name
    # (raises ValueError if the sheet has more than one value column)
    budget.columns = ['budget']
    values = budget['budget']
    # features above the threshold are excluded
    excluded = budget.loc[values > threshold].index
    # everything at or below the threshold is kept
    included = budget.loc[values <= threshold].index

    return excluded, included

def get_features(stype:str, excluded:list):
    """
    Return the per-patient mean features for a segmentation type, without the excluded features.

    Reads `data/features/features_all_time{stype}.csv` under the module-level
    `repo_path`, averages all rows that share the same `pat_num`, and drops
    the excluded columns.

    Args:
        stype (str): segmentation type (general or focal); inserted verbatim
            into the file name.
        excluded (seq): sequence of excluded feature (column) names

    Returns:
        pd.DataFrame: df with features, indexed by `pat_num`

    Raises:
        KeyError: if a name in `excluded` is not a column of the averaged table.
    """
    # get features
    features = pd.read_csv(repo_path / 'data' / 'features' / f'features_all_time{stype}.csv', index_col=0)
    # one row per patient: average every row that shares the same pat_num.
    # The `axis` keyword of groupby is deprecated (0 was the default anyway), so it is omitted.
    features = features.groupby(by='pat_num').mean()
    # remove features in excluded list
    features = features.drop(columns=excluded)

    return features

# Features preprocessing

In [3]:
# Instantiate the project's `path_label` class (imported from notebooks.info).
# NOTE(review): presumably exposes dataset paths and labels — confirm in notebooks/info.py.
info = path_label()

## CV greater than 100 (Incompatible budget)

In [5]:
# --- experiment hyper-parameters ---
stype = 'G'   # segmentation type
label = 'RP'  # receptor type (RP, RE, ki67)

# --- feature loading ---
# spreadsheet with the budget (CV) value of every feature
budget_path = repo_path.joinpath('data', 'budget', 'budget.xlsx')
# only the names excluded by their budget value are needed; the included ones are discarded
excluded, _ = get_ex_included(budget_path)
# per-patient feature table without the excluded columns
features = get_features(stype, excluded)

## PCC

Pearson Correlation Coefficient (PCC) based filtering is performed on the features to:
- Remove the redundant features
- Select, within each group of highly correlated features, the one that is most stable according to the budget values.
    - Stability can be seen as a feature-specific variance.

In [69]:
# Criteria to group features: minimum absolute correlation for two features
# to be considered redundant
pcc_value = 0.95

# Compute the Pearson correlation coefficient; take the absolute value because
# we are interested in the magnitude of the correlation, not the sign
pcc = features.corr(method='pearson').abs()
# Keep only the off-diagonal entries above `pcc_value` (everything else becomes NaN).
# The self-correlation on the diagonal is masked by position instead of with
# `pcc < 1`: that test also discarded pairs of distinct features with |r| == 1
# (the most redundant ones) and relied on the diagonal being exactly 1.0.
off_diagonal = ~np.eye(len(pcc), dtype=bool)
pcc = pcc[(pcc > pcc_value) & off_diagonal]
pcc.to_csv('pcc_high.csv')
# for each feature (row), list the names of the features highly correlated with it
pcc_names = pcc.apply(lambda x: x.dropna().index.tolist(), axis=1)
pcc_names.to_csv('pcc_names.csv')

In [114]:
# list for the clusters (as lists)
clusters = []
# features that already belong to a cluster
assigned = set()

# Build the clusters as connected components of the "highly correlated" graph
# described by pcc_names (feature -> list of correlated features).
# Merging into the first matching cluster only (the previous approach) never
# joined two existing clusters that turned out to be linked, so a feature could
# end up in several clusters and produce duplicated or redundant leaders.
for column in pcc_names.index:
    if column in assigned:
        continue  # already reached from an earlier feature
    cluster = [column]  # a lone feature forms its own cluster
    assigned.add(column)
    pending = [column]
    while pending:
        current = pending.pop()
        for neighbour in pcc_names.loc[current]:
            if neighbour not in assigned:
                assigned.add(neighbour)
                cluster.append(neighbour)
                pending.append(neighbour)
    clusters.append(cluster)

# every feature must be in exactly one cluster
assert sum(len(cluster) for cluster in clusters) == len(pcc_names)

print(f'The number of clusters is {len(clusters)}')

# select a leader feature for each cluster depending on the budget value
# (transposed so that the feature names become the columns)
budget = pd.read_excel(budget_path, index_col=0).T
# the leader is the feature with the lowest budget value in its cluster;
# members keep a deterministic discovery order, so ties resolve reproducibly
cluster_leaders = [budget[cluster].min().idxmin() for cluster in clusters]

print(f'The cluster leaders are: {cluster_leaders}')

The number of clusters is 31
The cluster leaders are: ['shape2D_Elongation', 'shape2D_MaximumDiameter', 'shape2D_PixelSurface', 'shape2D_MinorAxisLength', 'shape2D_PerimeterSurfaceRatio', 'shape2D_Sphericity', 'firstorder_10Percentile', 'firstorder_Median', 'firstorder_Entropy', 'firstorder_RobustMeanAbsoluteDeviation', 'firstorder_Kurtosis', 'firstorder_Maximum', 'firstorder_Minimum', 'firstorder_Range', 'glcm_SumAverage', 'glcm_Imc1', 'glcm_Idmn', 'glcm_Imc2', 'glcm_JointEntropy', 'glcm_MCC', 'glrlm_RunEntropy', 'glrlm_GrayLevelVariance', 'gldm_LargeDependenceLowGrayLevelEmphasis', 'gldm_LowGrayLevelEmphasis', 'glszm_SmallAreaHighGrayLevelEmphasis', 'glrlm_LongRunHighGrayLevelEmphasis', 'glrlm_LongRunLowGrayLevelEmphasis', 'ngtdm_Coarseness', 'ngtdm_Contrast', 'gldm_SmallDependenceHighGrayLevelEmphasis', 'gldm_SmallDependenceLowGrayLevelEmphasis']
