In [18]:
from pathlib import Path
import os, sys
repo_path= Path.cwd().resolve()
while '.gitignore' not in os.listdir(repo_path): # while not in the root of the repo
    repo_path = repo_path.parent #go up one level
sys.path.insert(0,str(repo_path)) if str(repo_path) not in sys.path else None

import pandas as pd
import numpy as np
#Import paths and patients classes
from notebooks.info import path_label, patient
import notebooks.utils as utils

In [4]:
def get_ex_included(budget_path: Path):
    """get excluded features name using the budget CV value

    Args:
        budget_path (Path): path to the csv file with the budget

    Returns:
        sequences: excluded and included features
    """
    # get the name of the features from the budget
    budget = pd.read_excel(budget_path, index_col=0)
    # change name of column
    budget.columns = ['budget']
    # get all features with values greater than 1
    excluded = budget[budget[ 'budget' ] > 1].index
    # get all other names
    included = budget[budget[ 'budget' ] <= 1].index
    
    return excluded, included

def get_features(stype:str, excluded:list):
    """
    return df with features given the segmentation type and excluded features

    Args:
        stype (str): general or focal
        excluded (seq): sequence of excluded features

    Returns:
        pd.DataFrame: df with features
    """
    # get features
    features = pd.read_csv(repo_path / 'data' / 'features' / f'features_all_time{stype}.csv', index_col=0)
    features = features.groupby(by='pat_num', axis=0).mean()
    # remove features in excluded list
    features = features.drop(excluded, axis=1)

    return features

# Image preprocessing

## CV greater than 100 (Uncompatible features)

In [9]:
# load all features
# HP
stype='G' # segmentation type
label = 'RP' # receptor type (RP, RE, ki67)
budget_path = repo_path/ 'data' / 'budget' / 'budget.xlsx'

excluded, _ = get_ex_included(budget_path) # get excluded features due to their budget value
features = get_features(stype, excluded)
# load info class
info = path_label()

## PCC

Pearson Correlation Coefficient (PCC) based filtering is performed on the features to:
- Remove the redundant features
- Select those with more stable variability according to the budget values.

First all features with CV greater that 100 are removed

In [44]:
# HP
pcc_value = 0.95 # PCC value to group features

# Compute the pearson correlation coefficient
pcc = features.corr(method='pearson')
# compute absolute value
pcc = pcc.abs()
# if the value is greater than 0.9 but not exaclty 1, group them
pcc = pcc[(pcc > pcc_value) & (pcc < 1)]
pcc.to_csv('pcc_high.csv')
# go through each column and get the name of the features that are correlated
pcc_names = pcc.apply(lambda x: x.dropna().index.tolist(), axis=1)
pcc_names.to_csv('pcc_names.csv')

In [47]:
# go thorugh all columns and make groups of features if they are correlated at least with another feature
pcc_groups = []
for i in range(len(pcc_names)):
    # get the name of the feature
    feat_name = pcc_names.index[i]
    # get correlated features
    correlated = pcc_names[i]
    if len(correlated)==0: # check if empty
        continue # skip
    # check if the feature is already in a group
    if feat_name in [item for sublist in pcc_groups for item in sublist]:
        continue
    # check if the correlated features are already in a group
    for group in pcc_groups:
        if any(elem in group for elem in correlated): 
            # add to the group
            group.extend(correlated)
            break
    else:
        # create a new group
        pcc_groups.append(correlated)
# remove duplicates
pcc_groups = [list(set(x)) for x in pcc_groups]
# remove empty lists
pcc_groups = [x for x in pcc_groups if x != []]

print(f'There are {len(pcc_groups)} groups of features with PCC greater than {pcc_value}')

There are 16 groups of features with PCC greater than 0.95
