## Cluster feature importance

This notebook implements different measures of cluster importance for the paper:

* Laplacian scores (already computed in the results)
* Intra-cluster variable similarity, using logistic regression and SVM
* Feature selection, using logistic regression and SVM with l1 penalty

In [5]:
# Include and load packages, config files

import numpy as np
import simlr_ad
import pandas as pd
from utils.data_utils import load_all_data
from utils.utils import compute_simlr, feat_ranking, estimate_number_clusters
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Parameters of the procedure
clusters = 3
rd_seed = 1714                                          # Random seed for experiment replication

# Apply the seed: it was declared above but never used. This seeds NumPy's global RNG.
# NOTE(review): only makes the clustering reproducible if compute_simlr draws from
# NumPy's global RNG -- confirm against its implementation.
np.random.seed(rd_seed)

# Paths
existing_cluster = True                               # Compute the clustering again or use an existing one
cluster_path = "results/extendeddata_cluster/"   # Path of the existing cluster, if applicable
covariate_path = "data/useddata_homo_abeta_plasma_meta.csv"                 # Path of the covariate data frame (.csv)
feature_path = "data/UCSDVOL.csv"                     # Path of the feature data frame (.csv)

covariate_data, cov_names, feature_data, feature_names = load_all_data(covariate_path, feature_path)
# Attach the baseline diagnosis column (DX_bl) from the covariates to the feature frame
feature_data['DX'] = covariate_data.DX_bl.values

if existing_cluster:
    # Load the existing clustering results from disk
    c_data = pd.read_csv(cluster_path + 'cluster_data.csv')
    ## Load S, F data
    S = np.load(cluster_path + 'S_matrix.npy')
    ydata = np.load(cluster_path + 'ydata_matrix.npy')
    F = np.load(cluster_path + 'F_matrix.npy')
else:
    # Compute base clustering
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data[cov_names]), clusters)
    # Later cells read c_data['C'] regardless of the mode; previously this branch left
    # c_data undefined, so they failed with a NameError.
    # NOTE(review): assumes y_b holds one cluster label per row of covariate_data, coded
    # the same way as the 'C' column of cluster_data.csv -- confirm against compute_simlr.
    c_data = pd.DataFrame({'C': np.asarray(y_b).ravel()})


In [30]:
# Feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Per-cluster feature selection with L1-penalised linear models.
# For every cluster label c in 1..clusters, a binary target (row belongs to cluster c
# or not) is fitted, and the covariates retained by SelectFromModel are printed:
# first for logistic regression, then for the linear SVM.
cov_names_np = np.array(cov_names)
X_cov = covariate_data[cov_names]   # design matrix is loop-invariant, slice it once

for c in range(1, clusters+1):
    print('For Cluster ' + str(c))
    # c_data['C'] is compared against 1..clusters, i.e. labels are expected to be 1-based
    y = (c_data['C'] == c).astype(int)

    # LogisticRegression featselect
    # solver='liblinear' is stated explicitly: the L1 penalty is not supported by the
    # 'lbfgs' solver that newer scikit-learn releases use by default, so omitting it
    # raises a ValueError there. liblinear shuffles the data, hence the fixed seed.
    lr = LogisticRegression(penalty='l1', C=0.25, solver='liblinear',
                            random_state=rd_seed)

    # LinearSVC feat select
    svm = LinearSVC(penalty='l1', C=0.1, dual=False)

    # Same fit -> select -> report steps for both models (LR first, then SVM)
    for model in (lr, svm):
        model.fit(X_cov, y)
        sfm = SelectFromModel(model, prefit=True)
        feats = sfm.get_support(indices=True)   # integer positions of the kept covariates
        print(len(feats))
        print(cov_names_np[feats])
    

For Cluster 1
14
['Alpha-1-Microglobulin (A1Micro) (ug/ml)'
 'Apolipoprotein A-I (Apo A-I) (mg/mL)' 'CD40 Ligand (CD40-L) (ng/mL)'
 'Fatty Acid-Binding Protein- heart  (FABP (ng/mL)'
 'Follicle-Stimulating Hormone (FSH) (mIU/mL)'
 'Growth-Regulated alpha protein (GRO-alph (pg/mL)'
 'Immunoglobulin E (IgE) (ng/mL)'
 'Prostatic Acid Phosphatase (PAP) (ng/mL)'
 'Pregnancy-Associated Plasma Protein A (P (mIU/mL)'
 'T-Cell-Specific Protein RANTES (RANTES) (ng/mL)' 'Resistin (ng/ml)'
 'Testosterone- Total (ng/ml)' 'Thrombomodulin (TM) (ng/ml)' 'Kynurenine']
20
['Alpha-1-Microglobulin (A1Micro) (ug/ml)'
 'Apolipoprotein A-I (Apo A-I) (mg/mL)'
 'Apolipoprotein C-I (Apo C-I) (ng/ml)' 'CD40 Ligand (CD40-L) (ng/mL)'
 'Chromogranin-A (CgA) (ng/mL)'
 'Fatty Acid-Binding Protein- heart  (FABP (ng/mL)'
 'Follicle-Stimulating Hormone (FSH) (mIU/mL)'
 'Growth-Regulated alpha protein (GRO-alph (pg/mL)'
 'Immunoglobulin E (IgE) (ng/mL)'
 'Matrix Metalloproteinase-9- total (MMP-9 (ng/ml)'
 'Prostatic Acid

In [None]:
# Intra cluster variable similarity

