In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
# Path of the pickle file that holds the previously extracted features.
EXTRACTED_FEATURES_SAVE_ADDR = "../extracted_features.pickle"
# Number of principal components kept by the PCA step further down.
# NOTE(review): the saved output of the GMM inspection cell shows 16 values per
# patient (means + variances, i.e. 8 components), so that output was presumably
# produced with a different setting -- re-run the notebook or confirm.
NUMBER_OF_PCA_COMPONENTS = 512

In [3]:
def read_extracted_features(extracted_features_save_adr):
    """Load the extracted features stored in a pickle file.

    Args:
        extracted_features_save_adr: Path of the pickle file to read.

    Returns:
        The object that was pickled into the file, unchanged.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        that come from a trusted source.
    """
    with open(extracted_features_save_adr, mode='rb') as pickled:
        loaded = pickle.load(pickled)
    return loaded

In [4]:
# Load the pickled features and make sure we hold them as a plain dict;
# the cell then echoes how many top-level entries were loaded.
features = dict(read_extracted_features(EXTRACTED_FEATURES_SAVE_ADDR))
len(features)

170

In [5]:
# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

In [6]:
# Flatten the nested {patient: {item: vector}} mapping into a single table
# with one row per (patient, item) pair. Rows keep the iteration order of
# `features`, outer keys first and inner keys second.
flat_rows = {
    (patient_key, item_key): vector
    for patient_key, per_patient in features.items()
    for item_key, vector in per_patient.items()
}
df1 = pd.DataFrame.from_dict(flat_rows, orient='index')
df1.shape

(30007, 1024)

In [7]:
# Fit a whitened PCA on every row at once (all patients pooled) and project
# the rows onto the retained components. feat1 keeps the row order of df1.
pca = PCA(
    n_components=NUMBER_OF_PCA_COMPONENTS,
    svd_solver='full',
    whiten=True,
)
feat1 = pca.fit_transform(df1)

In [8]:
# Split the PCA-transformed rows back into one array per patient.
#
# The rows of feat1 are laid out in the iteration order of `features` (that is
# how the flattened table fed to the PCA was built), so we walk the dict in
# that same order and carry a running offset. The previous version derived the
# offset from `p < patient`, which is only correct when the dict happens to
# iterate in sorted key order, and it re-summed all lengths for every patient.
new_features = {}
start_index = 0
for patient, arrays in features.items():
    end_index = start_index + len(arrays)
    new_features[patient] = feat1[start_index:end_index]
    start_index = end_index

# Every transformed row must have been handed to exactly one patient.
assert start_index == len(feat1), (
    f"assigned {start_index} rows but feat1 has {len(feat1)}"
)

In [9]:
# Peek at the PCA-reduced rows of the first patient (dict insertion order).
patient_1 = list(new_features)[0]
len(new_features[patient_1])  # evaluated but not shown: only the last expression is echoed
new_features[patient_1]

array([[-0.68519324, -0.036267  ,  0.68056554, ...,  0.1245825 ,
         0.46785548, -0.5620293 ],
       [ 0.08258477,  0.2915892 , -0.453633  , ...,  0.2324271 ,
         1.4754457 , -3.426455  ],
       [ 0.07229007,  0.15851565, -0.6015045 , ..., -0.36923653,
        -0.9832874 , -0.952228  ],
       ...,
       [ 0.75171405,  0.951176  , -1.1128942 , ..., -0.05477891,
         0.27957812, -1.670907  ],
       [ 0.07584345,  1.3097154 ,  0.14997289, ...,  0.18237059,
         0.13031238, -0.17452797],
       [ 1.0626428 ,  0.9879044 , -0.23988684, ..., -1.5661945 ,
         1.2969369 , -1.068313  ]], dtype=float32)

In [10]:
# Summarise each patient with a single diagonal-covariance Gaussian fitted on
# that patient's PCA-reduced rows.
gmm_features = {}
for patient, reduced_rows in new_features.items():
    gmm = GaussianMixture(n_components=1, covariance_type='diag')
    gmm.fit(reduced_rows)
    # With one component and 'diag' covariance, means_ and covariances_ are
    # both (1, n_features); joining along axis 1 gives [means | variances].
    mean_cov = np.concatenate([gmm.means_, gmm.covariances_], axis=1)
    gmm_features[patient] = mean_cov




In [11]:
# Peek at the Gaussian summary (means followed by variances) of the first patient.
len(gmm_features)  # evaluated but not shown: only the last expression is echoed
patient_1 = list(gmm_features)[0]
gmm_features[patient_1]

array([[ 0.30672966,  0.46950914, -0.27195971,  0.03802643,  0.53005146,
         0.10078505,  0.32233573, -0.49414574,  0.49855606,  0.64133787,
         0.59530286,  0.99445369,  0.72433078,  0.67011592,  0.57150599,
         0.83342374]])

In [12]:
# Flatten each patient's summary array into a 1-D vector, stack the vectors
# into a table indexed by patient, and export it as CSV.
reshaped_gmm_features = {
    patient_key: summary.reshape(-1)
    for patient_key, summary in gmm_features.items()
}
df = pd.DataFrame.from_dict(reshaped_gmm_features, orient='index')
df.to_csv('../CSV/features_gmm3.csv', index=True, header=True)

In [40]:
# Merge the extracted features with the clinical data on the patient ID.
clinical_data = pd.read_csv(
    '../CSV/clinical_data_with_no_missing_values.csv'
).set_index('patient_id')

# The exported feature table stores the patient ID in its first, unlabeled
# column. Read that column directly as the index instead of relying on the
# 'Unnamed: 0' placeholder name that pandas generates for it.
gmm_table = pd.read_csv('../CSV/features_gmm3.csv', index_col=0)

# DataFrame.join defaults to a left join on the index: every clinical row is
# kept, and patients without extracted features get NaN feature columns.
features1 = clinical_data.join(gmm_table)
features1.to_csv('../CSV/features_gmm3_with_clinical_data.csv', index=True, header=True)