# Import libraries

In [8]:
# Import standard libraries
import itertools
import os

# Import general libraries
import numpy as np
import pandas as pd

# Import libraries to connect with the database
import psycopg2 as ps
import pandas.io.sql as sqlio

# Import matplotlib libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Import cluster algorithms libraries
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

# Import Filter libraries
from sklearn.feature_selection import VarianceThreshold

# Import libraries for SVD entropy : univariate selection
import antropy as ant

# Import Tools script
%run ../Tools.ipynb

# Connect to database

In [9]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook; each fallback is the value that was previously
# hard-coded here, so behaviour is unchanged when the variables are not set.
conn = ps.connect(
    dbname=os.environ.get("SLA_DB_NAME", "SLA"),
    user=os.environ.get("SLA_DB_USER", "postgres"),
    password=os.environ.get("SLA_DB_PASSWORD", "root"),
    host=os.environ.get("SLA_DB_HOST", "localhost"),
    port=os.environ.get("SLA_DB_PORT", "5432"),
)

# Get data

In [10]:
# Columns to evaluate: one WEIGHT / ALS / CVF "_VAR_" column for each of the
# M1, M3, M6, M9 and M12 suffixes, grouped by suffix.
features = [
    f'{prefix}_VAR_M{suffix}'
    for suffix in (1, 3, 6, 9, 12)
    for prefix in ('WEIGHT', 'ALS', 'CVF')
]

# Query returning every column of the PATIENTS table
sql = 'SELECT * FROM "PATIENTS"'

# Run the query and keep an independent copy of only the columns under study
df_patients = sqlio.read_sql_query(sql, conn).loc[:, features].copy()
df_patients.shape

(1045, 15)

# Scale data

In [11]:
# Min-max scale every column: subtract the column minimum, then divide by the
# column range (max - min). Missing values stay missing.
X_scaled = df_patients.sub(df_patients.min()).div(df_patients.max() - df_patients.min())

# Remove features with few values

In [7]:
# Minimum number of non-missing records a feature must have to be kept
n = 20

# Drop, in one pass, every feature whose non-missing record count is below n
X_scaled = X_scaled.drop(
    columns=[feature for feature in features if X_scaled[feature].count() < n]
)

# The remaining columns become the current feature list
features = X_scaled.columns.values.tolist()
# Report the resulting shape
print('New Shape :', X_scaled.shape)

New Shape : (1045, 15)


# Remove features with low variance

In [153]:
# Variance below which a feature is treated as quasi-constant
threshold = 0.015
# Build the filter and fit it on the scaled data (fit returns the filter itself)
qconstant_filter = VarianceThreshold(threshold=threshold).fit(X_scaled)

# Boolean mask over the columns: True where the feature passes the threshold
kept_mask = qconstant_filter.get_support()
# Features rejected by the filter
qconstant_columns = X_scaled.columns[~kept_mask].tolist()
# Features that remain to be evaluated, in their original column order
features = X_scaled.columns[kept_mask].tolist()

# Apply the filter and rebuild a dataframe labelled with the kept features
X_scaled_arr = qconstant_filter.transform(X_scaled)
X_scaled = pd.DataFrame(X_scaled_arr, columns=features)

# Report the resulting shape
print('New Shape :', X_scaled.shape)

New Shape : (1000, 11)


# Generate all possible subsets of features

In [12]:
# n is an exclusive upper bound: subsets contain between 1 and n - 1 features
n = 4
# Every combination of 1 .. n - 1 features, enumerated by increasing subset size
features_subset = [
    list(combo)
    for size in range(1, n)
    for combo in itertools.combinations(features, size)
]

# Feature selection with KMedoids

In [15]:
# Score every feature subset by clustering it with KMedoids and measuring the
# silhouette of the resulting partition. The outcome is df_kmedoids_results,
# with one row per scored subset: the subset, its silhouette score and the
# number of complete rows (patients) that were clustered.

# Number of clusters to look for
n_clusters = 2
# Distance metric shared by the clustering and by the silhouette score
metric = "manhattan"

# Constant helper axis paired with single-feature subsets; a column of zeros
# adds nothing to a Manhattan distance. It is set once here instead of being
# rewritten on every single-feature iteration.
X_scaled['y'] = 0.0

# Collected rows; the results dataframe is built once after the loop instead
# of being grown one row at a time.
kmedoids_records = []

# For each features subset
for subset in features_subset:
    # A single feature is evaluated together with the constant 'y' axis
    variables = [subset[0], 'y'] if len(subset) == 1 else subset

    # Complete rows only; dropna returns a new frame, so X_scaled is not modified
    subset_data = X_scaled[variables].dropna()

    # The silhouette score needs more samples than clusters, so smaller
    # subsets can never produce a result and are skipped before fitting.
    if len(subset_data) <= n_clusters:
        continue

    # Cluster once and reuse the labels for both the check and the score
    kmedoids = KMedoids(n_clusters=n_clusters, metric=metric)
    labels = kmedoids.fit_predict(subset_data)

    # Only score partitions that actually use every requested cluster
    if len(np.unique(labels)) != n_clusters:
        continue

    score = silhouette_score(subset_data, labels, metric=metric)
    kmedoids_records.append({
        'Features Subset': subset,
        'Score': score,
        'Patients': float(len(subset_data)),
    })

# Dataframe of silhouette scores, one row per scored feature subset
df_kmedoids_results = pd.DataFrame(
    kmedoids_records, columns=['Features Subset', 'Score', 'Patients']
)



In [16]:
# The 50 feature subsets with the highest silhouette score, best first
res = df_kmedoids_results.nlargest(n=50, columns='Score')
print(res)

                                    Features Subset     Score  Patients
72                      [WEIGHT_VAR_M6, ALS_VAR_M9]  0.611125     176.0
75                     [WEIGHT_VAR_M6, ALS_VAR_M12]  0.605228     111.0
4                                      [ALS_VAR_M9]  0.590123     291.0
3                                   [WEIGHT_VAR_M9]  0.571023     270.0
5                                      [CVF_VAR_M9]  0.569188     163.0
73                      [WEIGHT_VAR_M6, CVF_VAR_M9]  0.567623      90.0
493        [WEIGHT_VAR_M6, ALS_VAR_M9, ALS_VAR_M12]  0.564741      88.0
330           [CVF_VAR_M1, ALS_VAR_M9, CVF_VAR_M12]  0.559928      10.0
0                                   [WEIGHT_VAR_M3]  0.558733     382.0
97                        [ALS_VAR_M9, ALS_VAR_M12]  0.548428     148.0
2                                      [CVF_VAR_M6]  0.547131     200.0
79                         [ALS_VAR_M6, ALS_VAR_M9]  0.544394     184.0
82                        [ALS_VAR_M6, ALS_VAR_M12]  0.544087   

In [17]:
# The 50 feature subsets with the lowest silhouette score, worst first
res = df_kmedoids_results.nsmallest(n=50, columns='Score')
# Of those 50 rows, show only the ones whose score is not negative
print(res.loc[res['Score'] >= 0])

                                  Features Subset     Score  Patients
307      [CVF_VAR_M1, WEIGHT_VAR_M6, ALS_VAR_M12]  0.015170      10.0
370      [WEIGHT_VAR_M3, ALS_VAR_M6, CVF_VAR_M12]  0.081818      32.0
314         [CVF_VAR_M1, ALS_VAR_M6, ALS_VAR_M12]  0.084057       9.0
275       [CVF_VAR_M1, WEIGHT_VAR_M3, ALS_VAR_M6]  0.085951       8.0
261      [ALS_VAR_M1, WEIGHT_VAR_M9, CVF_VAR_M12]  0.091929      48.0
113       [WEIGHT_VAR_M1, ALS_VAR_M1, CVF_VAR_M9]  0.094198      85.0
457         [CVF_VAR_M3, CVF_VAR_M6, CVF_VAR_M12]  0.095413      21.0
320         [CVF_VAR_M1, CVF_VAR_M6, ALS_VAR_M12]  0.116531       9.0
297          [CVF_VAR_M1, CVF_VAR_M3, ALS_VAR_M9]  0.119341       4.0
289          [CVF_VAR_M1, ALS_VAR_M3, CVF_VAR_M9]  0.123428       3.0
168       [WEIGHT_VAR_M1, ALS_VAR_M6, ALS_VAR_M9]  0.124602     111.0
126      [WEIGHT_VAR_M1, CVF_VAR_M1, ALS_VAR_M12]  0.127538      24.0
283          [CVF_VAR_M1, ALS_VAR_M3, CVF_VAR_M3]  0.129066      11.0
285          [CVF_VA