In [None]:
import sys
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, dbscan

import hdbscan

%matplotlib inline

In [None]:

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_log(x):
    """Apply ``sigmoid`` to the natural logarithm of ``x``."""
    log_x = np.log(x)
    return sigmoid(log_x)

In [None]:
# Switch to the project directory so that the relative data paths below resolve.
project_root = "L:/lab_research/RES-Folder-UPOD/Celldynclustering"
os.chdir(project_root)

# Load the celldyn SAS dataset into a DataFrame.
celldyn_path = "E_ResearchData/2_ResearchData/celldyn.sas7bdat"
celldyn = pd.read_sas(celldyn_path)

In [None]:
# Store the calendar year of `afname_dt` in a new column.
# NOTE(review): `afname_dt` is presumably the sample-collection timestamp — confirm.
afname_timestamps = pd.to_datetime(celldyn['afname_dt'])
celldyn['afnamejaar'] = afname_timestamps.dt.year

In [78]:
# Group the column names by the name fragment they contain.
all_columns = list(celldyn.columns)
meas_columns = [name for name in all_columns if 'c_b' in name]
mode_columns = [name for name in all_columns if 'c_m' in name]
# The 'alrt' and 'c_s_' fragments are matched case-insensitively.
alrt_columns = [name for name in all_columns if 'alrt' in name.lower()]
c_s_columns = [name for name in all_columns if 'c_s_' in name.lower()]

In [79]:
# Per row: is at least one of the c_s_ columns equal to 0?
c_s_is_zero = celldyn[c_s_columns] == 0
c_s_is_zero.any(axis=1)

0           True
1           True
2           True
3           True
4          False
           ...  
3107031    False
3107032    False
3107033    False
3107034     True
3107035     True
Length: 3099542, dtype: bool

In [80]:
# Show the names of the last three c_s_ columns.
c_s_columns[-3:]

['c_s_retc', 'c_s_pretc', 'c_s_irf']

In [87]:
# Print, for every c_s_ column, the number of rows in which it equals 0.
for col_name in c_s_columns:
    zero_mask = celldyn[col_name] == 0
    print(col_name, int(zero_mask.sum()))

c_s_wbc 152
c_s_wvf 152
c_s_neu 152
c_s_seg 152
c_s_bnd 152
c_s_ig 152
c_s_lym 152
c_s_lyme 152
c_s_vlym 152
c_s_mon 152
c_s_mone 152
c_s_blst 152
c_s_eos 152
c_s_bas 152
c_s_nrbc 152
c_s_pneu 152
c_s_pseg 152
c_s_pbnd 152
c_s_pig 152
c_s_plym 152
c_s_plyme 152
c_s_pvlym 152
c_s_pmon 152
c_s_pmone 152
c_s_pblst 152
c_s_peos 152
c_s_pbas 152
c_s_pnrbc 152
c_s_rbci 152
c_s_rbco 152
c_s_hb 152
c_s_mcv 152
c_s_rdw 152
c_s_mch 152
c_s_mchc 152
c_s_ht 152
c_s_plt 155
c_s_plto 152
c_s_plti 152
c_s_mpv 152
c_s_pct 152
c_s_pdw 152
c_s_retc 613135
c_s_pretc 613135
c_s_irf 613135


In [82]:
# Grand total of all c_s_ values over the rows where c_s_rdw equals 0.
rdw_is_zero = celldyn['c_s_rdw'] == 0
celldyn.loc[rdw_is_zero, c_s_columns].sum(axis=1).sum()

0.0

In [234]:
# Rows in which at least one c_s_ column equals -1, restricted to the c_s_ columns.
has_minus_one = (celldyn[c_s_columns] == -1).any(axis=1)
celldyn.loc[has_minus_one, c_s_columns]

Unnamed: 0,c_s_wbc,c_s_wvf,c_s_neu,c_s_seg,c_s_bnd,c_s_ig,c_s_lym,c_s_lyme,c_s_vlym,c_s_mon,...,c_s_ht,c_s_plt,c_s_plto,c_s_plti,c_s_mpv,c_s_pct,c_s_pdw,c_s_retc,c_s_pretc,c_s_irf


In [84]:
# Keep only the c_s_ and alert columns and drop every row with a missing value.
# NOTE: this rebinds `celldyn`; all other columns are gone from this point on.
cluster_columns = c_s_columns + alrt_columns
celldyn = celldyn[cluster_columns].dropna()

# Draw a 250000-row subsample for the PCA / clustering cells further down.
# The fixed seed makes the draw (and everything derived from it) reproducible
# after a kernel restart.
clust_data = celldyn.sample(250000, random_state=0)

### Try association rule mining (ARM)

In [86]:
# Hand-picked subset of the c_s_ columns.
fin_c_s_cols = [
    'c_s_wbc', 'c_s_wvf', 'c_s_neu', 'c_s_seg', 'c_s_lym', 'c_s_vlym',
    'c_s_mon', 'c_s_mone', 'c_s_blst', 'c_s_eos', 'c_s_bas', 'c_s_nrbc',
    'c_s_rbci', 'c_s_rbco', 'c_s_hb', 'c_s_mcv', 'c_s_rdw', 'c_s_mch',
    'c_s_mchc', 'c_s_ht', 'c_s_plt', 'c_s_plto', 'c_s_plti', 'c_s_mpv',
    'c_s_pct', 'c_s_pdw', 'c_s_retc', 'c_s_pretc', 'c_s_irf',
]

In [165]:
# Read the row labels used for the association-rule mining: one integer per line.
# Iterating over the file and skipping blank lines does not depend on the file
# ending in exactly one newline. Splitting on "\n" and discarding the last piece
# silently loses the final index when there is no trailing newline, and fails on
# int('') when there is a blank line.
with open("E_ResearchData/2_ResearchData/arm_index_to_take.txt", 'r') as f:
    index_to_take = [int(line) for line in f if line.strip()]

### c_s_ != 1


Bekijk associaties tussen alert en suspect vlaggen als ze niet OK zijn (c_s_ != 1, alrt == 1)

In [224]:
# Association rules between suspect flags that are not OK (c_s_ != 1) and raised alerts (alrt == 1).
# NOTE(review): `apriori` and `association_rules` are not imported in the cells shown here —
# presumably they come from mlxtend.frequent_patterns; confirm and add the import to the import cell.
arm_data = celldyn.loc[index_to_take]
arm_data[alrt_columns] = arm_data[alrt_columns] == 1
arm_data[c_s_columns] = arm_data[c_s_columns] != 1

flag_columns = c_s_columns + alrt_columns
frequent_candidates = apriori(arm_data[flag_columns], min_support=0.05, use_colnames=True, low_memory=True)
rules = association_rules(frequent_candidates, metric="lift", min_threshold=1)

# Keep only the rules with a confidence of exactly 1.
interesting_rules = rules[rules['confidence'] == 1]
c_s_isno1 = {
    tuple(antecedent): tuple(consequent)
    for antecedent, consequent in zip(interesting_rules.antecedents, interesting_rules.consequents)
}
c_s_isno1

{('c_s_pneu',): ('c_s_neu',),
 ('c_Alrt_bnd',): ('c_s_neu', 'c_s_pneu'),
 ('c_Alrt_ig',): ('c_s_neu', 'c_s_pneu'),
 ('c_s_plym',): ('c_s_lym',),
 ('c_Alrt_vlym',): ('c_s_lym', 'c_s_plym'),
 ('c_s_pmon',): ('c_s_mon',),
 ('c_Alrt_blst',): ('c_s_mon', 'c_s_pmon'),
 ('c_s_pnrbc',): ('c_s_nrbc',),
 ('c_s_rbci',): ('c_s_retc',),
 ('c_s_irf',): ('c_s_pretc',),
 ('c_s_mon', 'c_s_pneu'): ('c_s_neu', 'c_s_pmon'),
 ('c_s_neu', 'c_s_pmon'): ('c_s_mon', 'c_s_pneu'),
 ('c_s_pneu', 'c_s_pmon'): ('c_s_mon', 'c_s_neu'),
 ('c_s_retc', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_pretc', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_pneu', 'c_s_irf'): ('c_s_neu', 'c_s_pretc'),
 ('c_s_neu', 'c_Alrt_bnd'): ('c_s_pneu',),
 ('c_Alrt_bnd', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_neu', 'c_Alrt_ig'): ('c_s_pneu',),
 ('c_Alrt_ig', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_neu', 'c_s_irf'): ('c_s_pretc',),
 ('c_Alrt_bnd', 'c_Alrt_ig'): ('c_s_neu', 'c_s_pneu'),
 ('c_s_mon', 'c_s_plym'): ('c_s_lym', 'c_s_pmon'),
 ('c_s_lym', 'c_s_pmon'): ('c_s_mon', 

### c_s_ = 2

Check als c_s_ kolommen 2 geven, dit ook gelijk 2 is bij andere c_s_ kolommen

In [219]:
# Association rules between c_s_ columns equal to 2 and raised alerts (alrt == 1).
# NOTE(review): `apriori` and `association_rules` are not imported in the cells shown here —
# presumably they come from mlxtend.frequent_patterns; confirm and add the import to the import cell.
arm_data = celldyn.loc[index_to_take]
arm_data[alrt_columns] = arm_data[alrt_columns] == 1
arm_data[c_s_columns] = arm_data[c_s_columns] == 2

flag_columns = c_s_columns + alrt_columns
frequent_candidates = apriori(arm_data[flag_columns], min_support=0.05, use_colnames=True, low_memory=True)
rules = association_rules(frequent_candidates, metric="lift", min_threshold=1)

# Keep only the rules with a confidence of exactly 1.
interesting_rules = rules[rules['confidence'] == 1]
c_s_is2 = {
    tuple(antecedent): tuple(consequent)
    for antecedent, consequent in zip(interesting_rules.antecedents, interesting_rules.consequents)
}
c_s_is2

{}

### c_s_ = 4

Check als c_s_ kolommen 4 geven, dit ook gelijk 4 is bij andere c_s_ kolommen

In [222]:
# Association rules between c_s_ columns equal to 4 and raised alerts (alrt == 1).
# NOTE(review): `apriori` and `association_rules` are not imported in the cells shown here —
# presumably they come from mlxtend.frequent_patterns; confirm and add the import to the import cell.
arm_data = celldyn.loc[index_to_take]
arm_data[alrt_columns] = arm_data[alrt_columns] == 1
arm_data[c_s_columns] = arm_data[c_s_columns] == 4

flag_columns = c_s_columns + alrt_columns
frequent_candidates = apriori(arm_data[flag_columns], min_support=0.05, use_colnames=True, low_memory=True)
rules = association_rules(frequent_candidates, metric="lift", min_threshold=1)

# Keep only the rules with a confidence of exactly 1.
interesting_rules = rules[rules['confidence'] == 1]
c_s_is4 = {
    tuple(antecedent): tuple(consequent)
    for antecedent, consequent in zip(interesting_rules.antecedents, interesting_rules.consequents)
}
c_s_is4

{}

### c_s_ = 5

Check als c_s_ kolommen 5 geven, dit ook gelijk 5 is bij andere c_s_ kolommen

In [223]:
# Association rules between c_s_ columns equal to 5 and raised alerts (alrt == 1).
# NOTE(review): `apriori` and `association_rules` are not imported in the cells shown here —
# presumably they come from mlxtend.frequent_patterns; confirm and add the import to the import cell.
arm_data = celldyn.loc[index_to_take]
arm_data[alrt_columns] = arm_data[alrt_columns] == 1
arm_data[c_s_columns] = arm_data[c_s_columns] == 5

flag_columns = c_s_columns + alrt_columns
frequent_candidates = apriori(arm_data[flag_columns], min_support=0.05, use_colnames=True, low_memory=True)
rules = association_rules(frequent_candidates, metric="lift", min_threshold=1)

# Keep only the rules with a confidence of exactly 1.
interesting_rules = rules[rules['confidence'] == 1]
c_s_is5 = {
    tuple(antecedent): tuple(consequent)
    for antecedent, consequent in zip(interesting_rules.antecedents, interesting_rules.consequents)
}
c_s_is5

{('c_s_neu',): ('c_s_pneu',),
 ('c_s_pneu',): ('c_s_neu',),
 ('c_s_mon',): ('c_Alrt_blst', 'c_s_pmon'),
 ('c_s_pmon',): ('c_s_mon', 'c_Alrt_blst'),
 ('c_s_neu', 'c_Alrt_bnd'): ('c_s_pneu',),
 ('c_Alrt_bnd', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_neu', 'c_Alrt_ig'): ('c_s_pneu',),
 ('c_Alrt_ig', 'c_s_pneu'): ('c_s_neu',),
 ('c_s_mon', 'c_Alrt_blst'): ('c_s_pmon',),
 ('c_s_mon', 'c_s_pmon'): ('c_Alrt_blst',),
 ('c_Alrt_blst', 'c_s_pmon'): ('c_s_mon',),
 ('c_s_neu', 'c_Alrt_bnd', 'c_Alrt_ig'): ('c_s_pneu',),
 ('c_Alrt_bnd', 'c_Alrt_ig', 'c_s_pneu'): ('c_s_neu',)}

### Try permutations of c_s_ ∈ {2, 4, 5}
Set one column to one of the values 2, 4 or 5, and see whether the other columns, which are not on that value, combine with it.
So: if you put one column on 2, check whether the other columns are always 4 or 5, and do this the other way around as well.

In [230]:
# For every c_s_ column in turn: flag that column where it equals 5 and every other c_s_
# column where it equals 4, mine the rules with a confidence of exactly 1, and print those
# whose antecedent was not already found in the "== 4" (c_s_is4) or "== 5" (c_s_is5) runs.
# NOTE(review): `apriori` and `association_rules` are not imported in the cells shown here —
# presumably they come from mlxtend.frequent_patterns; confirm and add the import to the import cell.

# Antecedents are frozensets. The element order of tuple(frozenset) is not guaranteed, so two
# equal item sets can yield different tuples; compare them as sets instead of as tuples.
known_antecedents = {frozenset(key) for key in c_s_is4} | {frozenset(key) for key in c_s_is5}

# Loop-invariant preparation: select the rows and binarise the alert columns only once.
base_data = celldyn.loc[index_to_take]
base_data[alrt_columns] = base_data[alrt_columns] == 1
flag_columns = c_s_columns + alrt_columns

for col in c_s_columns:
    arm_data = base_data.copy()
    arm_data[col] = arm_data[col] == 5
    other_cols = [c for c in c_s_columns if c != col]
    arm_data[other_cols] = arm_data[other_cols] == 4
    frequent_candidates = apriori(arm_data[flag_columns], min_support=0.05, use_colnames=True, low_memory=True)
    rules = association_rules(frequent_candidates, metric="lift", min_threshold=1)
    interesting_rules = rules[rules['confidence'] == 1]
    new_rules = {
        tuple(antecedent): tuple(consequent)
        for antecedent, consequent in zip(interesting_rules.antecedents, interesting_rules.consequents)
        if frozenset(antecedent) not in known_antecedents
    }
    print(col, new_rules)

c_s_wbc {}
c_s_wvf {}
c_s_neu {}
c_s_seg {}
c_s_bnd {}
c_s_ig {}
c_s_lym {}
c_s_lyme {}
c_s_vlym {}
c_s_mon {}
c_s_mone {}
c_s_blst {}
c_s_eos {}
c_s_bas {}
c_s_nrbc {}
c_s_pneu {}
c_s_pseg {}
c_s_pbnd {}
c_s_pig {}
c_s_plym {}
c_s_plyme {}
c_s_pvlym {}
c_s_pmon {}
c_s_pmone {}
c_s_pblst {}
c_s_peos {}
c_s_pbas {}
c_s_pnrbc {}
c_s_rbci {}
c_s_rbco {}
c_s_hb {}
c_s_mcv {}
c_s_rdw {}
c_s_mch {}
c_s_mchc {}
c_s_ht {}
c_s_plt {}
c_s_plto {}
c_s_plti {}
c_s_mpv {}
c_s_pct {}
c_s_pdw {}
c_s_retc {}
c_s_pretc {}
c_s_irf {}


### try PCA

In [None]:
# Fit a PCA for 2..19 components and collect `explained_variance_` of each fit.
# The input frame does not change between iterations, so it is built once up front
# instead of re-running the column selection and dropna on every pass.
pca_sweep_input = celldyn[cluster_columns].dropna()
var_ex = []
for n_components in range(2, 20):
    pca = PCA(n_components=n_components)
    pca.fit(pca_sweep_input)
    var_ex.append(pca.explained_variance_)
    
    


In [None]:
# Project the complete rows of the clustering subsample onto 8 principal components.
pca = PCA(n_components=8)
clust_complete = clust_data.dropna()
transformed_data = pca.fit_transform(clust_complete)

In [None]:
# Cluster the PCA-projected subsample into 3 groups.
# The fixed seed makes the centroid initialisation, and therefore the labels,
# reproducible after a kernel restart.
km = KMeans(n_clusters=3, random_state=0).fit(transformed_data)

In [None]:
# Attach the k-means labels to `celldyn`.
# `km` was fitted on the PCA projection of `clust_data.dropna()`, a subsample of `celldyn`,
# so `km.labels_` holds one label per sampled row. Assigning that bare array to the full frame
# fails on the length mismatch (or, if the lengths happened to match, would pair labels with
# the wrong rows, because `sample` shuffles the order). Wrapping the labels in a Series indexed
# like the subsample lets pandas align each label with its own row; rows outside the subsample
# get NaN.
# Restricting dropna to `cluster_columns` keeps this cell re-runnable: a plain dropna() would
# discard every unlabelled row on a second run.
celldyn = celldyn.dropna(subset=cluster_columns)
sampled_index = clust_data.dropna().index
celldyn['km_labels'] = pd.Series(km.labels_, index=sampled_index)

### check clusters after pca

In [None]:
# One row per subsample point: its 8 PCA coordinates plus a cluster label.
# NOTE(review): `hdb_cl` is not defined in any cell shown here — presumably an HDBSCAN model
# fitted in a cell that has since been removed; confirm before running on a fresh kernel.
component_names = [f'd_{i}' for i in range(8)]
plot_df = pd.DataFrame(data=transformed_data, columns=component_names, index=clust_data.index)
plot_df['labels'] = hdb_cl.labels_
#reduce_dim = min([6, reduce_dim])


In [None]:
# c_s_ columns to show in the pairwise scatter plots.
cols_to_plot = [
    'c_s_wbc', 'c_s_wvf', 'c_s_neu', 'c_s_seg', 'c_s_lym', 'c_s_vlym',
    'c_s_mon', 'c_s_mone', 'c_s_blst', 'c_s_eos', 'c_s_bas', 'c_s_nrbc',
    'c_s_rbci', 'c_s_rbco', 'c_s_hb', 'c_s_mcv', 'c_s_rdw', 'c_s_mch',
    'c_s_mchc', 'c_s_ht', 'c_s_plt', 'c_s_plto', 'c_s_plti', 'c_s_mpv',
    'c_s_pct', 'c_s_pdw', 'c_s_retc', 'c_s_pretc', 'c_s_irf',
]

In [None]:
# Allow up to 50 columns in rendered frames, then show the correlation matrix
# of the c_s_ columns in the clustering subsample.
pd.options.display.max_columns = 50
clust_data.loc[:, c_s_columns].corr()

In [None]:
# Number of columns selected for the pairwise scatter plots.
len(cols_to_plot)

In [None]:
# Scatter every unordered pair of `cols_to_plot` against each other, coloured by 'labels'.
#
# Design notes:
#   * The pairs are built explicitly as (column k, every later column). Slicing a running
#     "rest" list both before and inside the loop starts the y-columns two positions after
#     the x-column, so neighbouring pairs such as (cols_to_plot[0], cols_to_plot[1]) would
#     never be drawn.
#   * The grid is sized for the n*(n-1)/2 pairs that exist, not for n*n panels.
#   * One subsample is drawn once and shared by all panels, so the panels show the same
#     points and the frame is not re-sampled for every panel.
# NOTE(review): `clust_data` needs a 'labels' column for `hue`; no cell shown here adds
# one — confirm where it comes from before running on a fresh kernel.
n_grid_cols = 3
column_pairs = [
    (x_col, y_col)
    for pos, x_col in enumerate(cols_to_plot)
    for y_col in cols_to_plot[pos + 1:]
]
num_rows = max(1, int(np.ceil(len(column_pairs) / n_grid_cols)))

# squeeze=False always returns a 2-D array of axes, so a single-row grid needs no special case.
fig, ax = plt.subplots(ncols=n_grid_cols, nrows=num_rows, figsize=(18, 3 * num_rows), squeeze=False)
plot_sample = clust_data.sample(min(20000, len(clust_data)))

for k, (x_col, y_col) in enumerate(column_pairs):
    i, j = divmod(k, n_grid_cols)
    sns.scatterplot(data=plot_sample, x=x_col, y=y_col, ax=ax[i, j], hue='labels', alpha=0.5)

# Hide the panels of the last grid row that received no plot.
for unused_ax in ax.ravel()[len(column_pairs):]:
    unused_ax.set_visible(False)
        