## TriHSPAM
### Experiments with ALS dataset
with Strict Alignments Option

##### 0. Preprocessing & Loading Functions

In [1]:
import numpy as np
import csv
import pandas as pd
# Helper that maps any float value (e.g. float('nan')) to np.nan
def convert_nan(arr):
    """Return ``np.nan`` for a float input and the input itself otherwise.

    The check is on the type only: *every* ``float`` instance (not just NaN)
    is replaced by ``np.nan``. Non-float values (strings, ints, ``None``, ...)
    are returned unchanged.
    """
    return np.nan if isinstance(arr, float) else arr
def read_transactions_csv(file_path):
    """Load a long-format CSV of per-subject records into a 3D object array.

    Rows are grouped by the value of their ``'id'`` column, in order of first
    appearance; within a group the file order is kept. Every other header
    column is treated as a feature.

    Parameters
    ----------
    file_path : str or path-like
        CSV file whose header row contains an ``'id'`` column.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_features, n_ids, max_rows_per_id)``.
        Cells hold the raw CSV strings. Empty or missing cells, and the
        trailing slots of ids that have fewer rows than the maximum, hold
        ``np.nan``.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    KeyError
        If the rows have no ``'id'`` field.
    """
    visits = {}
    # The csv module requires files to be opened with newline='' so that
    # quoted fields containing line breaks are parsed correctly.
    with open(file_path, 'r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            visits.setdefault(row['id'], []).append(row)
        header = csv_reader.fieldnames or []

    if not visits:
        raise ValueError(f"no data rows found in {file_path!r}")

    # Take the feature columns from the header (order kept, duplicates
    # collapsed the same way DictReader collapses them) so that a ragged row
    # with extra fields cannot change the expected row width.
    feature_names = [name for name in dict.fromkeys(header) if name != 'id']
    max_transactions = max(len(rows) for rows in visits.values())

    # Pre-filling with NaN makes the padding of short groups implicit.
    transactions_array = np.full(
        (max_transactions, len(visits), len(feature_names)),
        np.nan,
        dtype=object,
    )

    for i, client_transactions in enumerate(visits.values()):
        for j, transaction in enumerate(client_transactions):
            raw_values = (transaction.get(name) for name in feature_names)
            transactions_array[j, i, :] = [
                val if val is not None and val != "" else np.nan
                for val in raw_values
            ]

    # Reorder the axes from (row position, id, feature) to
    # (feature, id, row position).
    return np.transpose(transactions_array, (2, 1, 0))

In [2]:
# Build the 3D object array from the ALS snapshots CSV. read_transactions_csv
# returns the axes ordered as (feature, id, per-id row position); the shape
# recorded below for this file is (15, 1715, 20).
transactions_3d_array = read_transactions_csv("realData/ALS_data/ALS_snapshots_modified_subscores.csv")
print(transactions_3d_array.shape)

(15, 1715, 20)


##### 1. Triclustering with TriHSPAM

In [3]:
from TriHSPAM import TriHSPAM

# Feature indices handed to TriHSPAM as symbolic vs. numeric. Index 13 and 14
# are symbolic, 1-4 and 7-12 numeric.
# NOTE(review): presumably these index the first axis of the array returned
# by read_transactions_csv (the non-'id' CSV columns) -- confirm.
symbolic_idx = [0,5,6,13,14]
numeric_idx = [1,2,3,4,7,8,9,10,11,12]

# NOTE(review): the meaning of the remaining options is defined by the
# TriHSPAM class, which is not visible here. time_relaxed=False appears to be
# the "Strict Alignments" setting named in the notebook title -- confirm.
triclustering = TriHSPAM(
    symb_features_idx=symbolic_idx,
    num_features_idx=numeric_idx,
    min_I=170,
    min_J=2,
    min_K=2,
    disc_method="eq_width",
    n_bins=10,
    time_as_cols=True,
    time_relaxed=False,
    spm_algo='fournier08closed',
)

In [4]:
# Run TriHSPAM on the full 3D array. The recorded output below reports a
# total time of ~495882 ms and 3185 frequent sequences for this run.
triclustering.fit(transactions_3d_array)

>/Users/diogosoares/Documents/PhD Work/TriHSPAM/src/spmf_vd.jar
 Total time ~ 495882 ms
 Frequent sequences count : 3185



<TriHSPAM.TriHSPAM at 0x10c8b3e50>

In [5]:
# Collect the triclusters found by fit(). Later cells index each entry as
# tric[0], tric[1], tric[2]; tric[1] is used to index the CSV feature columns.
# NOTE(review): presumably (subject indices, feature indices, time indices) -- confirm.
trics = triclustering.triclusters_()

In [6]:
# Number of triclusters returned by triclusters_().
len(trics)

2718

In [7]:
# Shape after swapping the first two axes (id axis first); this is the
# layout passed to the significance test in the next section.
transactions_3d_array.transpose((1,0,2)).shape

(1715, 15, 20)

##### 2. Assessing Statistical Significance

In [9]:
import copy
from sigStats import Tricluster, significance

# Feature indices passed to sigStats as Y_cat: the symbolic indices given to
# TriHSPAM (0, 5, 6, 13, 14) plus 9-12.
# NOTE(review): presumably the features to be treated as categorical -- confirm.
Y_cat = [0,5,6,13,14,9,10,11,12]

# Independent deep copy of the data with the first two axes swapped, so the
# id axis comes first.
data_f = copy.deepcopy(transactions_3d_array.transpose((1,0,2)))

# Wrap every mined tricluster in a sigStats Tricluster; the first index set
# is sorted before being handed over.
lst_trics = [
    Tricluster(data_f, Y_cat, sorted(tric[0]), tric[1], tric[2])
    for tric in trics
]

# NOTE(review): the meaning of the final 0.1 argument is defined by
# sigStats.significance, which is not visible here -- confirm.
p_values = significance(data_f, Y_cat, lst_trics, 0.1)

In [10]:
# Keep only the triclusters whose p-value is below the 0.001 cut-off.
# TricID is the position of the p-value in `p_values`; later cells use it to
# index `trics`, so the two sequences are assumed to share the same order.
sign_stats = [
    {'TricID': t_i, 'p-value': p_val}
    for t_i, p_val in enumerate(p_values)
    if p_val < 0.001
]

# Name the columns explicitly: without them an empty selection produces a
# frame with no columns, and the later sort_values('p-value') calls raise
# KeyError.
df_stat = pd.DataFrame(sign_stats, columns=['TricID', 'p-value'])

# NOTE(review): the file name says "relaxed", but this notebook runs TriHSPAM
# with time_relaxed=False ("Strict Alignments"). Confirm the intended name so
# this export does not overwrite the results of a relaxed run.
df_stat.to_csv("signStats_als_relaxed.csv", index=False)

In [11]:
# Show the ten retained triclusters with the smallest p-values.
df_stat.sort_values('p-value').head(10)

Unnamed: 0,TricID,p-value
2464,2464,7.120236e-306
2053,2053,7.899012e-306
1918,1918,8.299525e-306
2548,2548,1.386221e-305
2002,2002,1.466324e-305
941,941,1.6554549999999998e-305
988,988,1.682156e-305
388,388,1.726657e-305
435,435,1.753358e-305
2499,2499,1.831236e-305


##### 3. Patterns Representation

In [12]:
from visualize_patterns import compute_representative_patterns

df = pd.read_csv("realData/ALS_data/ALS_snapshots_modified_subscores.csv", sep=',')

# Feature indices grouped by the statistic requested for each of them from
# compute_representative_patterns (mode / mean / median).
data_modes = [0,5,6,13,14]
data_means = [1,2,3,4,7,8]
data_medians = [9,10,11,12]

# Column labels with the first CSV column dropped, so that feature index k
# maps to feature_names[k].
# NOTE(review): this assumes the first column is 'id' -- confirm.
feature_names = df.columns[1:]

# IDs of the ten retained triclusters with the smallest p-values.
top_tric_ids = df_stat.sort_values('p-value').head(10)['TricID']

for t_id in top_tric_ids:
    tric_dims = trics[t_id]
    feature_idx = tric_dims[1]

    # Summary line: size of the first dimension, feature names, third dimension.
    print(len(tric_dims[0]), list(feature_names[feature_idx]), tric_dims[2], sep='\t')

    patterns = compute_representative_patterns(
        triclustering.get_tricluster(t_id),
        mode_feats=[i for i in feature_idx if i in data_modes],
        mean_features=[i for i in feature_idx if i in data_means],
        median_features=[i for i in feature_idx if i in data_medians],
    )
    print(patterns)


1396	['slope', 'bulbar_subscore', 'respiratory_subscore']	[0, 1]
[[0.68, 12.0, 12.0], [0.68, 11.0, 12.0]]
1361	['height', 'slope']	[0, 1]
[[1.64, 0.67], [1.64, 0.67]]
1343	['height', 'slope', 'bulbar_subscore', 'respiratory_subscore']	[0, 1]
[[1.64, 0.67, 12.0, 12.0], [1.64, 0.67, 11.0, 12.0]]
1093	['slope', 'alsfrs_r_tot_score', 'bulbar_subscore', 'respiratory_subscore']	[0, 1]
[[0.53, 44.0, 12.0, 12.0], [0.53, 42.0, 11.0, 12.0]]
1057	['height', 'slope', 'alsfrs_r_tot_score', 'bulbar_subscore', 'respiratory_subscore']	[0, 1]
[[1.65, 0.53, 44.0, 12.0, 12.0], [1.65, 0.53, 42.0, 11.0, 12.0]]
972	['slope', 'Onset_form']	[0, 1]
[[0.68, 'onset_limbs'], [0.68, 'onset_limbs']]
960	['slope', 'bulbar_subscore', 'respiratory_subscore', 'Onset_form']	[0, 1]
[[0.68, 12.0, 12.0, 'onset_limbs'], [0.68, 12.0, 12.0, 'onset_limbs']]
940	['height', 'slope', 'Onset_form']	[0, 1]
[[1.65, 0.68, 'onset_limbs'], [1.65, 0.68, 'onset_limbs']]
928	['height', 'slope', 'bulbar_subscore', 'respiratory_subscore', '

  vals[i] = round(np.nanmean(converted_column),2)
  return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,
