## TriHSPAM

### Experiments with the MS dataset
using the strict alignments option (`time_relaxed=False`)

##### 0. Preprocessing & Loading Functions

In [1]:
import csv
import numpy as np

def read_transactions_csv(file_path):
    """Load a long-format visits CSV into a 3D object array.

    Rows are grouped by their ``patient_id`` value, patients being ordered by
    first appearance in the file; each row is one visit (transaction) of that
    patient, kept in file order.

    Args:
        file_path: Path to a CSV file whose header contains a ``patient_id``
            column. Every other column is treated as a feature.

    Returns:
        A numpy array of dtype ``object`` with shape
        ``(num_features, num_patients, max_visits)``. Cells hold the raw CSV
        strings; empty or missing cells, and the padding visits of patients
        with fewer than ``max_visits`` rows, hold ``np.nan``.

    Raises:
        ValueError: If the file contains no data rows.
        KeyError: If a data row has no ``patient_id`` entry.
    """
    visits = {}
    # newline='' leaves newline handling to the csv module (required by its
    # documentation so that newlines inside quoted fields survive intact).
    with open(file_path, 'r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            visits.setdefault(row['patient_id'], []).append(row)
        fieldnames = csv_reader.fieldnames or []

    if not visits:
        raise ValueError("no data rows found in {!r}".format(file_path))

    # Feature columns come from the header (duplicates collapsed, as DictReader
    # does), so the count does not depend on the shape of the first data row.
    feature_names = [name for name in dict.fromkeys(fieldnames) if name != 'patient_id']

    max_visits = max(len(patient_visits) for patient_visits in visits.values())

    # Pre-filling with NaN pads patients that have fewer visits than the maximum.
    transactions_array = np.full(
        (max_visits, len(visits), len(feature_names)), np.nan, dtype=object
    )

    for i, patient_visits in enumerate(visits.values()):
        for j, transaction in enumerate(patient_visits):
            transactions_array[j, i, :] = [
                np.nan if transaction[name] is None or transaction[name] == "" else transaction[name]
                for name in feature_names
            ]

    # Reorder axes from (visits, patients, features) to (features, patients, visits).
    return np.transpose(transactions_array, (2, 1, 0))

In [2]:
# Load the MS snapshots as a (features, patients, visits) array and report its shape.
ms_snapshots_path = "realData/MS_data/MS_snapshots_modified.csv"
transactions_3d_array = read_transactions_csv(ms_snapshots_path)
print(transactions_3d_array.shape)

(11, 510, 28)


##### 1. Triclustering with TriHSPAM

In [3]:
from TriHSPAM import TriHSPAM
# The two index lists split the 11 feature columns (0..10) between the
# symbolic and numeric arguments of TriHSPAM.
symbolic_feature_indices = [0, 1, 2, 5, 6, 7, 8]
numeric_feature_indices = [3, 4, 9, 10]

triclustering = TriHSPAM(
    symb_features_idx=symbolic_feature_indices,
    num_features_idx=numeric_feature_indices,
    min_I=50,
    min_J=2,
    min_K=2,
    disc_method="eq_width",
    n_bins=10,
    time_as_cols=True,
    time_relaxed=False,  # strict alignments option
    spm_algo='fournier08closed',
)

In [4]:
# Fit the configured TriHSPAM instance on the loaded array.
# Long-running: the recorded output of this cell reports ~2,075,941 ms.
# Kept as a bare trailing expression so the notebook still displays its value.
triclustering.fit(transactions_3d_array)

>/Users/diogosoares/Documents/PhD Work/TriHSPAM/src/spmf_vd.jar
 Total time ~ 2075941 ms
 Frequent sequences count : 8524



<TriHSPAM.TriHSPAM at 0x110a87310>

In [5]:
# Collect the triclusters from the fitted model; later cells index each entry
# as tric[0], tric[1], tric[2].
trics = triclustering.triclusters_()

In [6]:
# Display how many triclusters were returned.
len(trics)

7548

##### 2. Assessing Statistical Significance

In [7]:
from sigStats import significance, Tricluster
import pandas as pd
import copy

# Build one sigStats.Tricluster per mined tricluster and compute their p-values.
# p_values is later enumerated in parallel with trics (its position is used as
# the tricluster ID), so the order of lst_trics must follow the order of trics.

# Private copy of the data with the first two axes swapped relative to the
# array returned by read_transactions_csv.
data_f = copy.deepcopy(transactions_3d_array.transpose((1, 0, 2)))

# NOTE(review): presumably the categorical feature indices. Index 10 is passed
# to TriHSPAM as numeric (num_features_idx) but is listed here — confirm this
# is intended.
Y_cat = [0, 1, 2, 5, 6, 7, 8, 10]

lst_trics = [
    Tricluster(data_f, Y_cat, tric[0], tric[1], tric[2])
    for tric in trics
]

# NOTE(review): the meaning of the 0.1 argument is defined by
# sigStats.significance and is not visible here — confirm.
p_values = significance(data_f, Y_cat, lst_trics, 0.1)

In [8]:
# Keep only the triclusters whose p-value is below the 0.001 cutoff; the
# position in p_values is stored as the tricluster ID.
sign_stats = [
    {'TricID': t_i, 'p-value': p_val}
    for t_i, p_val in enumerate(p_values)
    if p_val < 0.001
]

# Naming the columns explicitly keeps 'TricID' and 'p-value' present even when
# nothing passes the cutoff, so a later sort_values('p-value') cannot fail with
# a KeyError on an empty, column-less frame.
df_stat = pd.DataFrame(sign_stats, columns=['TricID', 'p-value'])
df_stat.to_csv("signStats_ms.csv", index=False)

In [9]:
# Display the ten rows with the smallest p-values (ascending sort).
df_stat.sort_values('p-value').head(10)

Unnamed: 0,TricID,p-value
7459,7459,5.11767e-307
3359,3359,1.023534e-306
3389,3389,1.068035e-306
7458,7458,1.134788e-306
7455,7455,1.490799e-306
3388,3388,1.668805e-306
3387,3387,2.0025659999999997e-306
7457,7457,2.225074e-306
7454,7454,2.314077e-306
7453,7453,2.536584e-306


##### 3. Patterns Representation

In [10]:
from visualize_patterns import compute_representative_patterns

# For each of the ten lowest-p-value triclusters, print its size, feature names
# and third dimension, followed by its representative patterns.
df = pd.read_csv("realData/MS_data/MS_snapshots_modified.csv", sep=',')

# Feature indices grouped by the summary statistic requested for them.
data_modes = [0, 1, 2, 5, 6, 7, 8]
data_means = [3, 4, 9]
data_medians = [10]

# Feature names are every CSV column after the first one.
feature_names = df.columns[1:]
top_tric_ids = df_stat.sort_values('p-value').head(10)['TricID']

for t_id in top_tric_ids:
    tric_dims = trics[t_id]
    selected_feats = tric_dims[1]
    print(len(tric_dims[0]), list(feature_names[selected_feats]), tric_dims[2])
    patterns = compute_representative_patterns(
        triclustering.get_tricluster(t_id),
        mode_feats=[f for f in selected_feats if f in data_modes],
        mean_features=[f for f in selected_feats if f in data_means],
        median_features=[f for f in selected_feats if f in data_medians],
    )
    print(patterns)


488 ['age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1]
[[31.15, 2.0], [31.15, 2.0]]
465 ['ms_in_pediatric_age', 'age_at_onset'] [0, 1]
[['False', 31.97], ['False', 31.97]]
463 ['ms_in_pediatric_age', 'age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1]
[['False', 31.96, 2.0], ['False', 31.96, 1.75]]
460 ['age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1, 2]
[[31.25, 2.0], [31.25, 2.0], [31.25, 1.5]]
444 ['age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1, 2]
[[31.28, 2.0], [31.28, 2.0], [31.28, 1.5]]
436 ['ms_in_pediatric_age', 'age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1, 2]
[['False', 32.08, 2.0], ['False', 32.08, 2.0], ['False', 32.08, 1.5]]
421 ['ms_in_pediatric_age', 'age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1, 2]
[['False', 32.1, 2.0], ['False', 32.1, 2.0], ['False', 32.1, 1.5]]
411 ['age_at_onset', 'edss_as_evaluated_by_clinician'] [0, 1, 2, 3]
[[31.3, 2.0], [31.3, 2.0], [31.3, 1.5], [31.3, 2.0]]
407 ['age_at_onset', 'edss_as_evaluate

  vals[i] = round(np.nanmean(converted_column),2)
  return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,
