## TriHSPAM

### Experiments with the MS dataset
using the flexible structure option (`time_relaxed=True`)

##### 0. Preprocessing & Loading Functions

In [1]:
import csv
import numpy as np

def read_transactions_csv(file_path):
    """Load a long-format visits CSV into a 3D object array.

    The file must have a ``patient_id`` column; every other column is treated
    as a feature. Rows are grouped per patient in the order they appear in
    the file, so the k-th row of a patient becomes that patient's k-th visit.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file (a header row is required).

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(num_features, num_patients, max_visits)``.
        Values are the raw CSV strings; empty or missing cells, and the
        visits padded for patients with fewer than ``max_visits`` rows, are
        ``np.nan``.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    KeyError
        If a row has no ``patient_id`` column.
    """
    visits = {}
    # newline='' is what the csv module asks for so that line breaks embedded
    # in quoted fields are handed to the parser untouched.
    with open(file_path, 'r', newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            visits.setdefault(row['patient_id'], []).append(row)

    if not visits:
        # Without this guard the max() below fails with an opaque message.
        raise ValueError(f"No data rows found in {file_path!r}")

    max_visits = max(len(patient_visits) for patient_visits in visits.values())
    num_patients = len(visits)
    first_row = next(iter(visits.values()))[0]
    num_features = len(first_row) - 1  # Excluding patient_id

    # Pre-fill with NaN so patients with fewer visits are padded implicitly.
    transactions_array = np.full((max_visits, num_patients, num_features), np.nan, dtype=object)

    for i, patient_visits in enumerate(visits.values()):
        for j, transaction in enumerate(patient_visits):
            transactions_array[j, i, :] = [
                np.nan if val is None or val == "" else val
                for key, val in transaction.items()
                if key != 'patient_id'
            ]

    # (visits, patients, features) -> (features, patients, visits)
    return np.transpose(transactions_array, (2, 1, 0))

In [2]:
# Load the MS snapshots; the loader returns an object array laid out as
# (features, patients, visits)
ms_csv_path = "realData/MS_data/MS_snapshots_modified.csv"
transactions_3d_array = read_transactions_csv(ms_csv_path)
print(transactions_3d_array.shape)

(11, 510, 28)


##### 1. Triclustering with TriHSPAM (Flexible Structure)

In [3]:
from TriHSPAM import TriHSPAM

# Positions along the feature axis handled as symbolic vs. numeric.
# NOTE(review): index 10 is numeric here but is listed in Y_cat in the
# significance cell further down — confirm which treatment is intended.
symbolic_feature_idx = [0, 1, 2, 5, 6, 7, 8]
numeric_feature_idx = [3, 4, 9, 10]

trihspam_params = dict(
    symb_features_idx=symbolic_feature_idx,
    num_features_idx=numeric_feature_idx,
    # presumably the minimum tricluster size along each of the three axes —
    # confirm against the TriHSPAM documentation
    min_I=50,
    min_J=2,
    min_K=2,
    disc_method="eq_width",
    n_bins=10,
    time_as_cols=True,
    time_relaxed=True,  # flexible structure option
    spm_algo='fournier08closed',
)
triclustering = TriHSPAM(**trihspam_params)

In [4]:
# Run the mining step on the loaded array.
# NOTE: the run recorded below reports a total time of ~7,866,542 ms
# (over two hours), so avoid re-running this cell casually.
triclustering.fit(transactions_3d_array)

>/Users/diogosoares/Documents/PhD Work/TriHSPAM/src/spmf_vd.jar
 Total time ~ 7866542 ms
 Frequent sequences count : 8524



<TriHSPAM.TriHSPAM at 0x10c64c6a0>

In [5]:
# Collect the triclusters from the fitted model; later cells index each
# entry as tric[0], tric[1], tric[2].
trics = triclustering.triclusters_()

In [6]:
# Number of triclusters returned by the model
len(trics)

7548

##### 2. Assessing Statistical Significance

In [7]:
from sigStats import significance, Tricluster
import pandas as pd
import copy

# Work on an independent copy with the first two axes swapped, i.e.
# (patients, features, visits) instead of (features, patients, visits)
data_f = copy.deepcopy(transactions_3d_array.transpose((1,0,2)))

# presumably the feature indices treated as categorical by sigStats.
# NOTE(review): index 10 is passed as a numeric feature to TriHSPAM above —
# confirm which treatment is intended.
Y_cat = [0,1,2,5,6,7,8,10]

# Wrap every mined tricluster (its three components) in a sigStats Tricluster
lst_trics = [
    Tricluster(data_f, Y_cat, tric[0], tric[1], tric[2])
    for tric in trics
]

# One p-value per tricluster, in the same order as `trics`
# (the meaning of the trailing 0.1 is defined by sigStats.significance — TODO confirm)
p_values = significance(data_f, Y_cat, lst_trics, 0.1)

In [8]:
# Keep only the triclusters whose p-value falls below the cut-off.
# TricID is the position of the tricluster in `p_values`.
P_VALUE_THRESHOLD = 0.001

sign_stats = [
    {'TricID': t_i, 'p-value': p_val}
    for t_i, p_val in enumerate(p_values)
    if p_val < P_VALUE_THRESHOLD
]

# Explicit columns keep the frame (and the exported CSV header) well-formed
# even when no tricluster passes the filter; otherwise the later
# sort_values('p-value') would fail with a KeyError on a column-less frame.
df_stat = pd.DataFrame(sign_stats, columns=['TricID', 'p-value'])
df_stat.to_csv("signStats_ms_relaxed.csv", index=False)

In [9]:
# Show the ten retained triclusters with the smallest p-values
df_stat.sort_values('p-value').head(10)

Unnamed: 0,TricID,p-value
7446,7446,5.11767e-307
3457,3457,1.023534e-306
3755,3755,1.068035e-306
7445,7445,1.134788e-306
7444,7444,1.490799e-306
3754,3754,1.668805e-306
3753,3753,2.0025659999999997e-306
7420,7420,2.225074e-306
7443,7443,2.314077e-306
7442,7442,2.536584e-306


##### 3. Patterns Representation

In [10]:
from visualize_patterns import compute_representative_patterns

df = pd.read_csv("realData/MS_data/MS_snapshots_modified.csv", sep=',')
# Feature names, skipping the first CSV column
# (assumes that first column is patient_id — TODO confirm)
feature_names = df.columns[1:]

# Which summary statistic represents each feature index in a pattern
data_modes = [0,1,2,5,6,7,8]
data_means = [3,4,9]
data_medians = [10]

# Ten triclusters with the smallest p-values
top_tric_ids = df_stat.sort_values('p-value').head(10)['TricID']

for t_id in top_tric_ids:
    tric_dims = trics[t_id]
    tric_feats = tric_dims[1]

    # Size of the first component, names of the selected features, third component
    print(len(tric_dims[0]), list(feature_names[tric_feats]), tric_dims[2])

    tricluster_data = triclustering.get_tricluster(t_id)
    patterns = compute_representative_patterns(
        tricluster_data,
        mode_feats=[f for f in tric_feats if f in data_modes],
        mean_features=[f for f in tric_feats if f in data_means],
        median_features=[f for f in tric_feats if f in data_medians],
    )
    print(patterns)


488 ['age_at_onset', 'edss_as_evaluated_by_clinician'] {0: [0, 1], 1: [0, 1], 2: [0, 1], 3: [0, 1], 4: [0, 1], 5: [0, 1], 6: [0, 1], 7: [0, 1], 8: [0, 1], 9: [0, 1], 10: [0, 1], 11: [0, 1], 12: [0, 1], 13: [0, 1], 14: [0, 1], 15: [0, 1], 16: [0, 1], 17: [0, 1], 18: [0, 1], 19: [0, 1], 20: [0, 1], 21: [0, 1], 22: [0, 1], 23: [0, 1], 24: [0, 1], 25: [0, 1], 26: [0, 1], 28: [0, 1], 29: [0, 1], 30: [0, 1], 31: [0, 1], 32: [0, 1], 33: [0, 1], 34: [0, 1], 35: [0, 1], 36: [0, 1], 37: [0, 1], 38: [0, 1], 39: [0, 1], 40: [0, 1], 43: [0, 1], 44: [0, 1], 45: [0, 1], 46: [0, 1], 47: [0, 1], 48: [0, 1], 49: [0, 1], 51: [0, 1], 52: [0, 1], 53: [0, 1], 54: [0, 1], 55: [0, 1], 56: [0, 1], 57: [0, 1], 58: [0, 1], 59: [0, 1], 60: [0, 1], 61: [0, 1], 63: [0, 1], 64: [0, 1], 65: [0, 1], 66: [0, 1], 67: [0, 1], 68: [0, 1], 69: [0, 1], 70: [0, 1], 71: [0, 1], 72: [0, 1], 73: [0, 1], 74: [0, 1], 75: [0, 1], 76: [0, 1], 77: [0, 1], 78: [0, 1], 79: [0, 1], 80: [0, 1], 81: [0, 1], 82: [0, 1], 83: [0, 1], 84: [0

  vals[i] = round(np.nanmean(converted_column),2)
  return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,
