In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# import necessary packages

import os
import numpy as np
import pandas as pd
import cv2
import shutil
import ast

import seaborn as sns
from matplotlib import pyplot as plt

from collections import defaultdict
from scipy.signal import find_peaks
import scipy

from sklearn.feature_selection import f_classif
from extract_feature_utils import *
from stat_vis_utils  import *

In [None]:
# Root folder of the project. It defaults to the original author's location
# but can be overridden with the PAPER_MIA_ROOT environment variable, so the
# notebook can run on another machine without editing every path.
DIR_PROJECT = os.environ.get("PAPER_MIA_ROOT", "/Users/jinzhou/Desktop/Paper_MIA")

# Input data folders (named after the WSI files and the annotation geojsons).
DIR_WSI = os.path.join(DIR_PROJECT, "data", "wsi")
DIR_ANN = os.path.join(DIR_PROJECT, "data", "ann_geojsons")

# Results folder (holds thickness.csv) and its figures sub-folder, from which
# the per-artery PNG images are read further down.
DIR_SAVE_RESULTS = os.path.join(DIR_PROJECT, "results")
DIR_SAVE_FIGURE = os.path.join(DIR_SAVE_RESULTS, "figures")

## Process thickness.csv

In [None]:
# Thickness measurements table (thickness.csv) inside the results folder.
PATH_STAT_MANUAL = os.path.join(DIR_SAVE_RESULTS, "thickness.csv")

In [None]:
# read thickness analysis
def read_df_from_csv(path_csv):
    """Read the thickness CSV and decode its list-valued columns.

    The three ``Thickness_*_Abs`` columns are stored in the CSV as the text
    form of a Python list (for example ``"[0.2, 0.1, ...]"``), so each cell is
    parsed back into a real list with ``ast.literal_eval``.

    Parameters
    ----------
    path_csv : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table with the three thickness columns holding Python lists.
    """
    list_columns = (
        "Thickness_Media_Abs",
        "Thickness_Intima_Abs",
        "Thickness_Wall_Abs",
    )

    table = pd.read_csv(path_csv)
    for column in list_columns:
        # literal_eval only accepts Python literals, so it is safe on file content
        table[column] = table[column].map(ast.literal_eval)

    return table

def clean(df_thick):
    """Return a cleaned copy of the thickness table, one row per artery.

    Steps, in order:
      1. Reduce ``Artery_ID`` to the part before the first underscore, which
         drops the media/intima index suffix.
      2. Drop every row belonging to two excluded slides.
      3. Add ``WSI_Artery_ID`` = ``WSI_ID + '_' + Artery_ID``.
      4. For arteries with several rows, keep the one with the largest ``Area``.
      5. Drop one excluded artery.

    The input frame is left untouched; all work is done on a copy. The
    previous version rewrote ``Artery_ID`` on the caller's frame and assigned
    into a filtered slice, which triggers ``SettingWithCopyWarning``.

    Parameters
    ----------
    df_thick : pandas.DataFrame
        Must contain the columns ``WSI_ID``, ``Artery_ID`` and ``Area``.

    Returns
    -------
    pandas.DataFrame
        Cleaned table sorted by ``Area`` in descending order.
    """
    excluded_wsi_ids = [
        "12_26609_021_507 L03 TRI",
        "12_26609_020_018 L05 A1 TRI",
    ]
    excluded_wsi_artery_id = "11_26609_027_005 L03 TRI_A17"

    df_clean = df_thick.copy()
    df_clean["Artery_ID"] = df_clean["Artery_ID"].str.split("_").str[0]

    # Explicit .copy() so the column assignment below targets an independent frame
    df_clean = df_clean[~df_clean["WSI_ID"].isin(excluded_wsi_ids)].copy()

    # Set WSI_Artery_ID, ignoring media and intima index
    df_clean["WSI_Artery_ID"] = df_clean["WSI_ID"] + "_" + df_clean["Artery_ID"]

    # For arteries with multiple lumen/intima areas, keep the one with the
    # maximum area: after the descending sort, drop_duplicates keeps the first.
    df_clean = df_clean.sort_values("Area", ascending=False).drop_duplicates(["WSI_Artery_ID"])

    return df_clean[df_clean["WSI_Artery_ID"] != excluded_wsi_artery_id]

In [None]:
# Load the thickness table and clean it in a single step, then preview it.
df_thick = clean(read_df_from_csv(PATH_STAT_MANUAL))
df_thick.head()

## Process the label spreadsheet ("labels_updated.xlsx")

In [None]:
# Excel workbook with the labels; only its first sheet is parsed below.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable data directory.
PATH_LABEL = "/Users/jinzhou/Desktop/Paper_MIA/data/labels_updated.xlsx"

In [None]:
# Open the workbook in a context manager so the file handle is closed as soon
# as the sheet has been parsed. Previously the ExcelFile was never closed, and
# the name df_label first held the workbook and then the table.
with pd.ExcelFile(PATH_LABEL) as label_workbook:
    # Only read the first sheet of the excel file
    sheet_name = label_workbook.sheet_names[0]
    df_label = label_workbook.parse(sheet_name, skiprows=1)  # skip the first row

# Use Artery_ID as row index and WSI_ID as column name
df_label = df_label.rename(columns={'Unnamed: 0': 'Artery_ID'}).set_index("Artery_ID")

In [None]:
# Translate the textual grades into ordinal integer scores. Both "mild"
# grades share score 1, and the dash placeholders become missing values.
label_to_score = {
    "without arteriosclerosis": 0,
    "mild arteriosclerosis": 1,
    "mild hyalinosis": 1,
    "moderate arteriosclerosis": 2,
    "severe arteriosclerosis": 3,
    "-": np.nan,
    " - ": np.nan,
}
df_label = df_label.replace(label_to_score)


## Post-Processing

In [None]:
# Collect one feature record per artery and build the DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, and appending inside the
# loop re-allocated the whole frame on every iteration.
# The artery image that used to be decoded here was never used, so it is no
# longer loaded.
feature_records = []

for _, row in df_thick.iterrows():

    # get the measurements
    thick_media = np.array(row["Thickness_Media_Abs"])
    thick_intima = np.array(row["Thickness_Intima_Abs"])
    thick_wall = np.array(row["Thickness_Wall_Abs"])
    thick_media, thick_intima, thick_wall = post_process(thick_media, thick_intima, thick_wall)

    features_intima, features_media, features_ratio = extract_features(thick_media, thick_intima, thick_wall)

    # Merge the three feature dicts, then attach the identifier and the label
    row_features_label = {**features_intima, **features_media, **features_ratio}
    row_features_label["WSI_Artery_ID"] = row["WSI_Artery_ID"]
    # df_label is indexed by Artery_ID, with one column per WSI_ID
    row_features_label["Label"] = df_label.loc[row["Artery_ID"], row["WSI_ID"]]
    feature_records.append(row_features_label)

df_features_label = pd.DataFrame(feature_records)


In [None]:
# Preview the first rows of the per-artery feature/label table.
df_features_label.head()

## Violin Plots and Kendall Tau Analysis

In [None]:
def violin_plots(df_features_label, feature_names, n_clip=30):
    """Draw one violin plot per feature, grouped by arteriosclerosis score.

    For every feature the values are capped at the ``n_clip``-th largest value
    so that a few extreme values do not stretch the y-axis. The capped values
    are correlated with the labels using Kendall's tau, and the panel title
    reports the coefficient together with the computed p-value.

    Parameters
    ----------
    df_features_label : pandas.DataFrame
        Must contain a ``"Label"`` column and every column in ``feature_names``.
        It is not modified: the feature values are copied before capping.
    feature_names : sequence of str
        Columns to plot, one panel each. An empty sequence draws nothing.
    n_clip : int, optional
        Values above the ``n_clip``-th largest one are set to that value.
        Capping is skipped when there are fewer than ``n_clip`` rows or when
        ``n_clip`` is 0/None. Defaults to 30.

    Returns
    -------
    None
    """
    # Imported locally: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule has been loaded.
    from scipy import stats

    n_features = len(feature_names)
    if n_features == 0:
        return

    fig = plt.figure(figsize=(5 * n_features, 4))
    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when a single feature is requested.
    axs = fig.subplots(1, n_features, squeeze=False)[0]

    labels = df_features_label.loc[:, "Label"].values
    # Rows without a label cannot take part in the rank correlation
    has_label = ~pd.isna(labels)

    for i, feature_name in enumerate(feature_names):
        # Work on a copy: capping must never write back into the DataFrame
        features = df_features_label.loc[:, feature_name].to_numpy(dtype=float, copy=True)

        if n_clip and len(features) >= n_clip:
            cap = np.sort(features)[-n_clip]
            features[features > cap] = cap

        rho, p_val = stats.kendalltau(features[has_label], labels[has_label])
        # Report the actual p-value instead of a hard-coded "p<0.0001"
        p_text = " p<0.0001" if p_val < 0.0001 else " p={:.4f}".format(p_val)

        data = pd.DataFrame({'Score': labels, 'Feature Value': features})

        sns.violinplot(x='Score', y='Feature Value', data=data, ax=axs[i])
        axs[i].set_xlabel("Arteriosclerosis Score", fontsize=15)
        # Only the left-most panel carries the shared y-axis label
        if i == 0:
            axs[i].set_ylabel("Feature Value", fontsize=15)
        else:
            axs[i].set_ylabel(None)
        # The title is placed below the axes (negative y) as a caption
        axs[i].set_title(feature_name + "\n" + r"$\gamma_{\tau}$" + "={:.2f}".format(rho) + p_text,
                         y=-0.4, pad=-14, fontsize=18)
    plt.tight_layout()

In [None]:
# Every column except the identifier and the target is a feature. The feature
# names are then grouped by their prefix: "Media", "Intima" or "Ratio".
non_feature_columns = ("WSI_Artery_ID", "Label")
feature_names = [col for col in df_features_label.columns if col not in non_feature_columns]
media_features, intima_features, ratio_features = (
    [name for name in feature_names if name.startswith(prefix)]
    for prefix in ("Media", "Intima", "Ratio")
)

In [None]:
# One figure per feature family, showing the first two features of each.
for feature_group in (intima_features, media_features, ratio_features):
    violin_plots(df_features_label, feature_group[:2])

In [None]:
# Arteries whose thickness histogram and image are exported to the paper folder.
wsi_artery_ids_hist_comp = ["11_26609_027_006 L02 TRI_A07", "11_26609_009_008 L10 TRI_A01",
                            "11_26609_098_005_L4 TRI_A09", "11_26609_020_006 A15 TRI_A22"]

dir_to_save = "/Users/jinzhou/Desktop/Paper_MIA/paper/figs"
# Files are written into this folder below, so make sure it exists first
os.makedirs(dir_to_save, exist_ok=True)

for wsi_artery_id in wsi_artery_ids_hist_comp:
    row = df_thick.loc[df_thick.loc[:, "WSI_Artery_ID"]==wsi_artery_id, :]
    if row.empty:
        # Fail with the offending ID instead of an opaque IndexError on .values[0]
        raise KeyError("WSI_Artery_ID not found in df_thick: {}".format(wsi_artery_id))

    path_artery_img = os.path.join(DIR_SAVE_FIGURE, row["WSI_ID"].values[0], row["Artery_ID"].values[0]+".png")
    thick_media = row["Thickness_Media_Abs"].values[0]
    thick_intima = row["Thickness_Intima_Abs"].values[0]
    thick_wall = row["Thickness_Wall_Abs"].values[0]
    thick_media, thick_intima, thick_wall = post_process(thick_media, thick_intima, thick_wall)

    # `row` is a one-row frame, so the lookup yields a 1x1 frame; take its only cell
    label = df_label.loc[row["Artery_ID"], row["WSI_ID"]].values[0][0]

    # The score is encoded in the histogram file name
    path_to_save = os.path.join(dir_to_save, wsi_artery_id+"_hist_score_"+str(int(label))+".png")
    print(path_to_save, label)
    # presumably the helper saves the figure to path_to_save - confirm in stat_vis_utils
    plot_hist_w_two_list(thick_media, thick_intima, "Thickness", path_to_save)
    shutil.copy(path_artery_img, os.path.join(dir_to_save, wsi_artery_id+".png"))

In [None]:
# Show the image and the thickness histogram of every artery scored 2 or more.
# The rows are iterated directly. The previous version built a hand-written ID
# list that was immediately overwritten with all IDs, then re-scanned the
# whole frame for each ID.
for _, row in df_thick.iterrows():
    wsi_artery_id = row["WSI_Artery_ID"]

    # List indexers return a 1x1 frame; .iat[0, 0] takes its only cell
    label = df_label.loc[[row["Artery_ID"]], [row["WSI_ID"]]].iat[0, 0]
    # NOTE(review): a missing (NaN) label compares False here, so unlabeled
    # arteries are displayed as well - confirm this is intended.
    if label < 2:
        continue

    path_artery_img = os.path.join(DIR_SAVE_FIGURE, row["WSI_ID"], row["Artery_ID"]+".png")
    # OpenCV loads images as BGR; matplotlib expects RGB
    artery_img = cv2.cvtColor(cv2.imread(path_artery_img), cv2.COLOR_BGR2RGB)

    thick_media = row["Thickness_Media_Abs"]
    thick_intima = row["Thickness_Intima_Abs"]
    thick_wall = row["Thickness_Wall_Abs"]
    thick_media, thick_intima, thick_wall = post_process(thick_media, thick_intima, thick_wall)

    print(wsi_artery_id, label)
    plt.imshow(artery_img)
    plt.show()
    # No save path is passed (None) in this cell
    plot_hist_w_two_list(thick_media, thick_intima, "Thickness", None)

In [None]:
# Inspect the 40 arteries with the lowest "Intima Peak Height".
df_features_label.sort_values("Intima Peak Height").head(40)

In [None]:
# Inspect two hand-picked arteries: image, thickness histogram and features.
wsi_artery_ids_hist_comp = ["11_26609_009_008 L10 TRI_A23", "11_26609_098_005_L4 TRI_A07"]

dir_to_save = "/Users/jinzhou/Desktop/Paper_MIA/paper/figs"
for wsi_artery_id in wsi_artery_ids_hist_comp:
    row = df_thick.loc[df_thick.loc[:, "WSI_Artery_ID"]==wsi_artery_id, :]
    path_artery_img = os.path.join(DIR_SAVE_FIGURE, row["WSI_ID"].values[0], row["Artery_ID"].values[0]+".png")
    # OpenCV loads images as BGR; matplotlib expects RGB
    artery_img = cv2.cvtColor(cv2.imread(path_artery_img), cv2.COLOR_BGR2RGB)

    thick_media = row["Thickness_Media_Abs"].values[0]
    thick_intima = row["Thickness_Intima_Abs"].values[0]
    thick_wall = row["Thickness_Wall_Abs"].values[0]
    # post_process is applied exactly once, as in every other cell. The
    # previous copy-pasted second call processed the measurements twice.
    thick_media, thick_intima, thick_wall = post_process(thick_media, thick_intima, thick_wall)

    path_to_save = None  # display only
    plt.imshow(artery_img)
    plt.show()
    plot_hist_w_two_list(thick_media, thick_intima, "Thickness", path_to_save)

    # Print the extracted features of this artery
    print(df_features_label.loc[df_features_label.loc[:, "WSI_Artery_ID"]==wsi_artery_id, :])