In [1]:

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
import os, sys, random
import pickle as pkl
from os.path import exists, join

sys.path.append('../dependencies/')

# Figure dir
figDir = os.path.join(os.getcwd(), 'figures_output')
if not os.path.isdir(figDir):
    os.makedirs(figDir)

# Define the target directory
targDir = "C:\OneDrive\KwanLab\Lightsheet_cFos_Pipeline\\1.scaled_Output\\classif"

# Define a list of substrings to filter directories
tagList = ['data=count_norm-', 'PowerTrans_RobScal_fSel_BorFS_clf_LogReg(multinom)_CV100']

# Call the function and get the list of paths based on the tagList
score_dict_paths = []

# Walk through the directory and its subdirectories
for root, dirs, files in os.walk(targDir):
    # Check if 'scoreDict.pkl' is present in the files
    if 'scoreDict_Real.pkl' in files:
        if all(tag in root for tag in tagList):
            score_dict_paths.append(os.path.join(root, 'scoreDict_Real.pkl'))

# Each directory name will be used to generate a label, based on the sequence between the strings in the directory name below
startStr = 'count_norm-'
endStr = '\PowerTrans'
featureLists, countNames  = [], []

# Print the result
print(f"Found 'scoreDict.pkl' files in directories containing {tagList}:")
for path in score_dict_paths:

    # Load the scoreDict.pkl file and extract desired variables.
    with open(path, 'rb') as f:                 
        featureDict = pkl.load(f)
        featureLists.append(featureDict['featuresPerModel'])

    # Extract the label for the entry
    countNames.append(featureDict['compLabel'])

Found 'scoreDict.pkl' files in directories containing ['data=count_norm-', 'PowerTrans_RobScal_fSel_BorFS_clf_LogReg(multinom)_CV100']:


## Create Violin Plot for Feature counts

In [2]:
# Set Style of Font
# Set font to 12 pt Helvetica
plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.size'] = 8
plt.rcParams['svg.fonttype'] = 'none'

In [4]:
import seaborn as sns
import pandas as pd

colorsList = [[82, 211, 216], [56, 135, 190]]
colorsList = np.array(colorsList)/256

origNames = ['5MEO vs PSI', 'MDMA vs PSI', 'A-SSRI vs PSI', 'KET vs PSI', '6-F-DET vs PSI', '5MEO vs 6-F-DET', 'A-SSRI vs C-SSRI', 'MDMA vs PSI/5MEO', '6-F-DET vs PSI/5MEO']
scoreNames = countNames

# Your list of lists (sublists with numbers)
data = [[len(sublist) for sublist in inner_list] for inner_list in featureLists]

# Use numpy.argsort to obtain the indices that would sort the original list
sort_indices = [scoreNames.index(name) for name in origNames]

# Use the sorted indices to reorder the original list
names = np.array(scoreNames)[sort_indices]
data = np.array(data)[sort_indices]

# If swapping names is desired
# swapDict = dict()
# swapDict['MDMA vs PSI/5MEO'] = 'PSI/5MEO vs MDMA'
# swapDict['6-F-DET vs PSI/5MEO'] = 'PSI/5MEO vs 6-F-DET'

# for i in range(len(names)):
#     if names[i] in swapDict:
#         names[i] = swapDict[names[i]]

# Create a data frame with melted data
flat_data = [item for sublist in data for item in sublist]

df = pd.melt(pd.DataFrame(data, index=names).T, var_name='Category', value_name='Values')

# Create horizontally oriented violin plot
plt.figure(figsize=(5, 5))  # Adjust the width and height as needed

ax = sns.violinplot(x='Values', y='Category', bw_adjust=.5, data=df, orient='h', color=colorsList[0])  #, palette=colors)  # Remove inner bars and set color
# for violin in ax.collections:
#     violin.set_alpha(1)

# Set plot labels and title
plt.xlabel('Number of Regions in Classifier')
plt.ylabel('Classifier')

plt.savefig(f"{figDir}\\RegionCountPerSplit_violin.svg", format='svg', bbox_inches='tight')     

# Show the plot
plt.show()

ValueError: '6-F-DET vs PSI' is not in list

# Create Distance matricies to compare features across comparisons

In [None]:
# Some Functions
from collections import Counter

def listToCounterFilt(listArray, filterByFreq=0):

    counter_u = Counter(listArray)
    
    if filterByFreq > 0:
        return Counter({k: v for k, v in counter_u.items() if v >= filterByFreq})
    else:
        return counter_u

def overlapCounter(list1, list2, filterByFreq=0):

    counter_u = listToCounterFilt(list1, filterByFreq)
    counter_v = listToCounterFilt(list2, filterByFreq)

    list1 = list(counter_u.keys())
    list2 = list(counter_v.keys())

    only_list1 = list(set(list1) - set(list2))
    only_list2 = list(set(list2) - set(list1))

    intersection = list(set(list1) & set(list2))
    
    return only_list1, only_list2, intersection

def weighted_jaccard_similarity(u, v, filt):

    counter_u, counter_v = Counter(u), Counter(v)

    # If Filt is non-0, filter out features in each counter whose count is not above it.
    if filt:
        counter_u = Counter({k: v for k, v in counter_u.items() if v > filt})
        counter_v = Counter({k: v for k, v in counter_v.items() if v > filt})

    intersection = sum((counter_u & counter_v).values())
    union = sum((counter_u | counter_v).values())

    # Using the modified Jaccard similarity with frequency
    similarity = intersection / union if union != 0 else 0

    return similarity

In [None]:
filterByFreq = 75

modelCount = len(featureLists)
# regionDict = dict(Counter(featureLists[0]))
# labels, counts = list(regionDict.keys()), list(regionDict.values())

# Initialize a grid
grid = [[0 for _ in range(modelCount)] for _ in range(modelCount)]

# Flatten every list
featureListFlat = [[element for item in subList for element in item] for subList in featureLists]

jacSim = False 

# compare the mean distances across items of the list
for idx_a, listA in enumerate(featureListFlat):
    for idx_b, listB in enumerate(featureListFlat):
        
        if jacSim:
            # Jaccard Sim
            grid[idx_a][idx_b] = weighted_jaccard_similarity(listA, listB, 75)
        else:
            # Overlap count
            _, _, intersection = overlapCounter(listA, listB, filterByFreq)
            grid[idx_a][idx_b] = len(intersection)


In [None]:
from matplotlib import cm

# Set the title
if jacSim:
    titleStr = f'% Similarity Between Feature Lists at (>={filterByFreq})'
else:
    titleStr = f'Overlap Count Between Feature Lists at (>={filterByFreq})'

# Plot the grid
fig, ax = plt.subplots(figsize=(7,7))
im = sns.heatmap(grid, cmap='Blues', annot=True, fmt='.0f', ax=ax, yticklabels=scoreNames, xticklabels=scoreNames, annot_kws={'size': 15})

# Remove the colorbar
cbar = ax.collections[0].colorbar
cbar.remove()

# Set font size for x-axis ticks and labels
ax.tick_params(axis='x', labelsize=12)

# Set font size for y-axis ticks and labels
ax.tick_params(axis='y', labelsize=12, rotation=0)

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

# Cycle through sns heatmap annotations and remove those that are equal to 0
for text in ax.texts:
    if int(text.get_text()) == 0:
        text.set_text("")

# plt.title(titleStr, fontdict={'fontsize': 18})
plt.savefig(f"{figDir}\\MeanSimilarity_heatmap.svg", format='svg', bbox_inches='tight')     
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import textwrap

wrapper = textwrap.TextWrapper(width=12, break_on_hyphens=False)  # Adjust width as needed

# Sample data
if 0:
    for idx1, list1 in enumerate(featureListFlat):
        for idx2, list2 in enumerate(featureListFlat):

            # Skip the same list
            if idx1 == idx2:
                continue

            # Filter out features in each counter whose count is not above it.
            only_list1, only_list2, intersection = overlapCounter(list1, list2, filterByFreq)

            # Skip if there is no overlap
            if intersection == []:
                continue

            # Create a Venn diagram
            venn_diagram = venn2(subsets=(len(only_list1), len(only_list2), len(intersection)/2),
                                set_labels=(scoreNames[idx1], scoreNames[idx2]))


            venn_labels = {'100': only_list1, '010': only_list2, '110': intersection}
            for idx, (labId, labels) in enumerate(venn_labels.items()):
                wrapped_labels = wrapper.fill(text='  '.join(labels))
                venn_diagram.get_label_by_id(labId).set_text(wrapped_labels)
                venn_diagram.get_label_by_id(labId).set_fontsize(8)  # Adjust font size if needed

            # # Customize the size of the Venn diagram
            # plt.gcf().set_size_inches(8, 8)
            figName = f'VD_{scoreNames[idx1]} and {scoreNames[idx2]}'
            figName = figName.replace('/', '+')
            figName = figName.replace(' ', '_')
            plt.savefig(f"{figDir}\\{figName}.svg", format='svg', bbox_inches='tight')     

            # Display the plot
            plt.show()

# Create the single heatmap of features above the threshold

In [None]:
# Current Mode: Create plot with colorbar, then without, and grab the svg item and place it in the second plot to ensure even spacing
# Creates the heatmap for the data

plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.size'] = 9
plt.rcParams['svg.fonttype'] = 'none'

import matplotlib.ticker as tkr
import seaborn as sns
import pandas as pd

sys.path.append('../dependencies/')
sys.path.append('../functionScripts/')
sys.path.append('../testScripts/')

import helperFunctions as hf
import plotFunctions as pf

# Load the lightsheet_data.pkl from the testScripts folder into a pandas df
df_raw = pd.read_pickle('../testScripts/lightsheet_data.pkl')

# Create a sorted structure from the data for scaffolding the desired heatmap
brainAreaColorDict = hf.create_color_dict(dictType='brainArea', rgbSwitch=0)
brainAreaPlotDict = hf.create_brainArea_dict('short')
regionArea = hf.create_region_to_area_dict(df_raw, 'abbreviation')
regionArea['Region_Color'] = regionArea['Brain_Area'].map(brainAreaColorDict)

df_Tilted = df_raw.pivot(index='abbreviation', columns='dataset', values='count')
df_Tilted = df_Tilted.reindex(regionArea['abbreviation'].tolist(), axis=0)

# Set variables
dataFeature = 'abbreviation'
blockCount = 2
# filterByFreq = 75 # Set Above

# Process the data from above
featureListDicts = [listToCounterFilt(x, filterByFreq=0) for x in featureListFlat]
featureListArray = [list(x.keys()) for x in featureListDicts]

# Add in columns for each of the actual comparisons.
df_frame = df_Tilted.reindex(columns=scoreNames)

# Populate df_frame with 0s
for col in df_frame.columns:
    df_frame[col] = 0
    
for idx, (comp, featureList) in enumerate(zip(df_frame.columns, featureListDicts)):
    for regionName in featureList.keys():
        df_frame.loc[regionName, comp] = featureList[regionName]
    # df_frame[comp] = df_frame.index.isin(featureListArray[idx])
    # print(f"{comp}: {df_frame[comp].sum()}")
        
# Remove any rows which are not above threshold
df_plot = df_frame[df_frame.sum(axis=1) > 74]
    
# Remove the abbreviations from regionArea not represented in df_plot, Filter the regionArea for 'Cortex' and 'Thalamus'
regionArea = hf.create_region_to_area_dict(df_raw, dataFeature)
regionArea = regionArea[regionArea['abbreviation'].isin(df_plot.index)]
regionArea = regionArea[regionArea['Brain_Area'].isin(['Cortex', 'Thalamus'])]

# Sort the data to be combined per larger area
df_plot = df_plot.loc[regionArea[dataFeature]]
modelCount = len(df_plot.columns)

# Create indicies for dividing the data into the correct number of sections regardless of the size
row_idx_set = np.zeros((blockCount, 2), dtype=int)
indices = np.linspace(0, len(df_plot), num=blockCount+1, dtype=int)
for block_idx in range(blockCount):
    row_idx_set[block_idx][0] = indices[block_idx]
    row_idx_set[block_idx][1] = indices[block_idx+1]

# Hand change to make Cortex 1st block, Thal 2nd
row_idx_set[0,1] = 25
row_idx_set[1,0] = 25

# merge df_plot and regionArea, moving the Brain_Area_Idx and Brain_Area columns to df_plot
df_plot_combo = df_plot.merge(regionArea, left_index=True, right_on=dataFeature)

# Cycle through the df_plot_combo's distinct Brain_Area_Idx, and resort data by row sums
newIdx = []
for idx in regionArea.Brain_Area_Idx.unique():
    # Identify which regions have the same Brain_Area_Idx
    df_seg = df_plot_combo[df_plot_combo.Brain_Area_Idx == idx]
    sorted_seg_idx = df_seg.iloc[:, 0:modelCount].sum(axis=1).sort_values(ascending=False).index

    # Append to list
    newIdx = newIdx + list(sorted_seg_idx)

# Resort the data
df_plot = df_plot_combo.reindex(newIdx, axis=0)
df_plot = df_plot.set_index('abbreviation')
df_plot = df_plot.drop(columns=['Brain_Area_Idx', 'Brain_Area'])

# Resort the columns to match 'origNames'
df_plot = df_plot[origNames]

# Drop the columns which do not include the string 'PSI'
df_plot = df_plot.loc[:, df_plot.columns.str.contains('PSI')]

# Update the column names to have 'PSI' in front.
origcolNames = df_plot.columns
colNames = [x.split(' vs ') for x in df_plot.columns]
newColNames = [f'{x[1]} vs {x[0]}' for x in colNames]
newColNames[-1], newColNames[-2] = origcolNames[-1], origcolNames[-2]
newColNames = [x.replace('/', ' & ') for x in newColNames]
df_plot.columns = newColNames

# Plotting variables
formatter = tkr.ScalarFormatter(useMathText=True)
formatter.set_scientific(False)
formatter.set_powerlimits((-2, 2))

scalefactor = 12
figH = (scalefactor*2.5)/blockCount
figW = blockCount * 2.5

colorbar = [False, True]

for cbs in colorbar:

    fig, axes = plt.subplots(1, blockCount, figsize=(figW, figH))  # Adjust figsize as needed
    # figsize=(scalefactor*2.4, len(df_plot)/len(row_idx_set) * scalefactor * 0.0125)

    if blockCount == 1:
        axes = [axes]

    for idx, row_set in enumerate(row_idx_set):

        # Slice and modify previous structures to create segment
        df_plot_seg = df_plot.iloc[row_set[0]: row_set[1], :]
        regionArea_local = regionArea[regionArea[dataFeature].isin(df_plot_seg.index)]
        region_idx = regionArea_local.Brain_Area_Idx  # Extract for horizontal lines in plot later.

        matrix = df_plot_seg.values

        xticklabels = df_plot_seg.columns.values.tolist()
        yticklabels = df_plot_seg.index.values.tolist()

        heatmap = sns.heatmap(matrix, cmap='crest', ax=axes[idx] , fmt='.2f', cbar = False, square=True, yticklabels=yticklabels, xticklabels=xticklabels, cbar_kws={"format": formatter}, center=0)
        horzLineColor = 'black'

        # Rotate the xticklabels 45 degrees
        axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

        # Add a colorbar
        # Change the colorbar labels to be in non-scientific notation

        if cbs:
            cbar = heatmap.figure.colorbar(heatmap.collections[0], ax=axes[idx], location='right', use_gridspec=True, pad=0.05)
            cbar.set_label('Feature Count', rotation=270, labelpad=5)
            cbar.ax.yaxis.set_major_formatter(formatter)

        # Add in horizontal lines breaking up brain regions types.
        line_break_num, line_break_ind = np.unique(region_idx, return_index=True)
        for l_idx in line_break_ind[1:]:
            axes[idx].axhline(y=l_idx, color=horzLineColor, linewidth=1)
            
        # Set the yl abel on the first subplot.
        # if idx == 0:
        #     axes[idx].set_ylabel("Region Names", fontsize=20)

        # if idx == 2:
        #     cbar = heatmap.collections[0].colorbar
        #     cbar.set_label('Colorbar Label', rotation=270, labelpad=5)

    titleStr = f"FeatureCountHeatmap"  
    # fig.suptitle(titleStr, fontsize=20, y=1)
    # fig.text(0.5, -.02, "Samples Per Group", ha='center', fontsize=20)
    plt.tight_layout(h_pad = 0, w_pad = .5)

    plt.savefig(f"{titleStr}_cb_{cbs}.svg", dpi=300, format='svg', bbox_inches='tight')
    plt.show()