In [1]:
# Automatically reload edited modules (mode 2: all modules) so changes under
# ../src are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = "retina"

In [2]:
import sys

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

sys.path.append('./../src/')
from manuscript import sankey_side_by_side as sankey
from manuscript import clustering, datasets, inout, export

# Show up to 200 columns when a DataFrame is displayed.
pd.options.display.max_columns = 200

# Figure defaults; fonttype 42 embeds TrueType fonts in exported PDFs.
mpl.rcParams.update({
    "figure.figsize": (10, 8),
    "pdf.fonttype": 42,
    "font.family": "Arial",
})

# Widen the classic-notebook container to 90% of the browser window.
import IPython.display
IPython.display.display(
    IPython.display.HTML("<style>.container { width:90% !important; }</style>")
)

# Register every .ttf file found in the project's font resources with matplotlib
# (presumably so that the configured font family can be resolved -- confirm).
fonts = inout.get_resource_path('fonts')
for font_file in os.listdir(fonts):
    if not font_file.endswith(".ttf"):
        continue
    mpl.font_manager.fontManager.addfont(f"{fonts}/{font_file}")

In [4]:
# Settings read by the dump_* export helpers.
# `user` defines the top hierarchy of the output folder.
user = "general"
# `outfolder` is named after this notebook.
outfolder = "04b_clustering_pairwise_similarity_matrix"
# Set to False to run the notebook without exporting figures/tables.
save = True

In [5]:
def dump_figure(name):
    """Export a figure via `export.image` when the notebook-level `save` flag is set.

    Parameters
    ----------
    name : str
        File name (including extension) placed under `outfolder`; the helper
        receives `user` and the combined `<outfolder>/<name>` path.

    Returns
    -------
    None
        Nothing is returned; the call is skipped entirely if `save` is falsy.
    """
    if not save:
        return
    target = f'{outfolder}/{name}'
    export.image(user, target)

In [6]:
def dump_raster_figure(name):
    """Export a rasterized figure via `export.raster_image` at 300 dpi.

    Parameters
    ----------
    name : str
        File name (including extension) placed under `outfolder`; the helper
        receives `user` and the combined `<outfolder>/<name>` path.

    Returns
    -------
    None
        Nothing is returned; the call is skipped entirely if `save` is falsy.
    """
    if not save:
        return
    target = f'{outfolder}/{name}'
    export.raster_image(user, target, dpi=300)

In [7]:
def dump_table(df, name):
    """Export a table via `export.full_frame` when the notebook-level `save` flag is set.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; passed through unchanged.
    name : str
        File name placed under `outfolder`; the helper receives `user` and the
        combined `<outfolder>/<name>` path.

    Returns
    -------
    None
        Nothing is returned; the call is skipped entirely if `save` is falsy.

    Notes
    -----
    The helper is always called with ``index=True`` and ``date=True``.
    """
    if not save:
        return
    target = f'{outfolder}/{name}'
    export.full_frame(user, target, df, index=True, date=True)
        
        

# Get Data, as in reference

In [8]:
# Load the gzipped CSV from the project's material folder; its first column
# is used as the DataFrame index.
data = pd.read_csv(
    inout.get_material_path(
        'general/03_overwrite_PF_Cr/03data-external_220901_1010.csv.gz'
    ),
    index_col=0,
)

In [9]:
# Sanity check: (rows, columns) of the loaded table.
data.shape

(12495, 72)

In [10]:
# Move the index loaded from the CSV's first column back into a regular column,
# leaving `data` with a default 0..n-1 RangeIndex.
# Guarded so that re-running this cell is a no-op: an unconditional
# `reset_index()` would add a new `index` / `level_0` column on every re-run
# and eventually raise once those names are taken.
if not isinstance(data.index, pd.RangeIndex):
    data = data.reset_index()

List of columns for clustering

In [11]:
# Column selection for clustering, as defined by the project's `clustering` module;
# used to subset `data`.
data_columns = clustering.get_reference_data_columns()

Get data that we will run clustering on

In [12]:
# Independent copy of the clustering columns, so later transformations do not
# touch `data` itself.
data_mtx_orig = data[data_columns].copy()

## 0. Preparation of shared aspects

Let's group variables that are highly correlated with one another. We try several cutoffs for what counts as a high correlation, since different cutoffs produce different groupings.

In [13]:
# Groups of related features computed on the untransformed matrix.
# NOTE(review): the result is later indexed by a correlation cutoff (e.g. 0.7),
# so it is presumably a mapping cutoff -> groups; confirm in `clustering`.
cutoff_groups_on_orig = clustering.identify_related_features(data_mtx_orig)


In [14]:
# Replace each value by its percentile rank within its column.
data_mtx_as_pct = data_mtx_orig.rank(pct=True, axis=0)

# Distances are computed on the transposed matrix (features as rows), then
# turned into a Ward tree over the features.
data_dist_col = clustering.get_distances(data_mtx_as_pct.T, approach='nan_euclidean')
col_tree = clustering.get_tree(data_dist_col, approach='ward')

In [15]:
# Cutoff used to pick one set of related-feature groups from the candidates
# computed above. NOTE(review): 0.7 must be one of the keys produced by
# `identify_related_features`, otherwise this lookup fails.
threshold_for_relatedness = 0.7
cutoff_groups = cutoff_groups_on_orig[threshold_for_relatedness]

In [16]:
# Empty container keyed by approach; presumably filled further down -- confirm.
approaches = dict()

## 1. Similarity approach

In [17]:
# Percentile-rank every column of a copy of the original matrix.
data_mtx = data_mtx_orig.copy().rank(axis=0, pct=True)

# Reweight the related-feature groups on a separate copy, so `data_mtx`
# itself stays untouched by the helper.
data_mtx_for_similarity = clustering.reweight_related_features(
    data_mtx.copy(),
    approach='mean_rank',
    groups=cutoff_groups,
)

In [18]:
# Pearson correlation between every pair of rows: transposing first makes
# `corr` (which compares columns) compare the original rows.
corr_mtx = data_mtx_for_similarity.T.corr(method="pearson")

# Distances between rows of the correlation matrix, then a Ward tree on them.
data_dist = clustering.get_distances(corr_mtx, approach='euclidean')
tree = clustering.get_tree(df_dist=data_dist, approach='ward')

In [19]:
# Derive cluster assignments from the tree, labelled by the rows of `data`.
out, assignments = clustering.table_with_assignments(tree=tree, labels=data.index)

In [None]:
# Ward-clustered heatmap of the full pairwise correlation matrix on a fixed
# [-1, 1] colour scale, drawn without colourbar and without tick labels.
sns.clustermap(
    corr_mtx.astype(float),
    method='ward',
    cmap='coolwarm',
    vmin=-1, vmax=1,
    cbar=False,
    xticklabels=False, yticklabels=False,
)

# Export both a 300-dpi raster version and a vector PDF of the figure.
dump_raster_figure('pairwise.png')
dump_figure('pairwise.pdf')



In [None]:
# Same heatmap settings on only the first 10 x 10 entries, this time keeping
# the colourbar; exported as 'palette.pdf'.
sns.clustermap(
    corr_mtx.iloc[:10, :10].astype(float),
    method='ward',
    cmap='coolwarm',
    vmin=-1, vmax=1,
    xticklabels=False, yticklabels=False,
)
dump_figure('palette.pdf')