In [1]:
import os.path

import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances, check_pairwise_arrays
from allib.metrics.distance import _AVAIL_CAT_METRICS, get_dist_metric

In [5]:
from allib.datasets import load_uci, AVAIL_DATASETS
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
iris = load_uci("balance-scale")
iris.with_preprocess(steps=["sample_n", "continuous_to_categorical"],  params_list=[{"n": 1000}, {"encode": "ordinal"}], in_place=True)
data, label = iris._data, iris._label


KeyboardInterrupt: 

In [13]:
sns.set_theme(rc={"figure.dpi": 300})

def normalize(d):
    # normalize to 0 1
    d = (d - d.min()) / (d.max() - d.min())
    return d

def plot_hist(dsn: str, k: str, d: np.ndarray, bins: int = 100):
    plot_name = f"./img/hm2/{dsn}_{k}_ordinal_hist.png"
    if os.path.exists(plot_name):
        print(f"Plot {plot_name} already exists. Skip.")
        return 
    h = sns.histplot(data=d.flatten(), bins=bins)
    h.set(
        xlabel=f"{dsn} using {k}",
        ylabel="Count"
    )
    plt.savefig(plot_name)
    plt.close()

for dsn in AVAIL_DATASETS:
    ds = load_uci(dsn)
    ds.with_preprocess(steps=["sample_n", "continuous_to_categorical", "remove_constant_columns"],  params_list=[{"n": 1000}, {"encode": "ordinal"}, {}], in_place=True)
    data, label = ds._data, ds._label
    
    nks = []
    freq = []
    N = data.shape[0]
    for col in data.columns:
        nks.append(data[col].unique().shape[0])
        freq.append(dict(data[col].value_counts()))
    prob = [{k: v/N for k, v in f.items()} for f in freq]
    prob2 = [{k: (v * (v - 1))/(N * (N-1)) for k, v in f.items()} for f in freq]
    nks = np.array(nks)
    params = {"prob": prob, "prob2": prob2, "nks": nks, "N": N, "freq": freq}
    for k in _AVAIL_CAT_METRICS:
        plot_name = f"./img/hm2/{dsn}_{k}_ordinal_heatmap.png"
        if os.path.exists(plot_name):
            print(f"Plot {plot_name} already exists. Skip.")
            continue
        print(f"Plotting {plot_name} ...")
        d = pairwise_distances(data, metric=get_dist_metric(k, params))
        # normalize d to 0~1
        d = normalize(d)
        h = sns.heatmap(d)
        h.set(xlabel = f"{dsn} with {k}")
        plt.savefig(plot_name)
        plt.close()
        plot_hist(dsn, k, d)


Plotting ./img/hm2/iris_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/iris_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/adult_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/yeast_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/letter-recognition_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/image-segmentation_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/balance-scale_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/glass-identification_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_overlap_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_eskin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_iof_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_of_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_lin_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_lin1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_goodall1_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_goodall2_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_goodall3_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_goodall4_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_smirnov_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_gambaryan_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_burnaby_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


Plotting ./img/hm2/wine_anderberg_ordinal_heatmap.png ...


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):


In [39]:

from sklearn.preprocessing import KBinsDiscretizer
kbin = KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="uniform", subsample=None)


In [40]:
columns = data.columns
num_idx = []
for col in columns:
    if pd.api.types.is_numeric_dtype(data[col]):
        num_idx.append(col)
kdata = kbin.fit_transform(data[num_idx])
kdata = pd.DataFrame(kdata, columns=kbin.get_feature_names_out())

In [41]:
data = data.drop(columns=num_idx)
data = pd.concat([data, kdata], axis=1)
data

Unnamed: 0,sepal_length_0.0,sepal_length_1.0,sepal_length_2.0,sepal_width_0.0,sepal_width_1.0,sepal_width_2.0,petal_length_0.0,petal_length_1.0,petal_length_2.0,petal_width_0.0,petal_width_1.0,petal_width_2.0
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
184,,,,,,,,,,,,
187,,,,,,,,,,,,
34,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
37,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [42]:
data[data.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
# for col in data.columns:
#     data[col] = data[col].astype(int).astype("category")

Unnamed: 0,sepal_length_0.0,sepal_length_1.0,sepal_length_2.0,sepal_width_0.0,sepal_width_1.0,sepal_width_2.0,petal_length_0.0,petal_length_1.0,petal_length_2.0,petal_width_0.0,petal_width_1.0,petal_width_2.0
149,,,,,,,,,,,,
184,,,,,,,,,,,,
187,,,,,,,,,,,,


In [17]:
# drop data row with inf or nan
data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [20]:
data = data.astype("category")

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   sepal_length_0.0  149 non-null    category
 1   sepal_length_1.0  149 non-null    category
 2   sepal_length_2.0  149 non-null    category
 3   sepal_width_0.0   149 non-null    category
 4   sepal_width_1.0   149 non-null    category
 5   sepal_width_2.0   149 non-null    category
 6   petal_length_0.0  149 non-null    category
 7   petal_length_1.0  149 non-null    category
 8   petal_length_2.0  149 non-null    category
 9   petal_width_0.0   149 non-null    category
 10  petal_width_1.0   149 non-null    category
 11  petal_width_2.0   149 non-null    category
dtypes: category(12)
memory usage: 3.3 KB


In [5]:
a = [1,2,3]
a = pd.DataFrame(a, columns=["a"], index=[3,2,1])
a

Unnamed: 0,a
3,1
2,2
1,3


In [8]:
a.reset_index(drop=True)

Unnamed: 0,a
0,1
1,2
2,3
