<a href="https://colab.research.google.com/github/HenryZumaeta/MISCELANEAS/blob/Zeta/PYTHON/Asociacion_Numerico_Categorico_Correlacion_Incertidumbre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Replicando el correlograma de la librería sweetviz

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
from scipy.stats import chi2_contingency, entropy

def calculate_correlation_matrix(df):
    """Return the pairwise Pearson correlation matrix of the columns of ``df``.

    The frame is handed to ``DataFrame.corr`` unchanged, so the caller is
    expected to pass only the columns that should be correlated.
    """
    return df.corr(method='pearson')

def calculate_uncertainty_coefficient(df, cat_cols):
    """Build the matrix of uncertainty coefficients (Theil's U) between categorical columns.

    Args:
        df: DataFrame holding the data.
        cat_cols: Labels of the categorical columns of ``df`` to compare.

    Returns:
        A square DataFrame indexed and labelled by ``cat_cols``. The entry at
        ``[row, col]`` is U(col | row): the fraction of the entropy of ``col``
        that is removed by knowing ``row``. Values lie in [0, 1], the matrix is
        generally asymmetric, and the diagonal is fixed at 1.
    """
    def uncertainty_coefficient(x, y):
        """Return U(y | x) = I(x; y) / H(y), computed from the contingency table."""
        # Rows of the table are the categories of x, columns the categories of y.
        joint = pd.crosstab(x, y).to_numpy(dtype=float)
        total = joint.sum()
        if total == 0:
            # No jointly observed rows (e.g. everything missing): undefined.
            return np.nan
        joint /= total

        entropy_y = entropy(joint.sum(axis=0))
        if entropy_y == 0:
            # y is constant, so there is no uncertainty left to explain.
            return 1.0
        entropy_x = entropy(joint.sum(axis=1))
        entropy_xy = entropy(joint.ravel())

        mutual_information = entropy_x + entropy_y - entropy_xy
        # Clip away floating-point noise just outside the theoretical [0, 1] range.
        return float(np.clip(mutual_information / entropy_y, 0.0, 1.0))

    labels = list(cat_cols)
    size = len(labels)
    uc_matrix = np.ones((size, size))
    for i, row_label in enumerate(labels):
        for j, col_label in enumerate(labels):
            if i != j:
                uc_matrix[i, j] = uncertainty_coefficient(df[row_label], df[col_label])
    return pd.DataFrame(uc_matrix, index=cat_cols, columns=cat_cols)

def plot_associations(df):
    """Draw a marker grid of pairwise associations between the columns of ``df``.

    Numeric/numeric pairs are drawn as circles sized and coloured by their
    Pearson correlation. Categorical/categorical pairs are drawn as squares
    sized and coloured by the value from ``calculate_uncertainty_coefficient``.
    Mixed pairs are left empty.

    Args:
        df: DataFrame with at least one numeric and one object/category column.

    Raises:
        ValueError: If ``df`` has no numeric columns or no categorical columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(include=[object, 'category']).columns

    if numeric_cols.empty:
        raise ValueError("No numeric columns found in the dataset.")
    if cat_cols.empty:
        raise ValueError("No categorical columns found in the dataset.")

    corr_matrix = calculate_correlation_matrix(df[numeric_cols])
    uc_matrix = calculate_uncertainty_coefficient(df, cat_cols)

    fig, ax = plt.subplots(figsize=(10, 8))
    norm = Normalize(vmin=-1, vmax=1)
    cmap = plt.cm.bwr

    n = len(df.columns)
    # Markers are centred on the integer coordinates 0..n-1, so pad the axes
    # by half a cell; otherwise the first row/column sits on the axes edge
    # and is clipped.
    ax.set_xlim(-0.5, n - 0.5)
    ax.set_ylim(-0.5, n - 0.5)

    numeric_names = set(numeric_cols)
    cat_names = set(cat_cols)
    found_values = False

    for i, col1 in enumerate(df.columns):
        for j, col2 in enumerate(df.columns):
            if col1 in numeric_names and col2 in numeric_names:
                value = corr_matrix.loc[col1, col2]
                shape = 'o'
            elif col1 in cat_names and col2 in cat_names:
                value = uc_matrix.loc[col1, col2]
                shape = 's'
            else:
                # Numeric/categorical pairs have no measure here.
                continue
            if np.isnan(value):
                continue
            # Rows are drawn top-to-bottom, hence the flipped y coordinate.
            ax.scatter(j, n - i - 1, s=abs(value) * 1000, c=[cmap(norm(value))], marker=shape)
            found_values = True

    if not found_values:
        print("No valid associations found to plot.")
        # Do not leave an empty figure open behind the early exit.
        plt.close(fig)
        return

    ax.set_xticks(range(n))
    ax.set_xticklabels(df.columns, rotation=90)
    ax.set_yticks(range(n))
    # Reversed so the labels match the flipped y coordinate used above.
    ax.set_yticklabels(df.columns[::-1])

    sm = ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label('Correlation')

    plt.show()

# Usage: simulated stand-in for the 'raleo_base_clus_clusteres.csv' data
data = {
    'data_Tcamp': list(range(1, 6)),
    'data_sumCOD': list(range(2, 7)),
    'data_sumHRSLB': list(range(3, 8)),
    'data_GRAD02': [
        'PRIMARIA COMPLETA',
        'SECUNDARIA COMPLETA',
        'PRIMARIA INCOMPLETA',
        'SECUNDARIA COMPLETA',
        'SECUNDARIA INCOMPLETA',
    ],
    'data_sum.variedad_total_AC': list(range(5, 10)),
    'data_sum.variedad_total_CC': list(range(6, 11)),
    'data_sum.variedad_total_CP': list(range(7, 12)),
    'data_sum.variedad_total_LS': list(range(8, 13)),
}

# Five rows: seven integer columns plus one text column (data_GRAD02).
df = pd.DataFrame(data)

plot_associations(df)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy.stats import chi2_contingency

def calculate_correlation(df):
    """Return the Pearson correlation matrix of the numeric columns of ``df``.

    Non-numeric columns are dropped before correlating, so they do not appear
    in the result.
    """
    return df.select_dtypes(include=[np.number]).corr(method='pearson')

def cramers_v(x, y):
    """Return Cramér's V between two categorical series.

    Computed as ``sqrt(chi2 / (n * (min(rows, cols) - 1)))`` from the
    contingency table of ``x`` and ``y``.

    Returns:
        The association strength, or ``np.nan`` when the contingency table has
        at most one row or one column (a constant or empty variable), because
        the denominator would be zero or negative.
    """
    confusion_matrix = pd.crosstab(x, y)
    # Degenerate tables would divide by zero below, so they are reported as NaN.
    if confusion_matrix.shape[0] <= 1 or confusion_matrix.shape[1] <= 1:
        return np.nan
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / (n * (min(confusion_matrix.shape) - 1)))

def calculate_association(df):
    """Return a symmetric matrix of Cramér's V for pairs involving a non-numeric column.

    Every pair of columns (including a column with itself) in which at least
    one column is non-numeric gets ``cramers_v`` of the two columns. Pairs of
    two numeric columns are left as NaN.

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``.
    """
    cols = df.columns
    # Start from an all-NaN float frame so the result has a numeric dtype.
    association_matrix = pd.DataFrame(np.nan, index=cols, columns=cols, dtype=float)
    # Any non-numeric dtype (object, category, string, ...) counts as categorical.
    is_categorical = [not pd.api.types.is_numeric_dtype(df[col]) for col in cols]

    for i in range(len(cols)):
        for j in range(i, len(cols)):
            if is_categorical[i] or is_categorical[j]:
                value = cramers_v(df[cols[i]], df[cols[j]])
                # The measure is symmetric: fill both triangles at once.
                association_matrix.iat[i, j] = value
                association_matrix.iat[j, i] = value

    return association_matrix

def plot_association_correlation(df, title, note):
    """Plot Pearson correlations and categorical associations on one grid.

    The grid has one row and one column per column of ``df``. The heatmap
    colours numeric/numeric cells by Pearson correlation; off-diagonal cells
    additionally get a circle scaled by the absolute correlation, or a blue
    outline whose width scales with the value from ``calculate_association``.

    Args:
        df: DataFrame to analyse.
        title: Title drawn above the plot.
        note: Explanatory text drawn in a box below the plot.
    """
    labels = list(df.columns)
    # Align both matrices to the same label order. The correlation matrix only
    # covers numeric columns, so indexing both by position without aligning
    # would pair unrelated cells and mislabel the ticks.
    corr = calculate_correlation(df).reindex(index=labels, columns=labels)
    assoc = calculate_association(df).reindex(index=labels, columns=labels).astype(float)

    plt.figure(figsize=(10, 8))
    plt.title(title)

    # Cells without a Pearson value (anything involving a non-numeric column) are NaN.
    sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, cbar_kws={'label': 'Correlación (Pearson)'}, square=True, linewidths=.5)

    ax = plt.gca()
    for i in range(len(labels)):
        for j in range(len(labels)):
            if i == j:
                continue
            strength = abs(assoc.iat[i, j])
            if not np.isnan(strength):
                ax.add_patch(mpatches.Rectangle((j, i), 1, 1, fill=False, edgecolor='blue', lw=strength * 5, linestyle='-', alpha=0.3))
            magnitude = abs(corr.iat[i, j])
            if not np.isnan(magnitude):
                ax.add_patch(plt.Circle((j + 0.5, i + 0.5), magnitude / 2, color='blue', alpha=0.3))

    plt.xticks(np.arange(len(df.columns)) + 0.5, df.columns, rotation=90)
    plt.yticks(np.arange(len(df.columns)) + 0.5, df.columns, rotation=0)
    plt.gca().set_xticks(np.arange(len(df.columns)) + 0.5, minor=True)
    plt.gca().set_yticks(np.arange(len(df.columns)) + 0.5, minor=True)
    # NOTE(review): line properties are passed together with False; matplotlib
    # may enable the grid in that case — confirm which behaviour is intended.
    plt.gca().grid(False, which='minor', color='black', linestyle='-', linewidth=2)

    plt.figtext(0.5, -0.1, note, wrap=True, horizontalalignment='center', fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    plt.show()


# Title and note
title = "Asociaciones"
# Adjacent string literals are joined with no separator, so each fragment
# carries its own closing punctuation and trailing space.
# NOTE(review): the text describes the uncertainty coefficient, while
# calculate_association computes Cramér's V — confirm the wording.
note = ("Solo incluyendo el conjunto de datos analizado. "
        "■ Los cuadrados son asociaciones categóricas (coeficiente de incertidumbre y razón de correlación) de 0 a 1. "
        "El coeficiente de incertidumbre es asimétrico, (es decir, los valores de la ETIQUETA DE FILA indican cuánto INFORMAN a cada ETIQUETA en la PARTE SUPERIOR). "
        "• Los círculos son las correlaciones numéricas simétricas (Pearson) de -1 a 1. La diagonal trivial se deja intencionalmente en blanco para mayor claridad.")


# Plot
plot_association_correlation(df, title, note)


In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import matplotlib.patches as patches
from textwrap import wrap
from scipy.stats import chi2_contingency

# Global definitions and settings
# Sentinel magnitudes that corrplot's value_to_color / value_to_size compare
# cell values against: a cell equal to either one gets the last palette colour
# and size 0, so it is not drawn.
# NOTE(review): nothing visible in this file writes these values into a
# matrix; presumably they mark "failed" and "identical column" cells — confirm.
CORRELATION_ERROR = 83572398457329.0
CORRELATION_IDENTICAL = 1357239845732.0

def wrap_custom(source_text, separator_chars, width=70, keep_separators=True):
    """Insert line breaks into ``source_text`` so no line exceeds ``width`` characters.

    When a line fills up, it is broken after the most recent separator seen on
    that line; if the line contains no separator it is broken right at
    ``width``. A line that ends exactly at ``width`` still gets its trailing
    newline.

    Args:
        source_text: Text to wrap.
        separator_chars: Characters after which a break is preferred.
        width: Maximum number of characters per line.
        keep_separators: If False, the separator at a break point is dropped.

    Returns:
        The wrapped text, with "\n" at every break.
    """
    pieces = []          # one entry per emitted character
    line_length = 0
    last_separator = -1
    line_start = 0
    position = 0
    text_length = len(source_text)

    while position < text_length:
        char = source_text[position]
        if char in separator_chars:
            last_separator = position
        pieces.append(char)
        line_length += 1

        if line_length != width:
            position += 1
            continue

        # The current line is full: decide where to break it.
        if last_separator >= line_start:
            # Drop what was emitted after the separator (and the separator
            # itself when it must not be kept); scanning resumes right after it.
            overshoot = position - last_separator
            if not keep_separators:
                overshoot += 1
            if overshoot:
                del pieces[-overshoot:]
            pieces.append("\n")
            line_start = last_separator + 1
            position = line_start
        else:
            # No separator on this line: hard break at the width limit.
            pieces.append("\n")
            line_start = position + 1
            last_separator = position
            position = line_start
        line_length = 0

    return "".join(pieces)

def cramers_v(x, y):
    """Return Cramér's V for two categorical series, or NaN for degenerate tables.

    A contingency table with at most one row or one column carries no
    association information, so NaN is returned instead of dividing by zero.
    """
    table = pd.crosstab(x, y)
    n_rows, n_cols = table.shape
    if n_rows <= 1 or n_cols <= 1:
        return np.nan

    chi2_statistic = chi2_contingency(table)[0]
    total = table.sum().sum()
    smallest_dimension = min(n_rows, n_cols)
    return np.sqrt(chi2_statistic / (total * (smallest_dimension - 1)))

def make_zero_square_dataframe(features):
    """Return a float DataFrame of zeros with one column and one row per feature.

    Columns are labelled by ``features`` (a repeated label yields a single
    column); rows carry a default 0..n-1 index with n = len(features).
    """
    # dict.fromkeys keeps the first occurrence of each label, in order.
    column_labels = list(dict.fromkeys(features))
    zeros = np.zeros((len(features), len(column_labels)), dtype=float)
    return pd.DataFrame(zeros, columns=column_labels)

def calculate_association(df):
    """Return a symmetric matrix of Cramér's V for pairs involving a non-numeric column.

    A column counts as categorical when ``pd.api.types.is_numeric_dtype`` is
    False for it, which is the same rule ``create_dataframe_report`` uses for
    its "CAT" type. Pairs of two numeric columns are left as NaN.

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``.
    """
    cols = df.columns
    # An all-NaN float frame keeps the result numeric rather than object dtype.
    association_matrix = pd.DataFrame(np.nan, index=cols, columns=cols, dtype=float)
    categorical_flags = [not pd.api.types.is_numeric_dtype(df[col]) for col in cols]

    for i in range(len(cols)):
        for j in range(i, len(cols)):
            if not (categorical_flags[i] or categorical_flags[j]):
                continue
            value = cramers_v(df[cols[i]], df[cols[j]])
            # The measure is symmetric, so mirror it across the diagonal.
            association_matrix.iat[i, j] = value
            association_matrix.iat[j, i] = value

    return association_matrix

def calculate_correlation(df):
    """Compute Pearson correlations between the numeric columns of ``df``.

    Columns of any other dtype are excluded from the returned matrix.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    return numeric_only.corr(method='pearson')

def corrplot(correlation_dataframe, dataframe_report, size_scale=100, marker='s'):
    """Draw a square association matrix as sized, coloured markers.

    Each cell of ``correlation_dataframe`` becomes one marker whose area grows
    with the absolute value and whose colour encodes the signed value on a
    fixed [-1, 1] scale. A cell is drawn as a circle when both of its columns
    are "NUM" in ``dataframe_report`` and as a rectangle otherwise. Cells that
    are NaN, zero, or equal to one of the sentinel constants are not drawn.

    Args:
        correlation_dataframe: Square DataFrame whose index and columns hold
            the same labels.
        dataframe_report: Mapping ``label -> {"type": "NUM" | other}``.
        size_scale: Forwarded to the inner ``heatmap`` helper.
        marker: Forwarded to the inner ``heatmap`` helper.

    Returns:
        The matplotlib figure that was created.
    """
    # Long format, one row per matrix cell: 'x' is the index label, 'y' the
    # column label.
    corr = pd.melt(correlation_dataframe.reset_index(), id_vars='index')
    corr.columns = ['x', 'y', 'value']

    def heatmap(y, x, figure_size, **kwargs):
        """Render the melted cells on a new figure and return that figure."""
        color = kwargs.get('color', [1]*len(x))
        # 256-step palette: red at the low end, fading to light grey in the
        # middle, then to blue at the high end.
        palette = [(0.85, (0.85/128)*i, (0.85/128)*i) for i in range(0,128)] + [(0.85 - 0.85*(i-128.0)/128.0, 0.85 - 0.85*(i-128.0)/128.0, 0.85) for i in range(128,256)]
        color_min, color_max = kwargs.get('color_range', (min(color), max(color)))

        def value_to_color(val):
            """Map a cell value to a palette entry."""
            if color_min == color_max:
                return palette[-1]
            if val == CORRELATION_IDENTICAL or val == CORRELATION_ERROR:
                return palette[-1]
            val_position = float((val - color_min)) / (color_max - color_min)
            val_position = min(max(val_position, 0), 1)
            val_position = math.pow(val_position, 0.925)
            ind = int(val_position * 255)
            return palette[ind]

        size = kwargs.get('size', [1]*len(x))
        size_min, size_max = kwargs.get('size_range', (min(size), max(size)))
        size_scale = kwargs.get('size_scale', 500) / len(x)

        def value_to_size(val):
            """Map an absolute cell value to the fraction of the cell to fill (0.0 = skip)."""
            # A NaN cell has nothing to draw. Returning 0.0 makes the drawing
            # loop skip it before value_to_color would try int(nan).
            if pd.isna(val):
                return 0.0
            if val == 0 or val == abs(CORRELATION_IDENTICAL) or val == abs(CORRELATION_ERROR):
                return 0.0
            if size_min == size_max:
                return 1 * size_scale
            val_position = (val - size_min) * 0.999 / (size_max - size_min) + 0.001
            val_position = min(max(val_position, 0), 1)
            # Square root so the marker's area, not its side, tracks the value.
            val_position = math.pow(val_position, 0.5)
            return val_position

        def do_wrapping(label, length):
            return wrap_custom(label, ["_", "-"], length)

        wrap_x = 12
        wrap_y = 13
        x_names = [t for t in kwargs.get('x_order', sorted(set([v for v in x])))]
        x_names = [do_wrapping(label, wrap_x) for label in x_names]
        x_to_num = {p[1]:p[0] for p in enumerate(x_names)}

        y_names = [t for t in kwargs.get('y_order', sorted(set([v for v in y])))]
        y_names = [do_wrapping(label, wrap_y) for label in y_names]
        y_to_num = {p[1]:p[0] for p in enumerate(y_names)}

        figure, axs = plt.subplots(1, 1, figsize=figure_size)

        marker = kwargs.get('marker', 's')

        kwargs_pass_on = {k:v for k,v in kwargs.items() if k not in ['color', 'palette', 'color_range', 'size', 'size_range', 'size_scale', 'marker', 'x_order', 'y_order']}

        axs.tick_params(labelbottom='on', labeltop='on')
        axs.set_xticks([v for k,v in x_to_num.items()])
        axs.set_xticklabels([k for k in x_to_num], rotation=90, horizontalalignment='center', linespacing=0.8)
        axs.set_yticks([v for k,v in y_to_num.items()])
        axs.set_yticklabels([k for k in y_to_num], linespacing=0.85)

        # Major ticks sit on cell centres; minor ticks half a cell further mark
        # the cell borders, and only those carry grid lines.
        axs.grid(False, 'major')
        axs.grid(True, 'minor')
        axs.set_xticks([t + 0.5 for t in axs.get_xticks()], minor=True)
        axs.set_yticks([t + 0.5 for t in axs.get_yticks()], minor=True)

        axs.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
        axs.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])
        axs.set_facecolor('#F1F1F1')

        delta_in_pix = axs.transData.transform((1, 1)) - axs.transData.transform((0, 0))

        index = 0
        for cur_x, cur_y in zip(x,y):
            # Tick positions are keyed by the wrapped labels, so wrap before the lookup.
            wrapped_x_name = do_wrapping(cur_x, wrap_x)
            wrapped_y_name = do_wrapping(cur_y, wrap_y)
            # Cell corners in display (pixel) coordinates, rounded to whole pixels.
            before_coordinate = np.array(axs.transData.transform((x_to_num[wrapped_x_name]-0.5, y_to_num[wrapped_y_name] -0.5)))
            after_coordinate = np.array(axs.transData.transform((x_to_num[wrapped_x_name]+0.5, y_to_num[wrapped_y_name] +0.5)))
            before_pixels = np.round(before_coordinate, 0)
            after_pixels = np.round(after_coordinate, 0)
            desired_fraction = value_to_size(size[index])
            if desired_fraction == 0.0:
                index += 1
                continue
            use_rectangle = dataframe_report[cur_x]["type"] != "NUM" or dataframe_report[cur_y]["type"] != "NUM"
            delta_in_pix = after_pixels - before_pixels
            gap = np.round((1.0 - desired_fraction) * delta_in_pix / 2, 0)
            # The horizontal gap is applied to both axes.
            # NOTE(review): presumably to keep markers square in pixel space — confirm.
            start = before_pixels + gap[0]
            ending = after_pixels - gap[0]
            start[0] += 1
            ending[1] -= 1
            # Back to data coordinates for the patch geometry.
            start_doc = axs.transData.inverted().transform(start)
            ending_doc = axs.transData.inverted().transform(ending)
            cur_size = ending_doc - start_doc
            if use_rectangle:
                cur_rect = patches.Rectangle((start_doc[0], start_doc[1]), cur_size[0], cur_size[1], facecolor=value_to_color(color[index]), antialiased=True)
            else:
                cur_rect = patches.Circle((start_doc[0] + cur_size[0] / 2, start_doc[1] + cur_size[1] / 2), cur_size[0] / 2, facecolor=value_to_color(color[index]), antialiased=True)
            cur_rect.set_antialiased(True)
            axs.add_patch(cur_rect)
            index += 1

        if color_min < color_max:
            # Colour legend: a narrow strip in the last of 15 grid columns,
            # built from one thin horizontal bar per palette entry.
            ax = plt.subplot2grid((1, 15), (0, 14))
            col_x = [0] * len(palette)
            bar_y = np.linspace(color_min, color_max, len(palette))
            ax.set_ylim(-1, 1)
            bar_height = bar_y[1] - bar_y[0]
            ax.barh(
                y=bar_y,
                width=[5] * len(palette),
                left=col_x,
                height=bar_height,
                color=palette,
                linewidth=0)
            ax.set_xlim(1, 2)
            ax.grid(False)
            ax.set_facecolor('white')
            ax.set_xticks([])
            ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3))
            ax.yaxis.tick_right()
        return figure

    return heatmap(
        corr['y'], corr['x'],
        figure_size=(20, 15),
        color=corr['value'], color_range=[-1, 1],
        size=corr['value'].abs(), size_range=[0, 1],
        marker=marker,
        x_order=correlation_dataframe.columns,
        y_order=correlation_dataframe.columns[::-1],
        size_scale=size_scale,
        dataframe_report=dataframe_report
    )

def create_dataframe_report(df):
    """Classify every column of ``df`` as numeric or categorical.

    Returns:
        A dict mapping each column label to ``{"type": "NUM"}`` when the column
        has a numeric dtype and ``{"type": "CAT"}`` otherwise.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    return {
        col: {"type": 'NUM' if is_numeric(df[col]) else 'CAT'}
        for col in df.columns
    }

def plot_association_correlation(df, title, note):
    """Plot Pearson correlations and categorical associations with ``corrplot``.

    Numeric/numeric cells carry the Pearson correlation; every cell involving
    a non-numeric column carries the value from ``calculate_association``.

    Args:
        df: DataFrame to analyse.
        title: Title placed at the top of the figure.
        note: Explanatory text drawn in a box below the plot.
    """
    # Per-column NUM/CAT classification used by corrplot to pick the marker shape.
    dataframe_report = create_dataframe_report(df)

    # Compute correlations and associations.
    corr = calculate_correlation(df)
    assoc = calculate_association(df)

    # Combined matrix: numeric columns first, then the remaining columns.
    ordered = list(corr.columns) + [col for col in assoc.columns if col not in corr.columns]
    combined = corr.reindex(index=ordered, columns=ordered)
    assoc_values = assoc.reindex(index=ordered, columns=ordered).astype(float)
    # Take the association only where one exists. Copying every cell would
    # overwrite the Pearson values with the NaN that calculate_association
    # leaves for numeric/numeric pairs.
    combined = combined.where(assoc_values.isna(), assoc_values)

    # corrplot creates its own figure, so the title and note are attached to
    # the figure it returns rather than to a separately created one.
    figure = corrplot(combined, dataframe_report, size_scale=100, marker='s')
    figure.suptitle(title)

    # Add the note.
    figure.text(0.5, -0.1, note, wrap=True, horizontalalignment='center', fontsize=12, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    plt.show()

# Note and title
title = "Asociaciones"
# Adjacent string literals are joined with no separator, so each fragment
# carries its own closing punctuation and trailing space.
# NOTE(review): the text describes the uncertainty coefficient, while
# calculate_association computes Cramér's V — confirm the wording.
note = ("Solo incluyendo el conjunto de datos analizado. "
        "■ Los cuadrados son asociaciones categóricas (coeficiente de incertidumbre y razón de correlación) de 0 a 1. "
        "El coeficiente de incertidumbre es asimétrico, (es decir, los valores de la ETIQUETA DE FILA indican cuánto INFORMAN a cada ETIQUETA en la PARTE SUPERIOR). "
        "• Los círculos son las correlaciones numéricas simétricas (Pearson) de -1 a 1. La diagonal trivial se deja intencionalmente en blanco para mayor claridad.")

# Plot
plot_association_correlation(df, title, note)