<a href="https://colab.research.google.com/github/HenryZumaeta/MISCELANEAS/blob/Zeta/PYTHON/Asociacion_Numerico_Categorico_Correlacion_Incertidumbre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Replicando el correlograma de la librería sweetviz

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
from scipy.stats import chi2_contingency, entropy

def calculate_correlation_matrix(df):
    """Return the pairwise Pearson correlation matrix of the columns of ``df``.

    Intended for numeric columns; the result is a square DataFrame indexed
    and labelled by the column names.
    """
    return df.corr(method='pearson')

def calculate_uncertainty_coefficient(df, cat_cols):
    """Build the matrix of uncertainty coefficients (Theil's U) for categorical columns.

    Cell ``[row, col]`` holds ``U(row | col)``: the fraction of the entropy of
    the *row* variable that is removed by knowing the *column* variable.  The
    measure lies in ``[0, 1]`` and is asymmetric, so the matrix is generally
    not symmetric.  The diagonal is fixed at 1.

    Parameters:
        df: DataFrame containing the columns listed in ``cat_cols``.
        cat_cols: sequence (list or pandas Index) of categorical column names.

    Returns:
        Square DataFrame indexed and labelled by ``cat_cols``.
    """
    def uncertainty_coefficient(x, y):
        # Joint distribution of (x, y) estimated from the contingency table.
        joint_counts = pd.crosstab(x, y).to_numpy(dtype=float)
        total = joint_counts.sum()
        if total == 0:
            # No jointly observed rows: the association is undefined.
            return np.nan
        p_xy = joint_counts / total
        p_x = p_xy.sum(axis=1)
        p_y = p_xy.sum(axis=0)

        entropy_x = entropy(p_x)
        if entropy_x == 0:
            # x is constant, so there is no uncertainty left to explain.
            return 1.0

        # Conditional entropy H(x | y) = -sum p(x, y) * log(p(x, y) / p(y)),
        # skipping empty cells (their contribution is 0 by convention).
        observed = p_xy > 0
        p_y_grid = np.broadcast_to(p_y, p_xy.shape)
        conditional_entropy = -np.sum(
            p_xy[observed] * np.log(p_xy[observed] / p_y_grid[observed])
        )
        value = (entropy_x - conditional_entropy) / entropy_x
        # Clamp tiny floating-point excursions outside the valid range.
        return float(min(max(value, 0.0), 1.0))

    names = list(cat_cols)
    n = len(names)
    uc_matrix = np.zeros((n, n))
    for i, row_name in enumerate(names):
        for j, col_name in enumerate(names):
            if i == j:
                uc_matrix[i, j] = 1
            else:
                uc_matrix[i, j] = uncertainty_coefficient(df[row_name], df[col_name])
    return pd.DataFrame(uc_matrix, index=cat_cols, columns=cat_cols)

def plot_associations(df):
    """Draw a bubble grid of pairwise associations between the columns of ``df``.

    Numeric/numeric pairs are drawn as circles sized by the absolute Pearson
    correlation; categorical/categorical pairs are drawn as squares sized by
    the value from ``calculate_uncertainty_coefficient``.  Mixed pairs are
    left empty.  Colour encodes the value on a fixed [-1, 1] scale.

    Raises:
        ValueError: if ``df`` has no numeric columns or no categorical columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(include=[object, 'category']).columns

    if numeric_cols.empty:
        raise ValueError("No numeric columns found in the dataset.")
    if cat_cols.empty:
        raise ValueError("No categorical columns found in the dataset.")

    pearson = calculate_correlation_matrix(df[numeric_cols])
    uncertainty = calculate_uncertainty_coefficient(df, cat_cols)

    fig, ax = plt.subplots(figsize=(10, 8))
    norm = Normalize(vmin=-1, vmax=1)
    cmap = plt.cm.bwr

    all_cols = df.columns
    n = len(all_cols)
    ax.set_xlim(0, n)
    ax.set_ylim(0, n)

    plotted_any = False
    for row, row_name in enumerate(all_cols):
        # Rows are laid out top-to-bottom, hence the flipped y coordinate.
        y_pos = n - row - 1
        row_is_numeric = row_name in numeric_cols
        row_is_categorical = row_name in cat_cols
        for col, col_name in enumerate(all_cols):
            if row_is_numeric and col_name in numeric_cols:
                value = pearson.loc[row_name, col_name]
                area = abs(value) * 1000
                marker = 'o'
            elif row_is_categorical and col_name in cat_cols:
                value = uncertainty.loc[row_name, col_name]
                area = value * 1000
                marker = 's'
            else:
                # Mixed numeric/categorical pair: nothing to draw.
                continue
            if np.isnan(value):
                continue
            ax.scatter(col, y_pos, s=area, c=[cmap(norm(value))], marker=marker)
            plotted_any = True

    if not plotted_any:
        print("No valid associations found to plot.")
        return

    tick_positions = range(n)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(all_cols, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(all_cols[::-1])

    # Stand-alone mappable so the colour bar reflects the fixed [-1, 1] scale.
    mappable = ScalarMappable(cmap=cmap, norm=norm)
    mappable.set_array([])
    fig.colorbar(mappable, ax=ax).set_label('Correlation')

    plt.show()

# Usage: simulated data standing in for 'raleo_base_clus_clusteres.csv'.
# Seven numeric columns plus one categorical column (data_GRAD02).
data = {
    'data_Tcamp': [1, 2, 3, 4, 5],
    'data_sumCOD': [2, 3, 4, 5, 6],
    'data_sumHRSLB': [3, 4, 5, 6, 7],
    'data_GRAD02': ['PRIMARIA COMPLETA', 'SECUNDARIA COMPLETA', 'PRIMARIA INCOMPLETA', 'SECUNDARIA COMPLETA', 'SECUNDARIA INCOMPLETA'],
    'data_sum.variedad_total_AC': [5, 6, 7, 8, 9],
    'data_sum.variedad_total_CC': [6, 7, 8, 9, 10],
    'data_sum.variedad_total_CP': [7, 8, 9, 10, 11],
    'data_sum.variedad_total_LS': [8, 9, 10, 11, 12]
}

# `df` stays at module level; the plotting calls in later cells reuse it.
df = pd.DataFrame(data)

plot_associations(df)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy.stats import chi2_contingency

def calculate_correlation(df):
    """Return the Pearson correlation matrix of the numeric columns of ``df``.

    Non-numeric columns are ignored, so the result only covers the numeric
    subset of ``df.columns``.
    """
    return df.select_dtypes(include=[np.number]).corr(method='pearson')

def cramers_v(x, y):
    """Return Cramér's V between two categorical series.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), computed from the
    contingency table of ``x`` and ``y``.

    Returns ``np.nan`` when the table has fewer than two rows or two columns
    (one variable is constant, or there is no data): the statistic is
    undefined there and would otherwise be a 0/0 division or an error inside
    ``chi2_contingency``.
    """
    confusion_matrix = pd.crosstab(x, y)
    n_rows, n_cols = confusion_matrix.shape
    if n_rows <= 1 or n_cols <= 1:
        return np.nan
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / (n * (min(n_rows, n_cols) - 1)))

def calculate_association(df):
    """Build a symmetric matrix of Cramér's V values for pairs involving an object column.

    For every pair of columns (including a column with itself) where at least
    one side has ``object`` dtype, the cell holds ``cramers_v`` of the pair;
    all other cells are NaN.  The returned DataFrame is indexed and labelled
    by ``df.columns``.
    """
    cols = df.columns
    association_matrix = pd.DataFrame(index=cols, columns=cols)
    series = [df[name] for name in cols]

    for i, left in enumerate(series):
        left_is_object = left.dtype == 'object'
        # Only the upper triangle is computed; each value is mirrored.
        for j in range(i, len(series)):
            right = series[j]
            if left_is_object or right.dtype == 'object':
                value = cramers_v(left, right)
            else:
                value = np.nan
            association_matrix.iat[i, j] = value
            association_matrix.iat[j, i] = value

    return association_matrix

def plot_association_correlation(df, title, note):
    """Plot Pearson correlations and categorical associations on one grid.

    The grid has one row and one column per column of ``df``.  Numeric pairs
    are coloured by Pearson correlation (heatmap) and overlaid with a circle
    whose radius grows with the absolute correlation; pairs involving an
    object column get a blue outline whose line width grows with the
    association value.  The diagonal gets no overlay.

    Parameters:
        df: input DataFrame.
        title: title shown above the plot.
        note: explanatory text drawn in a box below the plot.
    """
    columns = df.columns
    n_cols = len(columns)

    # The correlation matrix only covers numeric columns.  Re-index it onto the
    # full column list so that position (i, j) means the same pair in `corr`,
    # in `assoc` and in the tick labels; cells without a value become NaN and
    # are left blank by the heatmap.
    corr = calculate_correlation(df).reindex(index=columns, columns=columns)
    assoc = calculate_association(df)

    plt.figure(figsize=(10, 8))
    plt.title(title)

    sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, cbar_kws={'label': 'Correlación (Pearson)'}, square=True, linewidths=.5)
    ax = plt.gca()

    # Outline for categorical associations (skipping the diagonal).
    for i in range(n_cols):
        for j in range(n_cols):
            if i == j:
                continue
            strength = assoc.iloc[i, j]
            if not pd.isna(strength):
                ax.add_patch(mpatches.Rectangle((j, i), 1, 1, fill=False, edgecolor='blue', lw=abs(strength) * 5, linestyle='-', alpha=0.3))

    # Circle for numeric correlations (skipping the diagonal).
    for i in range(n_cols):
        for j in range(n_cols):
            if i == j:
                continue
            magnitude = corr.iloc[i, j]
            if not pd.isna(magnitude):
                ax.add_patch(plt.Circle((j + 0.5, i + 0.5), abs(magnitude) / 2, color='blue', alpha=0.3))

    tick_positions = np.arange(n_cols) + 0.5
    plt.xticks(tick_positions, columns, rotation=90)
    plt.yticks(tick_positions, columns, rotation=0)
    ax.set_xticks(tick_positions, minor=True)
    ax.set_yticks(tick_positions, minor=True)
    ax.grid(False, which='minor', color='black', linestyle='-', linewidth=2)

    plt.figtext(0.5, -0.1, note, wrap=True, horizontalalignment='center', fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    plt.show()


# Title and note
title = "Asociaciones"
# Adjacent literals are concatenated verbatim, so every piece must end with its
# own separator (the first one used to run straight into the "■" bullet).
note = ("Solo incluyendo el conjunto de datos analizado. "
        "■ Los cuadrados son asociaciones categóricas (coeficiente de incertidumbre y razón de correlación) de 0 a 1. "
        "El coeficiente de incertidumbre es asimétrico, (es decir, los valores de la ETIQUETA DE FILA indican cuánto INFORMAN a cada ETIQUETA en la PARTE SUPERIOR). "
        "• Los círculos son las correlaciones numéricas simétricas (Pearson) de -1 a 1. La diagonal trivial se deja intencionalmente en blanco para mayor claridad.")


# Plot
plot_association_correlation(df, title, note)


In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import matplotlib.patches as patches
from textwrap import wrap
from scipy.stats import chi2_contingency

# Global definitions and settings.
# Sentinel cell values: the size and colour mappers inside corrplot() compare
# each value against these two constants; a match is given size 0.0 (the cell
# is not drawn) and the last palette colour.
# NOTE(review): the names suggest "association could not be computed" and
# "identical variables (diagonal)" -- confirm; nothing in this file assigns them.
CORRELATION_ERROR = 83572398457329.0
CORRELATION_IDENTICAL = 1357239845732.0

def wrap_custom(source_text, separator_chars, width=70, keep_separators=True):
    """Insert line breaks into ``source_text`` every ``width`` characters at most.

    When a line fills up, the break is placed right after the most recent
    separator character seen on that line (the separator itself is dropped
    from the output when ``keep_separators`` is False); scanning then resumes
    just after that separator.  If the line holds no separator, the break is
    placed after the ``width``-th character.

    Parameters:
        source_text: text to wrap.
        separator_chars: container of characters at which a break is preferred.
        width: maximum number of characters per line.
        keep_separators: keep the separator at the end of the broken line.

    Returns:
        The wrapped text, with ``"\\n"`` as the line break.
    """
    emitted = []        # one entry per output character
    line_len = 0        # characters emitted on the current line
    last_sep = -1       # index of the latest separator seen
    line_start = 0      # index in source_text where the current line starts
    pos = 0
    total = len(source_text)

    while pos < total:
        ch = source_text[pos]
        if ch in separator_chars:
            last_sep = pos
        emitted.append(ch)
        line_len += 1

        if line_len != width:
            pos += 1
            continue

        if last_sep >= line_start:
            # Drop what was emitted after the separator (and the separator
            # too, if requested); those characters are re-read on the next line.
            overshoot = pos - last_sep
            if not keep_separators:
                overshoot += 1
            if overshoot:
                del emitted[-overshoot:]
            emitted.append("\n")
            line_start = last_sep + 1
            pos = line_start
        else:
            # No separator on this line: hard break at the current position.
            emitted.append("\n")
            line_start = pos + 1
            last_sep = line_start - 1
            pos += 1
        line_len = 0

    return "".join(emitted)

def cramers_v(x, y):
    """Return Cramér's V between two categorical series.

    Computed as sqrt(chi2 / (n * (min(rows, cols) - 1))) from the contingency
    table of ``x`` and ``y``.  Returns ``np.nan`` when the table has at most
    one row or one column, where the statistic is undefined.
    """
    table = pd.crosstab(x, y)
    n_rows, n_cols = table.shape
    if n_rows <= 1 or n_cols <= 1:
        return np.nan
    chi2 = chi2_contingency(table)[0]
    n = table.sum().sum()
    smaller_dim = min(n_rows, n_cols)
    return np.sqrt(chi2 / (n * (smaller_dim - 1)))

def make_zero_square_dataframe(features):
    """Return a DataFrame of zeros with one float column per feature.

    The frame has ``len(features)`` rows on a fresh 0..n-1 index and every
    cell is ``0.0``.
    """
    # Start from empty float columns so the dtype is fixed before rows exist.
    empty = pd.DataFrame({feature: pd.Series(dtype=float) for feature in features})
    row_labels = list(range(len(features)))
    grown = empty.reindex(row_labels).reset_index(drop=True)
    return grown.fillna(0.0)

def calculate_association(df):
    """Return a symmetric matrix of Cramér's V for pairs that involve an object column.

    Cells for pairs where neither column has ``object`` dtype are NaN.  The
    diagonal is included in the computation.  The result is indexed and
    labelled by ``df.columns``.
    """
    cols = df.columns
    n_cols = len(cols)
    association_matrix = pd.DataFrame(index=cols, columns=cols)

    for i in range(n_cols):
        first = df[cols[i]]
        # Upper triangle only; every value is mirrored below the diagonal.
        for j in range(i, n_cols):
            second = df[cols[j]]
            involves_object = first.dtype == 'object' or second.dtype == 'object'
            value = cramers_v(first, second) if involves_object else np.nan
            association_matrix.iat[i, j] = value
            association_matrix.iat[j, i] = value

    return association_matrix

def calculate_correlation(df):
    """Return the Pearson correlation matrix computed over the numeric columns only."""
    numeric_only = df.select_dtypes(include=[np.number])
    return numeric_only.corr(method='pearson')

def corrplot(correlation_dataframe, dataframe_report, size_scale=100, marker='s'):
    """Draw a square association matrix as a grid of sized, coloured shapes.

    Parameters:
        correlation_dataframe: square DataFrame of association values; its
            column labels are used for both axes.
        dataframe_report: mapping ``column name -> {"type": ...}``; a cell is
            drawn as a rectangle unless both of its variables have type "NUM",
            in which case it is drawn as a circle.
        size_scale: forwarded to the inner ``heatmap`` helper.
        marker: forwarded to the inner ``heatmap`` helper (read there but not
            used for drawing).

    Returns:
        The matplotlib figure created by the inner ``heatmap`` helper.
    """
    # Long format: one row per cell, with columns x (row label), y (column
    # label) and value.
    corr = pd.melt(correlation_dataframe.reset_index(), id_vars='index')
    corr.columns = ['x', 'y', 'value']

    def heatmap(y, x, figure_size, **kwargs):
        """Create the figure, draw one patch per (x, y) pair and add a colour bar."""
        color = kwargs.get('color', [1]*len(x))
        # 256-entry palette: entries 0-127 run from red towards light grey,
        # entries 128-255 from light grey towards blue.
        palette = [(0.85, (0.85/128)*i, (0.85/128)*i) for i in range(0,128)] + [(0.85 - 0.85*(i-128.0)/128.0, 0.85 - 0.85*(i-128.0)/128.0, 0.85) for i in range(128,256)]
        color_min, color_max = kwargs.get('color_range', (min(color), max(color)))

        def value_to_color(val):
            # Degenerate colour range and sentinel values use the last palette entry.
            if color_min == color_max:
                return palette[-1]
            if val == CORRELATION_IDENTICAL or val == CORRELATION_ERROR:
                return palette[-1]
            # Normalise into [0, 1], clamp, apply a mild power curve, then
            # index into the palette.
            val_position = float((val - color_min)) / (color_max - color_min)
            val_position = min(max(val_position, 0), 1)
            val_position = math.pow(val_position, 0.925)
            ind = int(val_position * 255)
            return palette[ind]

        size = kwargs.get('size', [1]*len(x))
        size_min, size_max = kwargs.get('size_range', (min(size), max(size)))
        size_scale = kwargs.get('size_scale', 500) / len(x)

        def value_to_size(val):
            # Zero and sentinel values yield 0.0, which the drawing loop skips.
            if val == 0 or val == abs(CORRELATION_IDENTICAL) or val == abs(CORRELATION_ERROR):
                return 0.0
            if size_min == size_max:
                return 1 * size_scale
            # Fraction of the cell to fill: normalised, clamped, square-rooted.
            val_position = (val - size_min) * 0.999 / (size_max - size_min) + 0.001
            val_position = min(max(val_position, 0), 1)
            val_position = math.pow(val_position, 0.5)
            return val_position

        def do_wrapping(label, length):
            # Break long labels at "_" or "-".
            return wrap_custom(label, ["_", "-"], length)

        wrap_x = 12
        wrap_y = 13
        # Map each wrapped label to its integer position on the axis.
        x_names = [t for t in kwargs.get('x_order', sorted(set([v for v in x])))]
        x_names = [do_wrapping(label, wrap_x) for label in x_names]
        x_to_num = {p[1]:p[0] for p in enumerate(x_names)}

        y_names = [t for t in kwargs.get('y_order', sorted(set([v for v in y])))]
        y_names = [do_wrapping(label, wrap_y) for label in y_names]
        y_to_num = {p[1]:p[0] for p in enumerate(y_names)}

        figure, axs = plt.subplots(1, 1, figsize=figure_size)

        # NOTE(review): `marker` and `kwargs_pass_on` are computed but not used below.
        marker = kwargs.get('marker', 's')

        kwargs_pass_on = {k:v for k,v in kwargs.items() if k not in ['color', 'palette', 'color_range', 'size', 'size_range', 'size_scale', 'marker', 'x_order', 'y_order']}

        axs.tick_params(labelbottom='on', labeltop='on')
        axs.set_xticks([v for k,v in x_to_num.items()])
        axs.set_xticklabels([k for k in x_to_num], rotation=90, horizontalalignment='center', linespacing=0.8)
        axs.set_yticks([v for k,v in y_to_num.items()])
        axs.set_yticklabels([k for k in y_to_num], linespacing=0.85)

        # Grid lines go on the minor ticks (cell borders), not through cell centres.
        axs.grid(False, 'major')
        axs.grid(True, 'minor')
        axs.set_xticks([t + 0.5 for t in axs.get_xticks()], minor=True)
        axs.set_yticks([t + 0.5 for t in axs.get_yticks()], minor=True)

        axs.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
        axs.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])
        axs.set_facecolor('#F1F1F1')

        # Size of one data unit in pixels (recomputed per cell inside the loop).
        delta_in_pix = axs.transData.transform((1, 1)) - axs.transData.transform((0, 0))

        index = 0
        for cur_x, cur_y in zip(x,y):
            wrapped_x_name = do_wrapping(cur_x, wrap_x)
            wrapped_y_name = do_wrapping(cur_y, wrap_y)
            # Pixel coordinates of the cell's two opposite corners, rounded to whole pixels.
            before_coordinate = np.array(axs.transData.transform((x_to_num[wrapped_x_name]-0.5, y_to_num[wrapped_y_name] -0.5)))
            after_coordinate = np.array(axs.transData.transform((x_to_num[wrapped_x_name]+0.5, y_to_num[wrapped_y_name] +0.5)))
            before_pixels = np.round(before_coordinate, 0)
            after_pixels = np.round(after_coordinate, 0)
            desired_fraction = value_to_size(size[index])
            if desired_fraction == 0.0:
                # Nothing to draw for this cell.
                index += 1
                continue
            # Rectangle unless both variables are numeric.
            use_rectangle = True if dataframe_report[cur_x]["type"] != "NUM" or dataframe_report[cur_y]["type"] != "NUM" else False
            delta_in_pix = after_pixels - before_pixels
            # Shrink the cell symmetrically; the x gap (gap[0]) is applied to both axes.
            gap = np.round((1.0 - desired_fraction) * delta_in_pix / 2, 0)
            start = before_pixels + gap[0]
            ending = after_pixels - gap[0]
            # One-pixel adjustments on the x start and the y end.
            start[0] += 1
            ending[1] -= 1
            # Back from pixels to data coordinates.
            start_doc = axs.transData.inverted().transform(start)
            ending_doc = axs.transData.inverted().transform(ending)
            cur_size = ending_doc - start_doc
            if use_rectangle:
                                    cur_rect = patches.Rectangle((start_doc[0], start_doc[1]), cur_size[0], cur_size[1], facecolor=value_to_color(color[index]), antialiased=True)
            else:
                cur_rect = patches.Circle((start_doc[0] + cur_size[0] / 2, start_doc[1] + cur_size[1] / 2), cur_size[0] / 2, facecolor=value_to_color(color[index]), antialiased=True)
            cur_rect.set_antialiased(True)
            axs.add_patch(cur_rect)
            index += 1

        # Colour bar: one thin horizontal bar per palette entry, in a narrow
        # axes on the right-hand side of the figure.
        if color_min < color_max:
            ax = plt.subplot2grid((1, 15), (0, 14))
            col_x = [0] * len(palette)
            bar_y = np.linspace(color_min, color_max, len(palette))
            ax.set_ylim(-1, 1)
            bar_height = bar_y[1] - bar_y[0]
            ax.barh(
                y=bar_y,
                width=[5] * len(palette),
                left=col_x,
                height=bar_height,
                color=palette,
                linewidth=0)
            ax.set_xlim(1, 2)
            ax.grid(False)
            ax.set_facecolor('white')
            ax.set_xticks([])
            ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3))
            ax.yaxis.tick_right()
        return figure

    # Colour uses the signed value on [-1, 1]; size uses the absolute value on
    # [0, 1].  The y axis order is reversed relative to the x axis order.
    return heatmap(
        corr['y'], corr['x'],
        figure_size=(20, 15),
        color=corr['value'], color_range=[-1, 1],
        size=corr['value'].abs(), size_range=[0, 1],
        marker=marker,
        x_order=correlation_dataframe.columns,
        y_order=correlation_dataframe.columns[::-1],
        size_scale=size_scale,
        dataframe_report=dataframe_report
    )

def create_dataframe_report(df):
    """Classify every column of ``df`` as numeric or categorical.

    Returns:
        Dict mapping each column name to ``{"type": "NUM"}`` when pandas
        considers its dtype numeric, and to ``{"type": "CAT"}`` otherwise.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    return {
        col: {"type": 'NUM' if is_numeric(df[col]) else 'CAT'}
        for col in df.columns
    }

def plot_association_correlation(df, title, note):
    """Plot numeric correlations and categorical associations in one corrplot.

    Numeric/numeric cells hold the Pearson correlation; cells involving an
    object column hold the value from ``calculate_association``.  Cells with
    no computable value are marked with ``CORRELATION_ERROR`` so that
    ``corrplot`` leaves them blank.

    Parameters:
        df: input DataFrame.
        title: figure title.
        note: explanatory text drawn in a box below the plot.
    """
    # Per-column NUM/CAT classification used by corrplot to choose the shape.
    dataframe_report = create_dataframe_report(df)

    # Align both matrices on the full column list, in the DataFrame's own order.
    columns = list(df.columns)
    corr = calculate_correlation(df).reindex(index=columns, columns=columns)
    assoc = calculate_association(df).astype(float).reindex(index=columns, columns=columns)

    # Take the association value only where one exists.  Copying `assoc`
    # wholesale would overwrite every Pearson value with NaN.
    combined = corr.where(assoc.isna(), assoc)
    # corrplot cannot map NaN to a colour; the sentinel makes it skip the cell.
    combined = combined.fillna(CORRELATION_ERROR)

    # corrplot creates its own figure, so the title and the note are attached
    # to the figure it returns rather than to a separately created one.
    figure = corrplot(combined, dataframe_report, size_scale=100, marker='s')
    figure.suptitle(title)

    # Add the note
    figure.text(0.5, -0.1, note, wrap=True, horizontalalignment='center', fontsize=12, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    plt.show()

# Note and title
title = "Asociaciones"
# Adjacent literals are concatenated verbatim, so every piece must end with its
# own separator (the first one used to run straight into the "■" bullet).
note = ("Solo incluyendo el conjunto de datos analizado. "
        "■ Los cuadrados son asociaciones categóricas (coeficiente de incertidumbre y razón de correlación) de 0 a 1. "
        "El coeficiente de incertidumbre es asimétrico, (es decir, los valores de la ETIQUETA DE FILA indican cuánto INFORMAN a cada ETIQUETA en la PARTE SUPERIOR). "
        "• Los círculos son las correlaciones numéricas simétricas (Pearson) de -1 a 1. La diagonal trivial se deja intencionalmente en blanco para mayor claridad.")

# Plot
plot_association_correlation(df, title, note)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  11 09:19:35 2019

@author: alejandrokoury
"""

# Module-wide configuration read by the helpers below.
SEED = 1                  # random_state for StratifiedKFold and RandomizedSearchCV
TARGET_VARIABLE = 'cnt'   # target column; excluded from the feature sets
SPLITS = 4                # default number of cross-validation splits
ESTIMATORS = 50           # NOTE(review): not referenced in the visible code
METRIC = 'r2'             # scoring name passed to the sklearn CV utilities
TIMESERIES = True         # True -> TimeSeriesSplit, False -> StratifiedKFold

# Pick the hold-out scorer that matches METRIC; anything other than 'r2'
# falls back to accuracy.
if METRIC == 'r2':
    from sklearn.metrics import r2_score as metric_scorer
else:
    from sklearn.metrics import accuracy_score as metric_scorer


import numpy as np
import pandas as pd
import seaborn as sns
from tempfile import mkdtemp
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.stats import chi2_contingency
from scipy.stats.mstats import winsorize
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, TimeSeriesSplit, StratifiedKFold


def missing_data(df):
    """Summarise missing values per column.

    Returns:
        DataFrame indexed by column name with two columns: ``Total`` (count of
        nulls) and ``Percent`` (nulls divided by the number of rows), each
        sorted in descending order before being joined.
    """
    null_mask = df.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary

def convert_to_category(df, cols):
    """Cast the given columns of ``df`` to ``category`` dtype, in place.

    The same DataFrame object is returned for convenience.
    """
    for name in cols:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df

def drop_columns(df, cols):
    """Return a copy of ``df`` without the given columns.

    Parameters:
        df: source DataFrame (left untouched).
        cols: a single column label or a list of labels.

    Raises:
        KeyError: if a requested column does not exist.
    """
    # Pass the labels themselves.  Selecting df[cols] first only worked for
    # lists and failed for a single column name, whose values were then used
    # as labels.
    return df.drop(columns=cols)

def types(df, types, exclude = None):
    """Return the columns of ``df`` with the requested dtypes, minus exclusions.

    ``TARGET_VARIABLE`` is always excluded; names listed in ``exclude`` are
    excluded as well.
    """
    selected = df.select_dtypes(include=types)
    excluded = [TARGET_VARIABLE]
    if exclude:
        excluded.extend(exclude)
    kept = [name for name in selected.columns if name not in excluded]
    return df[kept]

def numericals(df, exclude = None):
    """Return the numeric feature columns of ``df`` (see ``types``)."""
    numeric_dtypes = [np.number]
    return types(df, numeric_dtypes, exclude)

def categoricals(df, exclude = None):
    """Return the category/object feature columns of ``df`` (see ``types``)."""
    categorical_dtypes = ['category', object]
    return types(df, categorical_dtypes, exclude)

def numerical_correlated(df, threshold=0.9):
    """Find numeric columns that are strongly rank-correlated with an earlier column.

    Returns:
        Tuple ``(columns, corr_matrix)``: the names of columns whose absolute
        Spearman correlation with any preceding numeric column exceeds
        ``threshold``, and the full absolute correlation matrix.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_part.corr(method='spearman').abs()
    # Keep the strict upper triangle so each pair is inspected only once.
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(above_diagonal)
    flagged = [name for name in upper.columns if (upper[name] > threshold).any()]
    return flagged, corr_matrix

def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical series.

    phi² = chi²/n is corrected by subtracting (k-1)(r-1)/(n-1), and the table
    dimensions r and k are corrected the same way before normalising.

    Returns ``np.nan`` when the statistic is undefined: fewer than two rows or
    columns in the contingency table, fewer than two observations, or a
    corrected dimension that leaves a non-positive denominator.  These cases
    would otherwise be a division by zero.
    """
    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    n = confusion_matrix.sum().sum()
    if r < 2 or k < 2 or n < 2:
        return np.nan
    chi2 = chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2corr / denominator)

def categorical_correlated(df, threshold=0.9):
    """Find categorical columns that are strongly associated with an earlier column.

    Every pair of ``object``/``category`` columns is compared with
    ``cramers_v``.  Columns of ``category`` dtype are compared through their
    integer category codes; ``object`` columns are compared on their raw
    values.  The diagonal is left as NaN.

    Returns:
        Tuple ``(columns, corr)``: the names of columns whose association with
        any preceding categorical column exceeds ``threshold``, and the
        symmetric float association matrix.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    corr = pd.DataFrame(index=columns, columns=columns, dtype=float)

    def as_labels(series):
        # Category columns are compared via their codes, as before.
        return series.cat.codes if series.dtype.name == 'category' else series

    labels = {name: as_labels(df[name]) for name in columns}

    # Compare each unordered pair once (never a column with itself) and mirror
    # the value.  Object columns take part too: they are selected above and
    # used to be silently left as NaN.
    for i, first in enumerate(columns):
        for second in columns[i + 1:]:
            cell = cramers_v(labels[first], labels[second])
            corr.at[first, second] = cell
            corr.at[second, first] = cell

    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    flagged = [column for column in upper.columns if (upper[column] > threshold).any()]
    return flagged, corr



def correlated(df, threshold = 0.9):
    """Report and plot highly associated categorical and numeric columns.

    Draws one annotated heatmap for the categorical association matrix and one
    for the numeric correlation matrix, prints the flagged columns, and
    returns ``(flagged_columns, categorical_matrix, numeric_matrix)``.
    """
    categorical_cols, categorical_matrix = categorical_correlated(df, threshold)
    numeric_cols, numeric_matrix = numerical_correlated(df, threshold)

    panels = (
        (categorical_matrix, 'Categorical Correlation'),
        (numeric_matrix, 'Numerical Correlation'),
    )
    for matrix, heading in panels:
        plt.figure(figsize=(12,10))
        axes = sns.heatmap(matrix,cbar=True,fmt =' .2f', annot=True, cmap='viridis')
        axes.set_title(heading, fontsize=30)

    correlated_cols = categorical_cols + numeric_cols

    if len(correlated_cols) > 0:
        print('The following columns are correlated with a threshold of ' + str(threshold) + ': ' + str(correlated_cols))
    else:
        print('No correlated columns for the  ' + str(threshold) + ' threshold')

    return correlated_cols, categorical_matrix, numeric_matrix

def winsorize_data(df, train_df, cols):
    """Winsorize ``train_df`` and clip ``df`` to the resulting bounds.

    For every column in ``cols`` the training data is winsorized at the 1st
    and 99th percentiles (in place), and values of ``df`` outside the
    winsorized training range are replaced by the nearest bound (in place).

    Parameters:
        df: DataFrame to clip; modified in place and returned.
        train_df: training DataFrame defining the bounds; modified in place.
        cols: names of the columns to process.

    Returns:
        ``df`` with out-of-range values clipped.
    """
    for col in cols:
        train_df[col] = winsorize(train_df[col], limits = [0.01, 0.01])
        lower = train_df[col].min()
        upper = train_df[col].max()
        # A single .loc assignment writes into `df` itself; the chained form
        # df[mask][col] = ... assigns to a temporary copy and has no effect.
        df.loc[df[col] > upper, col] = upper
        df.loc[df[col] < lower, col] = lower
    return df

def lof(df, training_df):
    """Drop the rows flagged as outliers by a Local Outlier Factor model.

    The model is fitted on ``training_df``; rows it labels ``-1`` are removed
    from ``df`` by position, so ``df`` and ``training_df`` are expected to be
    row-aligned.

    Returns:
        A copy of ``df`` without the outlier rows.
    """
    detector = LocalOutlierFactor(n_neighbors=20, contamination='auto')
    y_pred = detector.fit_predict(training_df)
    outlier_positions = np.flatnonzero(y_pred == -1)
    print('Removing ' + str(len(outlier_positions)) + ' records')
    # fit_predict yields positions, while DataFrame.drop expects index labels;
    # translate through df.index so a non-default index is handled correctly.
    return df.drop(df.index[outlier_positions])

def one_hot_encode(df, cols):
    """Replace each listed column with its one-hot (dummy) columns.

    Dummy columns are named ``<column>_<value>`` and appended to the right of
    the frame; no level is dropped.  A new DataFrame is returned.
    """
    encoded = df
    for name in cols:
        indicator_cols = pd.get_dummies(encoded[name], prefix=name, drop_first = False)
        encoded = pd.concat([encoded, indicator_cols], axis = 1).drop(name, axis = 1)

    return encoded

def under_represented_features(df, threshold = 0.99, holdout_df = None):
    """Drop columns dominated by a single value.

    A column is "underrepresented" when its most frequent value accounts for
    more than ``threshold`` of the rows.  The target variable is reported but
    never dropped.

    Parameters:
        df: DataFrame to inspect.
        threshold: share of rows above which the majority value is dominant.
        holdout_df: optional second DataFrame to drop the same columns from.

    Returns:
        ``df`` without the underrepresented columns, or a tuple
        ``(df, holdout_df)`` when ``holdout_df`` is given.
    """
    under_rep = []
    n_rows = len(df)
    for column in df:
        counts = df[column].value_counts()
        if counts.empty:
            # Column with no non-null values: there is no majority value.
            continue
        majority_freq = counts.iloc[0]
        if (majority_freq / n_rows) > threshold:
            under_rep.append(column)

    if not under_rep:
        print('No underrepresented features')
    else:
        if TARGET_VARIABLE in under_rep:
            print('The target variable is underrepresented, consider rebalancing')
            # Keep the target; only feature columns are dropped.
            under_rep.remove(TARGET_VARIABLE)
        print(str(under_rep) + ' underrepresented, removing')

    df = drop_columns(df, under_rep)

    if holdout_df is not None:
        return df, drop_columns(holdout_df, under_rep)

    return df

def feature_importance(df, model):
    """Fit ``model`` through ``cv_evaluate`` and plot its feature importances.

    The fitted model must expose ``feature_importances_`` and ``estimators_``.
    Bars are sorted by importance; the error bars show the standard deviation
    across the individual estimators.
    """
    _, _, fitted = cv_evaluate(df, model = model)
    importances = fitted.feature_importances_
    spread = np.std([tree.feature_importances_ for tree in fitted.estimators_],axis=0)
    order = np.argsort(importances)

    # Every column except the target is a feature.
    feature_names = list(df.loc[:, df.columns != TARGET_VARIABLE])
    n_features = len(feature_names)
    positions = range(n_features)

    print("Feature ranking:")
    plt.figure(figsize=(16, 14))
    plt.title("Feature importances")
    plt.barh(positions, importances[order],color="r", xerr=spread[order], align="center")
    plt.yticks(positions, [feature_names[i] for i in order])
    plt.ylim([-1, n_features])
    plt.show()

def plot_pca_components(df, variance = 0.9):
    """Plot the cumulative explained variance of a PCA fitted on the features.

    All columns except the target are used as features.  ``variance`` is
    accepted but not used by the plot.
    """
    feature_mask = df.columns != TARGET_VARIABLE
    X = df.loc[:, feature_mask]
    # The target is looked up but not used; the lookup fails if it is absent.
    y = df.loc[:, TARGET_VARIABLE]
    cumulative_variance = np.cumsum(PCA().fit(X).explained_variance_ratio_)

    plt.figure()
    plt.plot(cumulative_variance)
    plt.xlabel('Number of Components')
    plt.ylabel('Variance (%)')
    plt.show()

def cv_evaluate(df, model, splits = SPLITS, transformers = None, grid = None):
    """Cross-validate ``model`` on the first 85% of ``df`` and score it on the rest.

    Parameters:
        df: DataFrame containing the features and the target column.
        model: estimator to evaluate.
        splits: number of cross-validation splits.
        transformers: optional transformers placed, in order, before the model
            in a pipeline.
        grid: optional parameter distributions; when given, a randomized
            search (20 iterations) replaces plain cross-validation.

    Returns:
        Tuple ``(final_score, scores, model)``: the hold-out score, the
        cross-validation scores, and the fitted model (or search object).
    """
    feature_mask = df.columns != TARGET_VARIABLE
    X = df.loc[:, feature_mask]
    y = df.loc[:, TARGET_VARIABLE]

    folds = (
        TimeSeriesSplit(n_splits = splits)
        if TIMESERIES
        else StratifiedKFold(n_splits = splits, shuffle = True, random_state=SEED)
    )

    # Ordered split: the leading 85% trains, the trailing 15% validates.
    n_rows = len(df)
    cutoff = int(n_rows * 0.85)
    X_train, X_validate = X[0:cutoff], X[cutoff:n_rows]
    y_train, y_validate = y[0:cutoff], y[cutoff:n_rows]

    if transformers:
        cachedir = mkdtemp()
        model = make_pipeline(model, memory = cachedir)
        # Insert each transformer ahead of the model, keeping the given order.
        for position, step in enumerate(transformers):
            model.steps.insert(position, [str(position + 1), step])

    if grid:
        model = RandomizedSearchCV(model, grid, scoring = METRIC, cv = folds, n_iter = 20, refit=True, return_train_score = False, error_score=0.0, random_state = SEED)
        model.fit(X_train, y_train)
        scores = model.cv_results_['mean_test_score']
    else:
        scores = cross_val_score(model, X_train, y_train, scoring = METRIC, cv = folds)
        model.fit(X_train, y_train)

    predictions = model.predict(X_validate)
    final_score = metric_scorer(y_validate, predictions)

    return final_score, scores, model

def feature_engineering_pipeline(df, models, transformers, splits = SPLITS):
    """Evaluate each transformer against each model and keep the ones that help.

    For every model a baseline hold-out score is computed, then each
    transformer is evaluated on its own; a transformer is accepted when it
    beats the baseline and is then stored in ``model['transformers']``.

    Parameters:
        df: DataFrame containing the features and the target column.
        models: list of dicts with keys ``'name'`` and ``'model'``; each dict
            is updated in place with ``'score'`` and ``'transformers'``.
        transformers: list of dicts with keys ``'name'`` and ``'transformer'``.
        splits: number of cross-validation splits.

    Returns:
        Tuple ``(pipelines, all_scores)``: the result of ``create_pipelines``
        and a DataFrame with one row per evaluation.
    """
    score_columns = ['Model', 'Function', 'CV Score', 'Holdout Score', 'Difference', 'Outcome']
    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []

    for model in models:
        top_cv_score, cv_scores, cv_model = cv_evaluate(df, model = model['model'], splits = splits)
        model['score'] = best_score = top_cv_score
        model['transformers'] = []
        positive_scores = cv_scores[cv_scores > 0.0]
        rows.append({'Model': model['name'], 'Function':'base_score','CV Score': '{:.2f} +/- {:.02}'.format(np.mean(positive_scores),np.std(positive_scores)),'Holdout Score': top_cv_score, 'Difference': 0, 'Outcome': 'Base ' + model['name']})

        for transformer in transformers:
            engineered_data = df.copy()
            outcome = 'Rejected'

            try:
                top_transformer_score, transformer_scores, cv_model = cv_evaluate(engineered_data, model = model['model'], transformers = [transformer['transformer']], splits = splits)
                difference = (top_transformer_score - best_score)

                if difference > 0:
                    candidate = transformer['transformer']
                    # The list holds transformer objects, not dicts, so
                    # de-duplicate by identity rather than by a 'name' key.
                    model['transformers'] = [kept for kept in model['transformers'] if kept is not candidate]
                    model['transformers'].append(candidate)
                    outcome = 'Accepted'

                positive_scores = transformer_scores[transformer_scores > 0.0]
                mean = np.mean(positive_scores)
                std = np.std(positive_scores)
                if np.isnan(mean) or np.isnan(std):
                    mean = 0.00
                    std = 0.00

                score = {'Model': model['name'], 'Function':transformer['name'],'CV Score': '{:.2f} +/- {:.02}'.format(mean,std),'Holdout Score': top_transformer_score, 'Difference': difference, 'Outcome': outcome}

            except Exception:
                # Best effort: a failing transformer is recorded as an error
                # row, while KeyboardInterrupt/SystemExit still propagate.
                score = {'Model': model['name'], 'Function':transformer['name'],'CV Score': '0.00 +/- 0.00','Holdout Score': 0, 'Difference': 0, 'Outcome': 'Error'}

            rows.append(score)

    all_scores = pd.DataFrame(rows, columns = score_columns)
    return create_pipelines(models), all_scores

def create_pipelines(pipes):
    """Attach a fitted-step cache pipeline to each entry and rank the entries.

    Each dict in ``pipes`` gains a ``'pipeline'`` key holding its transformers
    followed by its model; all pipelines share one temporary cache directory.

    Returns:
        The entries sorted by ``'score'``, best first.
    """
    cachedir = mkdtemp()
    for entry in pipes:
        steps = [*entry['transformers'], entry['model']]
        entry['pipeline'] = make_pipeline(*steps, memory = cachedir)

    ranked = sorted(pipes, key=lambda k: k['score'], reverse = True)
    return ranked


# Usage
# NOTE(review): `df_tratada` is not defined anywhere in the visible code; it
# must be created before this line runs.
correlated_cols, categoric_cols, numeric_cols = correlated(df_tratada, 0.9)