<a href="https://colab.research.google.com/github/HenryZumaeta/MISCELANEAS/blob/Zeta/PYTHON/Imputacion_Faltantes_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imputación por Random Forest
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


# Split the columns into numeric and categorical (object dtype) groups.
num_cols = df_combined.select_dtypes(include=[np.number]).columns
cat_cols = df_combined.select_dtypes(include=[object]).columns

# Impute the numeric variables with a random-forest-driven IterativeImputer.
if len(num_cols) > 0:
    imputer_num = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=0)
    df_combined[num_cols] = imputer_num.fit_transform(df_combined[num_cols])

# Impute each categorical variable with a RandomForestClassifier trained on
# the (already imputed) numeric columns.
for col in cat_cols:
    as_category = df_combined[col].astype('category')
    # Remember the original labels BEFORE encoding so they can be restored.
    categories = as_category.cat.categories
    codes = as_category.cat.codes.to_numpy().copy()
    # pandas encodes a missing category as code -1 (not NaN), so the missing
    # rows have to be located explicitly.
    missing = codes == -1
    if missing.any() and not missing.all():
        if len(num_cols) > 0:
            clf = RandomForestClassifier(random_state=0)
            clf.fit(df_combined.loc[~missing, num_cols], codes[~missing])
            codes[missing] = clf.predict(df_combined.loc[missing, num_cols])
        else:
            # No predictors available: fall back to the most frequent category.
            codes[missing] = np.bincount(codes[~missing]).argmax()
    # Rebuild the column with the original category labels.
    df_combined[col] = pd.Categorical.from_codes(codes, categories=categories)

# Save the imputed DataFrame to a CSV file
#df_combined.to_csv('raleo_base_clus_clusteres_imputed.csv', index=False)

print("DataFrame imputado:\n", df_combined.head())


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils.validation import check_is_fitted

def missForest(xmis, max_iter=10, n_trees=100, variablewise=False, decreasing=False,
               verbose=False, mtry=None, replace=True, classwt=None, cutoff=None,
               strata=None, sampsize=None, nodesize=None, maxnodes=None, xtrue=None,
               parallelize="no"):
    """Impute missing values in a DataFrame with iterated random forests.

    Every column must be numeric or of pandas ``category`` dtype. Missing
    entries start at the column mean (numeric) or most frequent category
    (factor); then each variable with missing values is repeatedly predicted
    from all other variables with a random forest. Iteration stops when the
    difference between consecutive imputations stops decreasing for every
    variable type, or after ``max_iter`` iterations.

    Args:
        xmis: DataFrame with missing values.
        max_iter: Maximum number of iterations.
        n_trees: Number of trees per forest.
        variablewise: If True, report one error per variable (MSE for
            numeric, proportion of falsely classified for factors) instead of
            one NRMSE / PFC value per variable type.
        decreasing: If True, visit variables from most to least missing.
        verbose: Print progress information.
        mtry: Features tried at each split; defaults to floor(sqrt(p)) and is
            clipped to the number of available predictors when an integer.
        replace: Passed to the forests as ``bootstrap``. When False no
            out-of-bag samples exist and the error falls back to the
            in-sample error.
        classwt, cutoff, strata, sampsize, nodesize, maxnodes, parallelize:
            Accepted for signature compatibility; not used by this
            implementation.
        xtrue: Optional complete DataFrame; when given, ``mix_error`` is
            evaluated and returned under the key ``"error"``.

    Returns:
        dict with ``"ximp"`` (imputed DataFrame; columns that are entirely
        missing are removed), ``"OOBerror"`` (numpy array) and, when ``xtrue``
        is given, ``"error"``.

    Raises:
        ValueError: If a column is neither numeric nor categorical.
    """
    # Variables without any observed value cannot be modelled. Drop them
    # before types are detected so that all per-column bookkeeping lines up.
    fully_missing = xmis.isnull().all()
    truth = xtrue
    if fully_missing.any():
        indCmis = xmis.columns[fully_missing]
        xmis = xmis.drop(columns=indCmis)
        if xtrue is not None:
            truth = xtrue.drop(columns=indCmis, errors="ignore")
        if verbose:
            print(f"  removed variable(s) {indCmis} due to the missingness of all entries")

    n, p = xmis.shape
    columns = list(xmis.columns)
    na_mask = xmis.isnull().to_numpy()
    na_counts = na_mask.sum(axis=0)

    # Working matrix: numeric values as floats, factors as integer codes, so
    # every column can serve as a predictor for scikit-learn.
    var_type = []
    cat_dtypes = {}
    values = np.empty((n, p), dtype=float)
    for j, col in enumerate(columns):
        series = xmis.iloc[:, j]
        if isinstance(series.dtype, pd.CategoricalDtype):
            var_type.append("factor")
            cat_dtypes[j] = series.dtype
            codes = series.cat.codes.to_numpy().astype(float)
            observed_codes = codes[codes >= 0].astype(int)
            # Initial guess: most frequent observed category.
            codes[codes < 0] = np.bincount(observed_codes).argmax()
            values[:, j] = codes
        elif pd.api.types.is_numeric_dtype(series):
            var_type.append("numeric")
            # Initial guess: column mean.
            values[:, j] = series.astype(float).fillna(series.mean()).to_numpy()
        else:
            raise ValueError(f"Column {col} must be factor or numeric")

    # Variable types present, always ordered numeric first, factor second.
    present = [kind for kind in ("numeric", "factor") if kind in var_type]
    members = {kind: np.array([t == kind for t in var_type], dtype=bool) for kind in present}

    def _decode(matrix):
        """Turn the working matrix back into a DataFrame shaped like xmis."""
        decoded = {}
        for j, col in enumerate(columns):
            if na_counts[j] == 0:
                # Untouched column: hand back the original (keeps its dtype).
                decoded[col] = xmis.iloc[:, j]
            elif var_type[j] == "factor":
                labels = pd.Categorical.from_codes(matrix[:, j].astype(int), dtype=cat_dtypes[j])
                decoded[col] = pd.Series(labels, index=xmis.index)
            else:
                decoded[col] = pd.Series(matrix[:, j], index=xmis.index)
        return pd.DataFrame(decoded, index=xmis.index)

    def _error_estimate(per_variable):
        """Summarise per-variable errors as NRMSE / PFC (or return them as is)."""
        if variablewise:
            return per_variable.copy()
        summary = np.zeros(len(present))
        for pos, kind in enumerate(present):
            cols = members[kind]
            if kind == "numeric":
                # Normalise by the variance of the observed numeric entries.
                observed = values[:, cols][~na_mask[:, cols]]
                spread = np.var(observed, ddof=1) if observed.size > 1 else np.nan
                summary[pos] = np.sqrt(np.mean(per_variable[cols]) / spread) if spread > 0 else np.nan
            else:
                summary[pos] = np.mean(per_variable[cols])
        return summary

    # Visit variables by amount of missingness; complete variables are skipped.
    order = np.argsort(na_counts, kind="stable")
    if decreasing:
        order = order[::-1]
    visit = [j for j in order if na_counts[j] > 0]

    OOBerror = np.zeros(p)

    # Nothing to model: no missing values, no predictors, or no iterations.
    if not visit or p < 2 or max_iter < 1:
        result = {"ximp": _decode(values), "OOBerror": _error_estimate(OOBerror)}
        if xtrue is not None:
            result["error"] = mix_error(result["ximp"], xmis, truth)
        return result

    if mtry is None:
        mtry = int(np.floor(np.sqrt(p)))
    if isinstance(mtry, (int, np.integer)):
        # Only p - 1 predictors are available for each target variable.
        mtry = int(min(max(mtry, 1), p - 1))

    use_oob = bool(replace)
    conv_new = np.zeros(len(present))
    conv_old = np.full(len(present), np.inf)
    latest = None    # state after the most recent iteration
    before = None    # state after the iteration preceding `latest`
    iteration = 0

    while iteration < max_iter and np.any(conv_new < conv_old):
        if iteration != 0:
            conv_old = conv_new.copy()

        if verbose:
            print(f"  missForest iteration {iteration + 1} in progress...")

        previous = values.copy()

        for j in visit:
            misi = na_mask[:, j]
            obsi = ~misi
            predictors = np.delete(values, j, axis=1)

            if var_type[j] == "numeric":
                target = values[obsi, j]
                model = RandomForestRegressor(n_estimators=n_trees, max_features=mtry,
                                              bootstrap=replace, oob_score=use_oob)
                model.fit(predictors[obsi], target)
                fitted = model.oob_prediction_ if use_oob else model.predict(predictors[obsi])
                OOBerror[j] = np.mean((fitted - target) ** 2)
            else:
                target = values[obsi, j].astype(int)
                model = RandomForestClassifier(n_estimators=n_trees, max_features=mtry,
                                               bootstrap=replace, oob_score=use_oob)
                model.fit(predictors[obsi], target)
                accuracy = model.oob_score_ if use_oob else model.score(predictors[obsi], target)
                OOBerror[j] = 1 - accuracy

            values[misi, j] = model.predict(predictors[misi])

        iteration += 1

        # Difference between this imputation and the previous one, per type.
        conv_new = np.zeros(len(present))
        for pos, kind in enumerate(present):
            cols = members[kind]
            if kind == "numeric":
                denom = np.sum(values[:, cols] ** 2)
                numer = np.sum((values[:, cols] - previous[:, cols]) ** 2)
                conv_new[pos] = numer / denom if denom > 0 else 0.0
            else:
                conv_new[pos] = np.sum(values[:, cols] != previous[:, cols]) / (n * cols.sum())

        OOBerr = _error_estimate(OOBerror)
        err = mix_error(_decode(values), xmis, truth) if xtrue is not None else None

        if verbose:
            if xtrue is not None:
                print(f"    error(s): {err}")
            print(f"    estimated error(s): {OOBerr}")
            print(f"    difference(s): {conv_new}")

        before, latest = latest, (values.copy(), OOBerr, err)

    # When the loop stopped because the difference grew again, the previous
    # iteration is the one to keep; otherwise keep the last one.
    if iteration == max_iter or before is None:
        matrix, oob, err = latest
    else:
        matrix, oob, err = before

    result = {"ximp": _decode(matrix), "OOBerror": oob}
    if xtrue is not None:
        result["error"] = err
    return result

def mix_error(ximp, xmis, xtrue):
    """Compute the per-column imputation error against the true data.

    Only entries that are missing in ``xmis`` contribute. For numeric columns
    (as judged by the dtype in ``xtrue``) the error is the normalised RMSE
    ``sqrt(mean((ximp - xtrue)^2) / var(xtrue))`` over the missing entries;
    for all other columns it is the proportion of falsely imputed entries.

    Args:
        ximp: Imputed DataFrame.
        xmis: DataFrame with the missing values (same shape as ``ximp``).
        xtrue: Complete DataFrame holding the true values.

    Returns:
        numpy array with one error per column. Columns without missing
        entries get 0.0; numeric columns whose true missing values have no
        defined or zero variance (e.g. a single missing entry) get NaN.
    """
    n, p = xtrue.shape
    error = np.zeros(p)
    for j in range(p):
        missing = xmis.iloc[:, j].isnull().to_numpy()
        n_missing = int(missing.sum())
        if n_missing == 0:
            # Nothing was imputed in this column, so there is no error.
            continue

        imputed = ximp.iloc[:, j].to_numpy()[missing]
        actual = xtrue.iloc[:, j].to_numpy()[missing]

        if pd.api.types.is_numeric_dtype(xtrue.iloc[:, j]):
            imputed = imputed.astype(float)
            actual = actual.astype(float)
            mse = np.mean((imputed - actual) ** 2)
            spread = np.var(actual, ddof=1) if n_missing > 1 else np.nan
            error[j] = np.sqrt(mse / spread) if spread > 0 else np.nan
        else:
            # Compare as generic objects so categorical and object columns
            # with different category sets can still be compared.
            mismatches = np.asarray(imputed, dtype=object) != np.asarray(actual, dtype=object)
            error[j] = np.mean(mismatches)
    return error


In [None]:
# Example data: three columns, each with one missing value.
data = {
    'A': [1, 2, np.nan, 4],
    'B': [np.nan, 1, 3, 4],
    'C': [4, 2, np.nan, 3],
}

# Build the frame and mark column C as categorical in a single step.
df = pd.DataFrame(data).astype({'C': 'category'})

# Run missForest and pull the imputed frame out of the result dict.
result = missForest(df, max_iter=10, n_trees=100, verbose=True)
imputed_data = result['ximp']

# Show the imputed data.
print(imputed_data)

In [None]:
#!pip install missingpy

import numpy as np
import pandas as pd
from missingpy import MissForest

# Build an example DataFrame with missing values
data = {
    'A': [1, 2, np.nan, 4],
    'B': [np.nan, 1, 3, 4],
    'C': ['a', 'b', np.nan, 'd']
}
df = pd.DataFrame(data)

# Convert the categorical column to 'category' dtype
df['C'] = df['C'].astype('category')

# missingpy's MissForest works on a numeric array, so the categorical column
# is label-encoded first (missing category -> NaN instead of code -1).
encoded = df.copy()
encoded['C'] = df['C'].cat.codes.astype(float).replace(-1.0, np.nan)

# Initialise the MissForest imputer
imputer = MissForest(verbose=True)

# Run the imputation, telling the imputer which column index is categorical
imputed_data = imputer.fit_transform(encoded, cat_vars=[df.columns.get_loc('C')])

# Convert the result back to a DataFrame and restore the category labels
imputed_df = pd.DataFrame(imputed_data, columns=df.columns)
imputed_df['C'] = pd.Categorical.from_codes(
    imputed_df['C'].round().astype(int), dtype=df['C'].dtype
)

# Show the imputed data
print(imputed_df)


In [None]:
#pip install MissForest
from missforest.missforest import MissForest
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


# The missforest package operates on pandas DataFrames (it addresses columns
# by name), so the toy data is wrapped in a DataFrame instead of a plain list.
# NOTE(review): the positional MissForest(clf, rgr) call is kept as written —
# confirm it matches the installed package version.
df = pd.DataFrame(
    [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]],
    columns=["A", "B", "C"],
)

np.random.seed(0)
clf = RandomForestClassifier(n_jobs=-1)
rgr = RandomForestRegressor(n_jobs=-1)

mf = MissForest(clf, rgr)
df_imputed = mf.fit_transform(df)

In [None]:
from missingpy import MissForest
from sklearn.neighbors import DistanceMetric
# Minimal missingpy example on a nested list with two missing entries.
# NOTE(review): `nan` is not used anywhere in this cell (the data uses np.nan).
nan = float("NaN")
# `np` is not imported in this cell; it relies on an import made in an earlier cell.
np.random.seed(0)
X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = MissForest()
# Last expression of the cell: the imputed array is shown as the cell output.
imputer.fit_transform(X)

In [None]:
import pandas as pd
import numpy as np
from missingpy import MissForest

# Build a small all-numeric DataFrame with one missing value per column.
np.random.seed(0)
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, 5],
    'C': [1, np.nan, 3, 4, 5],
})

# Show the original data, missing values included.
print("DataFrame original con valores faltantes:", df, sep="\n")

# Impute with MissForest and wrap the returned array in a DataFrame that
# carries the original column names.
imputer = MissForest()
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Show the imputed data.
print("\nDataFrame con valores imputados:", df_imputed, sep="\n")


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge, Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import resample

# LogisticRegression is used by the 'logreg' family below but is not among
# this cell's imports; without it building the dict raises NameError.
from sklearn.linear_model import LogisticRegression

# Map each method name to the imputer that implements (or approximates) it.
# Entries marked "adjusted" / "substituted" reuse the closest available
# scikit-learn estimator for that method name.
method_map = {
    'mean': SimpleImputer(strategy='mean'),
    'norm': IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0),
    'norm.nob': IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0),  # adjusted
    'norm.boot': IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0),  # adjusted
    'norm.predict': IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0),  # adjusted
    'lasso.norm': IterativeImputer(estimator=Lasso(), max_iter=10, random_state=0),  # substituted
    'lasso.select.norm': IterativeImputer(estimator=Lasso(), max_iter=10, random_state=0),  # substituted
    'quadratic': IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0),  # substituted
    'ri': IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0),
    'logreg': IterativeImputer(estimator=LogisticRegression(), max_iter=10, random_state=0),
    'logreg.boot': IterativeImputer(estimator=LogisticRegression(), max_iter=10, random_state=0),  # adjusted
    'lasso.logreg': IterativeImputer(estimator=LogisticRegression(), max_iter=10, random_state=0),  # substituted
    'lasso.select.logreg': IterativeImputer(estimator=LogisticRegression(), max_iter=10, random_state=0),  # substituted
    'rf': IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=0),
    'cart': IterativeImputer(estimator=DecisionTreeRegressor(), max_iter=10, random_state=0),
    # Further methods can be added here
}

def imputar_datos_numericos(data, method='mean', m=5, max_iter=10, random_state=None):
    """
    Impute numeric data with one of the methods registered in ``method_map``.

    Each of the ``m`` imputations is produced by a fresh clone of the
    registered imputer, so the shared objects in ``method_map`` are never
    refitted or mutated.

    Args:
    data (pd.DataFrame): DataFrame with the data to impute.
    method (str): Imputation method; must be a key of ``method_map``
                  ('mean', 'norm', 'logreg', 'rf', 'cart', etc.).
    m (int): Number of imputed data sets to generate.
    max_iter (int): Maximum number of iterations per imputation; applied to
                    imputers that expose a ``max_iter`` parameter.
    random_state (int): Seed for reproducibility. When an integer is given,
                        imputation ``i`` uses ``random_state + i``; when None,
                        the seed configured in ``method_map`` is kept.
                        Methods without any random component still return
                        identical data sets.

    Returns:
    list: A list of ``m`` imputed DataFrames (same columns and index as ``data``).

    Raises:
    ValueError: If ``method`` is not a key of ``method_map``.
    """
    from sklearn.base import clone

    if method not in method_map:
        raise ValueError(f"Método de imputación {method} no reconocido. "
                         f"Elija entre {list(method_map.keys())}")

    template = method_map[method]
    # Only override parameters the chosen imputer actually has
    # (SimpleImputer, for instance, has neither max_iter nor random_state).
    supported = template.get_params(deep=False)

    imputaciones = []
    for draw in range(m):
        imputer = clone(template)
        overrides = {}
        if 'max_iter' in supported:
            overrides['max_iter'] = max_iter
        if random_state is not None and 'random_state' in supported:
            if isinstance(random_state, (int, np.integer)):
                overrides['random_state'] = int(random_state) + draw
            else:
                overrides['random_state'] = random_state
        imputer.set_params(**overrides)

        imputacion = pd.DataFrame(imputer.fit_transform(data),
                                  columns=data.columns, index=data.index)
        imputaciones.append(imputacion)

    return imputaciones

# Usage example
if __name__ == "__main__":
    # Small demo frame with gaps in every column.
    np.random.seed(0)
    df = pd.DataFrame({
        'A': [1, 2, np.nan, 4, 5],
        'B': [5, np.nan, np.nan, 8, 10],
        'C': [np.nan, 1, 1, 2, np.nan],
    })

    # Show the original data.
    print("DataFrame original:")
    print(df)

    # Mean imputation: generate the data sets, then print them numbered from 1.
    imputaciones = imputar_datos_numericos(df, method='mean', m=5, max_iter=10, random_state=0)
    for numero, imputacion in enumerate(imputaciones, start=1):
        print(f"\nDataFrame imputado {numero} (método 'mean'):")
        print(imputacion)

    # Same again with the 'norm' method.
    imputaciones = imputar_datos_numericos(df, method='norm', m=5, max_iter=10, random_state=0)
    for numero, imputacion in enumerate(imputaciones, start=1):
        print(f"\nDataFrame imputado {numero} (método 'norm'):")
        print(imputacion)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
from sklearn.metrics import mean_squared_error, accuracy_score

def missForest(xmis, maxiter=10, ntree=100, variablewise=False,
               decreasing=False, verbose=False, mtry=None,
               replace=True, classwt=None, cutoff=None, strata=None,
               sampsize=None, nodesize=None, maxnodes=None, xtrue=None,
               parallelize='no'):
    """Impute missing values with iterated random forests.

    Numeric columns start at their mean, all other columns ("factors") at
    their most frequent value. Factors are held as integer codes while the
    forests run, so they can act as predictors, and are decoded back to their
    original labels before returning. Each variable with missing values is
    predicted by ``impute_column`` from all other variables until the
    difference between consecutive imputations stops decreasing, or
    ``maxiter`` iterations have run.

    Args:
        xmis: DataFrame with missing values.
        maxiter: Maximum number of iterations.
        ntree: Number of trees per forest.
        decreasing: If True, visit variables from most to least missing.
        verbose: Print progress information.
        mtry: Features tried per split; defaults to int(sqrt(p)) and is
            clipped to the number of available predictors when an integer.
        replace: Forwarded to ``impute_column`` (bootstrap sampling).
        nodesize: Optional pair (numeric, factor) of minimum leaf sizes.
        maxnodes: Optional maximum number of leaf nodes.
        parallelize: 'no', 'variables' (impute all variables of an iteration
            in parallel via joblib) or 'forests' (accepted; handled like
            'no' here).
        variablewise, classwt, cutoff, strata, sampsize, xtrue: Accepted and,
            where the original did so, length-checked; otherwise unused.

    Returns:
        Tuple ``(ximp, OOBerror)``: the imputed DataFrame (columns that are
        entirely missing are removed) and a numpy array with one error value
        per remaining column, as reported by ``impute_column``.

    Raises:
        ValueError: For an unknown ``parallelize`` value, or when
            parallelisation is requested on mixed dtypes.
    """
    n, p = xmis.shape

    if classwt is not None:
        assert len(classwt) == p and isinstance(classwt, list), "classwt must be a list of length p"
    if cutoff is not None:
        assert len(cutoff) == p and isinstance(cutoff, list), "cutoff must be a list of length p"
    if strata is not None:
        assert len(strata) == p and isinstance(strata, list), "strata must be a list of length p"
    if nodesize is not None:
        assert len(nodesize) == 2, "nodesize must be of length 2"

    # Remove columns with all missing values
    na_counts = xmis.isnull().sum()
    all_na_cols = na_counts[na_counts == n].index
    if len(all_na_cols) > 0:
        xmis = xmis.drop(columns=all_na_cols)
        p = xmis.shape[1]
        if verbose:
            print(f"Removed columns {all_na_cols.tolist()} due to all values being missing.")

    parallelize = parallelize.lower()
    if parallelize not in ['no', 'variables', 'forests']:
        raise ValueError("parallelize must be one of 'no', 'variables', or 'forests'")

    if parallelize != 'no' and len(pd.unique(xmis.dtypes)) > 1:
        raise ValueError("Parallelization is not supported with mixed data types")

    # Computed after dropping columns: only p - 1 predictors exist per target.
    if mtry is None:
        mtry = int(np.sqrt(p))
    if isinstance(mtry, (int, np.integer)):
        mtry = int(min(max(mtry, 1), max(p - 1, 1)))

    # Initial imputation (and integer encoding of the factor columns)
    ximp = xmis.copy()
    na_loc = xmis.isnull()
    kinds = [
        'numeric'
        if pd.api.types.is_numeric_dtype(xmis[col]) and not pd.api.types.is_bool_dtype(xmis[col])
        else 'factor'
        for col in xmis.columns
    ]
    varType = pd.Series(kinds, index=xmis.columns, dtype=object)

    factor_dtypes = {}      # column -> CategoricalDtype used for decoding
    keep_categorical = {}   # column -> True if the input column was categorical
    for col in ximp.columns:
        if varType[col] == 'numeric':
            ximp[col] = ximp[col].fillna(ximp[col].mean())
        else:
            keep_categorical[col] = isinstance(xmis[col].dtype, pd.CategoricalDtype)
            as_category = xmis[col].astype('category')
            factor_dtypes[col] = as_category.dtype
            codes = as_category.cat.codes.astype('int64')
            # Code -1 marks a missing value; start from the most common code.
            most_common = codes[codes >= 0].mode()[0]
            ximp[col] = codes.where(codes >= 0, most_common)

    def _decode(frame):
        """Map the integer codes of the factor columns back to their labels."""
        out = frame.copy()
        for col, dtype in factor_dtypes.items():
            labels = pd.Categorical.from_codes(out[col].to_numpy().astype(int), dtype=dtype)
            out[col] = labels if keep_categorical[col] else np.asarray(labels, dtype=object)
        return out

    # Order variables by number of missing values; complete ones are skipped
    noNAvar = na_loc.sum()
    sort_j = noNAvar.sort_values(ascending=not decreasing).index
    nzsort_j = [col for col in sort_j if noNAvar[col] > 0]

    col_pos = {col: pos for pos, col in enumerate(ximp.columns)}
    numeric_indices = np.flatnonzero(varType.to_numpy() == 'numeric')
    factor_indices = np.flatnonzero(varType.to_numpy() == 'factor')

    OOBerror = np.zeros(p)

    # Nothing to model: no missing values, no predictors, or no iterations.
    if not nzsort_j or p < 2 or maxiter < 1:
        return _decode(ximp), OOBerror

    # Convergence criteria (index 0: numeric, index 1: factor)
    convNew = np.zeros(2)
    convOld = np.inf * np.ones(2)
    ximp_old = ximp
    OOBerrOld = OOBerror
    iteration = 0

    while (convNew[0] < convOld[0] or convNew[1] < convOld[1]) and iteration < maxiter:
        if iteration != 0:
            # Copy: convNew is rebuilt below and must not alias convOld.
            convOld = convNew.copy()

        if verbose:
            print(f"missForest iteration {iteration + 1} in progress...")

        ximp_old = ximp.copy()
        OOBerrOld = OOBerror.copy()

        if parallelize == 'variables':
            results = Parallel(n_jobs=-1)(
                delayed(impute_column)(ximp, na_loc, varType, col, mtry, ntree,
                                       replace, nodesize, maxnodes)
                for col in nzsort_j
            )
            for res in results:
                ximp.loc[na_loc[res['varInd']], res['varInd']] = res['misY']
                OOBerror[col_pos[res['varInd']]] = res['oerr']
        else:
            # Sequential: each variable already sees the freshly imputed
            # values of the variables handled before it.
            for col in nzsort_j:
                res = impute_column(ximp, na_loc, varType, col, mtry, ntree,
                                    replace, nodesize, maxnodes)
                ximp.loc[na_loc[col], col] = res['misY']
                OOBerror[col_pos[col]] = res['oerr']

        iteration += 1

        # Convergence check, reduced to one scalar per variable type
        new_values = ximp.to_numpy(dtype=float)
        old_values = ximp_old.to_numpy(dtype=float)
        convNew = np.zeros(2)
        if numeric_indices.size:
            denom = np.sum(new_values[:, numeric_indices] ** 2)
            numer = np.sum((new_values[:, numeric_indices] - old_values[:, numeric_indices]) ** 2)
            convNew[0] = numer / denom if denom > 0 else 0.0
        if factor_indices.size:
            changed = np.sum(new_values[:, factor_indices] != old_values[:, factor_indices])
            convNew[1] = changed / (n * factor_indices.size)

        if verbose:
            print(f"Convergence: Numeric = {convNew[0]}, Factor = {convNew[1]}")

    # If the loop stopped because the difference grew again, the previous
    # iteration (and its error estimate) is the one to keep.
    if iteration == maxiter or iteration < 2:
        ximp_final, oob_final = ximp, OOBerror
    else:
        ximp_final, oob_final = ximp_old, OOBerrOld

    return _decode(ximp_final), oob_final

def impute_column(ximp, na_loc, varType, varInd, mtry, ntree, replace, nodesize, maxnodes):
    """Predict the missing entries of one column from all other columns.

    Args:
        ximp: Current (fully filled) DataFrame; every column except
            ``varInd`` is used as a predictor.
            NOTE(review): scikit-learn forests need numeric predictors —
            confirm the caller passes encoded data.
        na_loc: Boolean DataFrame marking the originally missing entries.
        varType: Mapping from column label to 'numeric' or 'factor'.
        varInd: Label of the column to impute.
        mtry: ``max_features`` of the forest.
        ntree: Number of trees.
        replace: ``bootstrap`` flag of the forest. When true the reported
            error is a genuine out-of-bag estimate; when false no out-of-bag
            samples exist and the in-sample error is reported instead.
        nodesize: Optional pair (numeric, factor) of minimum leaf sizes;
            defaults are 1 and 5.
        maxnodes: ``max_leaf_nodes`` of the forest.

    Returns:
        dict with 'varInd' (the column label), 'misY' (predictions for the
        missing rows) and 'oerr' (MSE for numeric targets, misclassification
        rate for factors).
    """
    misi = na_loc[varInd]
    obsi = ~misi
    predictor_cols = ximp.columns != varInd
    obsY = ximp.loc[obsi, varInd]
    obsX = ximp.loc[obsi, predictor_cols]
    misX = ximp.loc[misi, predictor_cols]

    use_oob = bool(replace)
    forest_args = dict(n_estimators=ntree, max_features=mtry, bootstrap=replace,
                       oob_score=use_oob, max_leaf_nodes=maxnodes)

    if varType[varInd] == 'numeric':
        model = RandomForestRegressor(min_samples_leaf=nodesize[0] if nodesize else 1, **forest_args)
        model.fit(obsX, obsY)
        misY = model.predict(misX)
        # Out-of-bag predictions come from trees that did not see the row.
        fitted = model.oob_prediction_ if use_oob else model.predict(obsX)
        oerr = mean_squared_error(obsY, fitted)
    else:
        le = LabelEncoder()
        obsY_enc = le.fit_transform(obsY)
        model = RandomForestClassifier(min_samples_leaf=nodesize[1] if nodesize else 5, **forest_args)
        model.fit(obsX, obsY_enc)
        misY = le.inverse_transform(model.predict(misX))
        if use_oob:
            oerr = 1 - model.oob_score_
        else:
            oerr = 1 - accuracy_score(obsY_enc, model.predict(obsX))

    return {'varInd': varInd, 'misY': misY, 'oerr': oerr}

# Usage example: two numeric columns (A, C) and one string column (B),
# each with at least one missing value.
data = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': ['a', 'b', 'c', np.nan, 'e'],
    'C': [np.nan, 1.5, 2.5, 3.5, np.nan]
})

# missForest returns a pair: the imputed frame and the per-column error array.
imputed_data, oob_error = missForest(data, verbose=True)
print(imputed_data)


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
from sklearn.metrics import mean_squared_error, accuracy_score

def missForest(xmis, maxiter=10, ntree=100, variablewise=False,
               decreasing=False, verbose=False, mtry=None,
               replace=True, classwt=None, cutoff=None, strata=None,
               sampsize=None, nodesize=None, maxnodes=None, xtrue=None,
               parallelize='no'):
    """Impute missing values with iterated random forests (label-encoded factors).

    Numeric columns start at their mean. Every other column ("factor") is
    label-encoded on the string form of its observed values, starts at its
    most frequent code, and is decoded back to those string labels before
    returning. Each variable with missing values is predicted by
    ``impute_column`` from all other variables until the difference between
    consecutive imputations stops decreasing, or ``maxiter`` iterations ran.

    Args:
        xmis: DataFrame with missing values.
        maxiter: Maximum number of iterations.
        ntree: Number of trees per forest.
        decreasing: If True, visit variables from most to least missing.
        verbose: Print progress information.
        mtry: Features tried per split; defaults to int(sqrt(p)) and is
            clipped to the number of available predictors when an integer.
        replace: Forwarded to ``impute_column`` (bootstrap sampling).
        nodesize: Optional pair (numeric, factor) of minimum leaf sizes.
        maxnodes: Optional maximum number of leaf nodes.
        parallelize: 'no', 'variables' (impute all variables of an iteration
            in parallel via joblib) or 'forests' (accepted; handled like
            'no' here).
        variablewise, classwt, cutoff, strata, sampsize, xtrue: Accepted and,
            where the original did so, length-checked; otherwise unused.

    Returns:
        Tuple ``(ximp, OOBerror)``: the imputed DataFrame (columns that are
        entirely missing are removed; factor values are strings) and a numpy
        array with one error value per remaining column.

    Raises:
        ValueError: For an unknown ``parallelize`` value, or when
            parallelisation is requested on mixed dtypes.
    """
    n, p = xmis.shape

    if classwt is not None:
        assert len(classwt) == p and isinstance(classwt, list), "classwt must be a list of length p"
    if cutoff is not None:
        assert len(cutoff) == p and isinstance(cutoff, list), "cutoff must be a list of length p"
    if strata is not None:
        assert len(strata) == p and isinstance(strata, list), "strata must be a list of length p"
    if nodesize is not None:
        assert len(nodesize) == 2, "nodesize must be of length 2"

    # Remove columns with all missing values
    na_counts = xmis.isnull().sum()
    all_na_cols = na_counts[na_counts == n].index
    if len(all_na_cols) > 0:
        xmis = xmis.drop(columns=all_na_cols)
        p = xmis.shape[1]
        if verbose:
            print(f"Removed columns {all_na_cols.tolist()} due to all values being missing.")

    parallelize = parallelize.lower()
    if parallelize not in ['no', 'variables', 'forests']:
        raise ValueError("parallelize must be one of 'no', 'variables', or 'forests'")

    if parallelize != 'no' and len(pd.unique(xmis.dtypes)) > 1:
        raise ValueError("Parallelization is not supported with mixed data types")

    # Computed after dropping columns: only p - 1 predictors exist per target.
    if mtry is None:
        mtry = int(np.sqrt(p))
    if isinstance(mtry, (int, np.integer)):
        mtry = int(min(max(mtry, 1), max(p - 1, 1)))

    # Initial imputation
    ximp = xmis.copy()
    na_loc = xmis.isnull()
    kinds = [
        'numeric'
        if pd.api.types.is_numeric_dtype(xmis[col]) and not pd.api.types.is_bool_dtype(xmis[col])
        else 'factor'
        for col in xmis.columns
    ]
    varType = pd.Series(kinds, index=xmis.columns, dtype=object)

    label_encoders = {}
    for col in ximp.columns:
        if varType[col] == 'numeric':
            ximp[col] = ximp[col].fillna(ximp[col].mean())
        else:
            # Fit the encoder on observed values only, so that a missing
            # value never becomes a class of its own (the string 'nan').
            observed = ~na_loc[col]
            le = LabelEncoder()
            codes = pd.Series(0, index=ximp.index, dtype='int64')
            codes[observed] = le.fit_transform(xmis.loc[observed, col].astype(str))
            codes[~observed] = codes[observed].mode()[0]
            ximp[col] = codes
            label_encoders[col] = le

    def _decode(frame):
        """Map the integer codes of the factor columns back to their labels."""
        out = frame.copy()
        for col, encoder in label_encoders.items():
            out[col] = encoder.inverse_transform(out[col].astype(int))
        return out

    # Order variables by number of missing values; complete ones are skipped
    noNAvar = na_loc.sum()
    sort_j = noNAvar.sort_values(ascending=not decreasing).index
    nzsort_j = [col for col in sort_j if noNAvar[col] > 0]

    col_pos = {col: pos for pos, col in enumerate(ximp.columns)}
    numeric_indices = np.flatnonzero(varType.to_numpy() == 'numeric')
    factor_indices = np.flatnonzero(varType.to_numpy() == 'factor')

    OOBerror = np.zeros(p)

    # Nothing to model: no missing values, no predictors, or no iterations.
    if not nzsort_j or p < 2 or maxiter < 1:
        return _decode(ximp), OOBerror

    # Convergence criteria (index 0: numeric, index 1: factor)
    convNew = np.zeros(2)
    convOld = np.inf * np.ones(2)
    ximp_old = ximp
    OOBerrOld = OOBerror
    iteration = 0

    while (convNew[0] < convOld[0] or convNew[1] < convOld[1]) and iteration < maxiter:
        if iteration != 0:
            # Copy: convNew is rebuilt below and must not alias convOld.
            convOld = convNew.copy()

        if verbose:
            print(f"missForest iteration {iteration + 1} in progress...")

        ximp_old = ximp.copy()
        OOBerrOld = OOBerror.copy()

        if parallelize == 'variables':
            results = Parallel(n_jobs=-1)(
                delayed(impute_column)(ximp, na_loc, varType, col, mtry, ntree,
                                       replace, nodesize, maxnodes, label_encoders)
                for col in nzsort_j
            )
            for res in results:
                ximp.loc[na_loc[res['varInd']], res['varInd']] = res['misY']
                OOBerror[col_pos[res['varInd']]] = res['oerr']
        else:
            # Sequential: each variable already sees the freshly imputed
            # values of the variables handled before it.
            for col in nzsort_j:
                res = impute_column(ximp, na_loc, varType, col, mtry, ntree,
                                    replace, nodesize, maxnodes, label_encoders)
                ximp.loc[na_loc[col], col] = res['misY']
                OOBerror[col_pos[col]] = res['oerr']

        iteration += 1

        # Convergence check, reduced to one scalar per variable type
        new_values = ximp.to_numpy(dtype=float)
        old_values = ximp_old.to_numpy(dtype=float)
        convNew = np.zeros(2)
        if numeric_indices.size:
            denom = np.sum(new_values[:, numeric_indices] ** 2)
            numer = np.sum((new_values[:, numeric_indices] - old_values[:, numeric_indices]) ** 2)
            convNew[0] = numer / denom if denom > 0 else 0.0
        if factor_indices.size:
            changed = np.sum(new_values[:, factor_indices] != old_values[:, factor_indices])
            convNew[1] = changed / (n * factor_indices.size)

        if verbose:
            print(f"Convergence: Numeric = {convNew[0]}, Factor = {convNew[1]}")

    # If the loop stopped because the difference grew again, the previous
    # iteration (and its error estimate) is the one to keep.
    if iteration == maxiter or iteration < 2:
        ximp_final, oob_final = ximp, OOBerror
    else:
        ximp_final, oob_final = ximp_old, OOBerrOld

    # Decode factor variables
    return _decode(ximp_final), oob_final

def impute_column(ximp, na_loc, varType, varInd, mtry, ntree, replace, nodesize, maxnodes, label_encoders):
    """Predict the missing entries of one column from all other columns.

    Args:
        ximp: Current (fully filled) DataFrame; every column except
            ``varInd`` is used as a predictor, and the target column is
            passed to the forest as is.
            NOTE(review): this presumes factor columns arrive already
            encoded as numbers — confirm against the caller.
        na_loc: Boolean DataFrame marking the originally missing entries.
        varType: Mapping from column label to 'numeric' or 'factor'.
        varInd: Label of the column to impute.
        mtry: ``max_features`` of the forest.
        ntree: Number of trees.
        replace: ``bootstrap`` flag of the forest. When true the reported
            error is a genuine out-of-bag estimate; when false no out-of-bag
            samples exist and the in-sample error is reported instead.
        nodesize: Optional pair (numeric, factor) of minimum leaf sizes;
            defaults are 1 and 5.
        maxnodes: ``max_leaf_nodes`` of the forest.
        label_encoders: Accepted for the caller's convenience; not used here.

    Returns:
        dict with 'varInd' (the column label), 'misY' (predictions for the
        missing rows) and 'oerr' (MSE for numeric targets, misclassification
        rate for factors).
    """
    misi = na_loc[varInd]
    obsi = ~misi
    predictor_cols = ximp.columns != varInd
    obsY = ximp.loc[obsi, varInd]
    obsX = ximp.loc[obsi, predictor_cols]
    misX = ximp.loc[misi, predictor_cols]

    use_oob = bool(replace)
    forest_args = dict(n_estimators=ntree, max_features=mtry, bootstrap=replace,
                       oob_score=use_oob, max_leaf_nodes=maxnodes)

    if varType[varInd] == 'numeric':
        model = RandomForestRegressor(min_samples_leaf=nodesize[0] if nodesize else 1, **forest_args)
        model.fit(obsX, obsY)
        misY = model.predict(misX)
        # Out-of-bag predictions come from trees that did not see the row.
        fitted = model.oob_prediction_ if use_oob else model.predict(obsX)
        oerr = mean_squared_error(obsY, fitted)
    else:
        model = RandomForestClassifier(min_samples_leaf=nodesize[1] if nodesize else 5, **forest_args)
        model.fit(obsX, obsY)
        misY = model.predict(misX)
        if use_oob:
            oerr = 1 - model.oob_score_
        else:
            oerr = 1 - accuracy_score(obsY, model.predict(obsX))

    return {'varInd': varInd, 'misY': misY, 'oerr': oerr}

# Usage example: two numeric columns (A, C) and one string column (B),
# each with at least one missing value.
data = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': ['a', 'b', 'c', np.nan, 'e'],
    'C': [np.nan, 1.5, 2.5, 3.5, np.nan]
})

# missForest returns a pair: the imputed frame and the per-column error array.
imputed_data, oob_error = missForest(data, verbose=True)
print(imputed_data)


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from joblib import Parallel, delayed

def miss_forest(xmis, maxiter=10, ntree=100, variablewise=False,
                decreasing=False, verbose=False, mtry=None,
                replace=True, classwt=None, cutoff=None, strata=None,
                sampsize=None, nodesize=None, maxnodes=None, xtrue=None,
                parallelize='no'):
    """Impute missing values with a fixed number of random-forest passes.

    Numeric columns start at their mean and non-numeric columns at their most
    frequent value (held as integer codes while the forests run). In each of
    the ``maxiter`` passes every column that has missing values is re-predicted
    from all other usable columns. No convergence test is applied.

    Args:
        xmis: DataFrame with missing values.
        maxiter: Number of passes over the variables.
        ntree: Number of trees per forest.
        verbose: Print progress information.
        mtry: Features tried per split; defaults to floor(sqrt(p)) and is
            clipped to the number of available predictors when an integer.
        replace: ``bootstrap`` flag of the forests.
        nodesize: Minimum samples per leaf (scikit-learn ``min_samples_leaf``);
            None means 1.
        maxnodes: Maximum number of leaf nodes; None means unlimited.
        variablewise, decreasing, classwt, cutoff, strata, sampsize, xtrue,
        parallelize: Accepted for signature compatibility; not used.

    Returns:
        A copy of ``xmis`` with the missing entries filled in. Columns that
        are entirely missing are left untouched and never used as predictors.
    """
    n, p = xmis.shape
    columns = list(xmis.columns)
    varType = ['numeric' if pd.api.types.is_numeric_dtype(xmis.iloc[:, i]) else 'categorical'
               for i in range(p)]

    # The mask is taken from the ORIGINAL data: once values are filled in,
    # the working copy no longer shows which entries have to be predicted.
    missing = xmis.isna().to_numpy()
    usable = [j for j in range(p) if not missing[:, j].all()]
    targets = [j for j in usable if missing[:, j].any()]

    # Working matrix: floats for numeric columns, integer codes for the rest.
    values = np.full((n, p), np.nan)
    cat_dtypes = {}
    for j in usable:
        series = xmis.iloc[:, j]
        if varType[j] == 'numeric':
            values[:, j] = series.astype(float).fillna(series.mean()).to_numpy()
        else:
            as_category = series.astype('category')
            cat_dtypes[j] = as_category.dtype
            codes = as_category.cat.codes.to_numpy().astype(float)
            # Code -1 marks a missing value; start from the most common code.
            codes[codes < 0] = np.bincount(codes[codes >= 0].astype(int)).argmax()
            values[:, j] = codes

    if mtry is None:
        mtry = int(np.floor(np.sqrt(p)))
    if isinstance(mtry, (int, np.integer)):
        mtry = int(min(max(mtry, 1), max(len(usable) - 1, 1)))

    # scikit-learn rejects None for min_samples_leaf; 1 is its default.
    leaf_size = 1 if nodesize is None else nodesize

    if verbose:
        print("Starting missForest imputation...")

    # With fewer than two usable columns there is nothing to predict from;
    # the initial mean / mode fill is then the final answer.
    can_model = len(usable) >= 2

    for iteration in range(maxiter):
        if verbose:
            print(f"Iteration {iteration + 1}")

        if not can_model:
            continue

        for column in targets:
            predictors = values[:, [k for k in usable if k != column]]
            observed = ~missing[:, column]

            # Prepare model based on type of variable
            if varType[column] == 'numeric':
                model = RandomForestRegressor(n_estimators=ntree, max_features=mtry, bootstrap=replace,
                                              max_leaf_nodes=maxnodes, min_samples_leaf=leaf_size)
                target = values[observed, column]
            else:
                model = RandomForestClassifier(n_estimators=ntree, max_features=mtry, bootstrap=replace,
                                               max_leaf_nodes=maxnodes, min_samples_leaf=leaf_size)
                target = values[observed, column].astype(int)

            model.fit(predictors[observed], target)
            values[missing[:, column], column] = model.predict(predictors[missing[:, column]])

        # No convergence check: the loop always runs maxiter passes.

    # Write the imputed columns back, restoring the original labels.
    ximp = xmis.copy()
    for j in targets:
        col = columns[j]
        if varType[j] == 'numeric':
            ximp[col] = values[:, j]
        else:
            labels = pd.Categorical.from_codes(values[:, j].astype(int), dtype=cat_dtypes[j])
            if isinstance(xmis.iloc[:, j].dtype, pd.CategoricalDtype):
                ximp[col] = labels
            else:
                ximp[col] = np.asarray(labels, dtype=object)

    if verbose:
        print("Imputation completed.")

    return ximp

# Example usage with data loaded from a file (kept commented out):
# df = pd.read_csv('your_data.csv')  # Load your data into a DataFrame
# imputed_data = miss_forest(df)

# Example usage with an inline DataFrame: two numeric columns (A, C) and one
# string column (B), each with at least one missing value.
data = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': ['a', 'b', 'c', np.nan, 'e'],
    'C': [np.nan, 1.5, 2.5, 3.5, np.nan]
})

imputed_data = miss_forest(data)
