In [None]:
# Upper bound on the number of passes any loop may make (guards against infinite loops)
MAX_ITER = 100000

def check_max_iter(i):
    """
    Abort a loop whose counter has gone past MAX_ITER.

    Parameters
    ----------
    i : int
        Current iteration counter of the calling loop.

    Raises
    ------
    Exception
        If i is greater than MAX_ITER.
    """
    # Guard clause: counters up to and including MAX_ITER are accepted
    if i <= MAX_ITER:
        return
    raise Exception("Max iterations reached")

#### Functions to be used later

##### Data preprocessing

In [None]:
from sklearn import preprocessing

Label Encoding

Labels are encoded by sklearn's LabelEncoder. The LabelEncoders are stored in the `label_encoders` dictionary.

In [None]:
def encode_labels(data, encoders_dict = None, key = None):
    """
    Encodes labels of categorical data to numeric values.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, any
        Data to encode.
        If pandas.DataFrame, every column of dtype object is encoded with the
        encoder stored under the column name; other columns are left as they are.
        If pandas.Series of dtype object, it is encoded with the encoder stored
        under key, or under the Series name when key is None.
        If any other type, it is encoded using the encoder with key. A single
        str or int is wrapped in a list first.
    encoders_dict : dict, optional
        Dictionary of encoders to use. If not provided, a new dictionary is created.
        Encoders that are missing are created, fitted on data and stored in
        this dictionary (the dictionary passed in is updated in place).
    key : any, optional
        Key of the encoder to use. Ignored for pandas.DataFrame.
        If None and data is not a pandas object, data is returned unencoded.

    Returns
    -------
    pandas.DataFrame, pandas.Series or encoder output
        Encoded data. pandas input is copied, the caller's object is not modified.
        For other input this is whatever the encoder's transform returns.
    dict
        Dictionary of encoders used.
    """
    if encoders_dict is None:
        encoders_dict = {}

    if isinstance(data, pd.DataFrame):
        data = data.copy()
        for i, column in enumerate(data.columns, start=1):
            check_max_iter(i)
            if data[column].dtype != object:
                continue
            if column not in encoders_dict:
                encoders_dict[column] = preprocessing.LabelEncoder()
                encoders_dict[column].fit(data[column])
            data[column] = encoders_dict[column].transform(data[column])
        return data, encoders_dict

    if isinstance(data, pd.Series):
        # A Series has no .columns, so it is handled as a single column
        if data.dtype != object:
            return data.copy(), encoders_dict
        column = data.name if key is None else key
        if column not in encoders_dict:
            encoders_dict[column] = preprocessing.LabelEncoder()
            encoders_dict[column].fit(data)
        encoded = encoders_dict[column].transform(data)
        # Rebuild the Series so index and name survive the round trip
        return pd.Series(encoded, index=data.index, name=data.name), encoders_dict

    if isinstance(data, (str, int)):
        data = [data]
    if key is None:
        # Without a key there is no encoder to pick, hand the data back as is
        return data, encoders_dict
    if key not in encoders_dict:
        encoders_dict[key] = preprocessing.LabelEncoder()
        encoders_dict[key].fit(data)
    data = encoders_dict[key].transform(data)
    return data, encoders_dict

def decode_labels(data, encoders_dict, key = None):
    """
    Decodes labels of categorical data from numeric values.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, any
        Data to decode.
        If pandas.DataFrame, every column that has an encoder stored under its
        name is decoded; other columns are left as they are.
        If pandas.Series, it is decoded with the encoder stored under key, or
        under the Series name when key is None.
        If any other type, it is decoded using the encoder with key. A single
        str or int is wrapped in a list first.
    encoders_dict : dict
        Dictionary of encoders to use.
    key : any, optional
        Key of the encoder to use. Ignored for pandas.DataFrame.
        If None and data is not a pandas object, data is returned undecoded.

    Returns
    -------
    pandas.DataFrame, pandas.Series or encoder output
        Decoded data. pandas input is copied, the caller's object is not modified.
        Data for which no encoder is found, or whose decoding raises ValueError
        (a message is printed in that case), is returned undecoded.
    """
    if isinstance(data, pd.DataFrame):
        data = data.copy()
        for i, column in enumerate(data.columns, start=1):
            check_max_iter(i)
            if column not in encoders_dict:
                continue
            try:
                data[column] = encoders_dict[column].inverse_transform(data[column])
            except ValueError:
                # Best effort: report the problem and keep the column undecoded
                print(f"Error decoding column {column} with encoder {encoders_dict[column]} and unique data {data[column].unique()}")
        return data

    if isinstance(data, pd.Series):
        # A Series has no .columns, so it is handled as a single column
        column = data.name if key is None else key
        if column not in encoders_dict:
            return data.copy()
        try:
            decoded = encoders_dict[column].inverse_transform(data)
        except ValueError:
            # Best effort: report the problem and keep the values undecoded
            print(f"Error decoding column {column} with encoder {encoders_dict[column]} and unique data {data.unique()}")
            return data.copy()
        # Rebuild the Series so index and name survive the round trip
        return pd.Series(decoded, index=data.index, name=data.name)

    if isinstance(data, (str, int)):
        data = [data]
    if key is None:
        # Without a key there is no encoder to pick, hand the data back as is
        return data
    if key in encoders_dict:
        data = encoders_dict[key].inverse_transform(data)
    return data

Scaling

Numeric features are scaled to a fixed range with sklearn's MinMaxScaler (this is the scaler `scale_numeric` creates). The fitted scalers are stored in the `standard_scalers` dictionary.

In [None]:
def scale_numeric(data, scaler_dict = None, key = None):
    """
    Scales numeric values of data.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, any
        Data to scale.
        If pandas.DataFrame, every float or int column is scaled with the
        scaler stored under the column name.
        If pandas.Series of float or int dtype, it is scaled with the scaler
        stored under the Series name.
        If any other type, it is scaled using the scaler with key. A single
        float or int is passed to the scaler as one sample with one feature.
    scaler_dict : dict, optional
        Dictionary of scalers to use. If not provided, a new dictionary is created.
        Scalers that are missing are created (MinMaxScaler), fitted on data and
        stored in this dictionary (the dictionary passed in is updated in place).
    key : any, optional
        If data is pandas.DataFrame or pandas.Series, key is list of columns to keep
        unscaled. Otherwise it is the key of the scaler to use; if None, data is
        returned unscaled.

    Returns
    -------
    pandas.DataFrame, pandas.Series or scaler output
        Scaled data. pandas input is copied, the caller's object is not modified.
        For other input this is whatever the scaler's transform returns.
    dict
        Dictionary of scalers used.
    """
    if scaler_dict is None:
        scaler_dict = {}

    if isinstance(data, pd.DataFrame):
        data = data.copy()
        for i, column in enumerate(data.columns, start=1):
            check_max_iter(i)
            if key is not None and column in key:
                continue
            if data[column].dtype == float or data[column].dtype == int:
                # Each scaler handles one feature, so the column goes in as (n_samples, 1)
                values = data[column].values.reshape(-1, 1)
                if column not in scaler_dict:
                    scaler_dict[column] = preprocessing.MinMaxScaler()
                    scaler_dict[column].fit(values)
                data[column] = scaler_dict[column].transform(values).reshape(-1)
        return data, scaler_dict

    if isinstance(data, pd.Series):
        # A Series has no .columns, so it is handled as a single column
        column = data.name
        if key is not None and column in key:
            return data.copy(), scaler_dict
        if not (data.dtype == float or data.dtype == int):
            return data.copy(), scaler_dict
        values = data.values.reshape(-1, 1)
        if column not in scaler_dict:
            scaler_dict[column] = preprocessing.MinMaxScaler()
            scaler_dict[column].fit(values)
        scaled = scaler_dict[column].transform(values).reshape(-1)
        # Rebuild the Series so index and name survive the round trip
        return pd.Series(scaled, index=data.index, name=data.name), scaler_dict

    is_scalar = isinstance(data, (float, int))
    if is_scalar:
        data = [data]
    if key is None:
        # Without a key there is no scaler to pick, hand the data back as is
        return data, scaler_dict
    # A lone number becomes one sample with one feature ([[x]]), the same
    # 2-D layout the pandas branches feed to the scalers
    samples = [data] if is_scalar else data
    if key not in scaler_dict:
        scaler_dict[key] = preprocessing.MinMaxScaler()
        scaler_dict[key].fit(samples)
    data = scaler_dict[key].transform(samples)
    return data, scaler_dict

def unscale_numeric(data, scaler_dict, key = None):
    """
    Unscales numeric values of data.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, any
        Data to unscale.
        If pandas.DataFrame, every column that has a scaler stored under its
        name is unscaled; other columns are left as they are.
        If pandas.Series, it is unscaled with the scaler stored under the Series name.
        If any other type, it is unscaled using the scaler with key. A single
        float or int is passed to the scaler as one sample with one feature.
    scaler_dict : dict
        Dictionary of scalers to use.
    key : any, optional
        If data is pandas.DataFrame or pandas.Series, key is list of columns to keep
        as they are. Otherwise it is the key of the scaler to use; if None, data is
        returned as is.

    Returns
    -------
    pandas.DataFrame, pandas.Series or scaler output
        Unscaled data. pandas input is copied, the caller's object is not modified.
        Data for which no scaler is found is returned as is.
    """
    if isinstance(data, pd.DataFrame):
        data = data.copy()
        for i, column in enumerate(data.columns, start=1):
            check_max_iter(i)
            if key is not None and column in key:
                continue
            if column in scaler_dict:
                # One feature per scaler: pass the column as (n_samples, 1),
                # the same layout scale_numeric uses when fitting
                values = np.array(data[column]).reshape(-1, 1)
                data[column] = scaler_dict[column].inverse_transform(values).reshape(-1)
        return data

    if isinstance(data, pd.Series):
        # A Series has no .columns, so it is handled as a single column
        column = data.name
        if key is not None and column in key:
            return data.copy()
        if column not in scaler_dict:
            return data.copy()
        values = np.array(data).reshape(-1, 1)
        restored = scaler_dict[column].inverse_transform(values).reshape(-1)
        # Rebuild the Series so index and name survive the round trip
        return pd.Series(restored, index=data.index, name=data.name)

    is_scalar = isinstance(data, (float, int))
    if is_scalar:
        data = [data]
    if key is None:
        # Without a key there is no scaler to pick, hand the data back as is
        return data
    if key in scaler_dict:
        # A lone number becomes one sample with one feature ([[x]])
        samples = [data] if is_scalar else data
        data = scaler_dict[key].inverse_transform(samples)
    return data