# Importing the required dependencies using dependency importer

In [1]:
import logging
import importlib
from typing import List, Dict, Union, Callable
from sklearn.base import BaseEstimator, TransformerMixin

# Logger configuration
logger = logging.getLogger()
logger.setLevel(logging.INFO)

class ImportRequiredDependencies:
    def import_through_selection (
        self, standard_module: bool = False, 
        sklearn_module: bool = False, 
        package_module: str = None, 
        modules_to_import: List[str] = None
    ):
        """
        Dynamically imports modules. Inserts dynamically imported modules inside globals()

        Parameters:
            standard_module (bool): Set to True if importing modules not inside packages
            sklearn_module (bool): Set to True if importing modules from Scikit-Learn
            module (str): Scikit-Learn module to get attribute from
            modules_to_import (Dict[str, str]): Modules containing the key-value modules to import

        Returns:
            None
        """
        try:
            if standard_module:
                for module in modules_to_import:
                    logging.info("[*] Importing module: {}".format(module))
                    globals()[module] = importlib.import_module(module)
            if sklearn_module:
                for module in modules_to_import:
                    logging.info("[*] Importing {}".format(module))
                    globals()[module] = getattr(importlib.import_module(package_module), module)
        except ModuleNotFoundError as non_existent_module:
            logging.error("[!] Error: {}".format(non_existent_module))

In [24]:
importer_one_params = {
    "standard_module" : True,
    "modules_to_import": ["numpy", "pandas"]
}

importer = ImportRequiredDependencies()
importer.import_through_selection(**importer_one_params)

INFO:root:[*] Importing module: numpy
INFO:root:[*] Importing module: pandas


In [27]:

# WARNING: DO NOT IMPORT THE MODULES ABOVE THE CLASS. USE DEPENDENCY IMPORTER

from typing import List, Dict, Union
import os
import re
import logging
import numpy
import pandas
import ucimlrepo

logger = logging.getLogger()
logger.setLevel(logging.INFO)

class LoadDataset:
    """
    Dataset loader that will load and create three datasets

    This class will either take in a UCI Machine Learning repository ID or filesystem path
    to load and create three datasets: main dataframe, copy of main dataset, a numpy representation
    of the original dataset

    Parameters:
        uci_id (int): ID of ucimlrepo dataset that will be used to get the dataset
        load_method (str): Option that will get a function reference that will load the dataset
        filesystem_path (str): Filesystem path that points to the dataset file
    """
    def __init__ (self, uci_id: int = None, load_method: str = None, filesystem_path: str = None, **kwargs):
        self.uci_id = uci_id
        self.loader_method = load_method
        self.fs_path = filesystem_path
        self.extra_params = kwargs
        self.datasets = {}
        self.loader_methods = {
            "csv": pandas.read_csv,
            "xlsx": pandas.read_excel,
            "json": pandas.read_json,
            "pickle": pandas.read_pickle,
            "uci": ucimlrepo.fetch_ucirepo
        }

    def _get_loading_method (self):
        """
        Get loader reference if loader_method property exists in loader_methods property keys

        Parameters:
            None

        Returns:
            Pandas dataframe reader or UCI Machine Learning Repository reader
        """
        if self.loader_method in self.loader_methods.keys():
            return self.loader_methods.get(self.loader_method)

    def _load_pandas (self):
        """
        Will get loader reference and if filesystem paths exists, it will assign/create
        three datasets: main dataframe, copy of main dataframe and a numpy representation of main dataframe

        Parameters:
            None

        Returns:
            datasets (dict): Three datasets, two pandas dataframe and one numpy representation
        """
        loader = self._get_loading_method()

        if os.path.exists(self.fs_path):
            self.datasets = {
                "main_df": loader(self.fs_path, **self.extra_params),
                "copy_df": loader(self.fs_path, **self.extra_params).copy(),
                "numpy_df": loader(self.fs_path, **self.extra_params).to_numpy()
            }
            return self.datasets

    def _load_uci (self):
        """
        Will load a temporary raw dataset from UCI Machine Learning Repository, then it will
        assign/create three datasets

        Parameters:
            None

        Returns:
            datasets (dict): Three datasets, two pandas dataframe and one numpy representation
        """
        loader = self._get_loading_method()
        temporary_dataset = loader(id=self.uci_id)

        self.datasets = {
            "main_df": pandas.DataFrame(temporary_dataset.data.original, **self.extra_params),
            "copy_df": pandas.DataFrame(temporary_dataset.data.original, **self.extra_params).copy(),
            "numpy_df": pandas.DataFrame(temporary_dataset.data.original, **self.extra_params).to_numpy()
        }
        return self.datasets

    def load (self, use_uci: bool = False, use_pandas: bool = False):
        """
        Method that allows the user to choose wether to load datasets via Pandas or ucimlrepo

        Parameters:
            use_uci (bool): Set to True if datasets need to be loaded using ucimlrepo
            use_pandas (bool): Set to True if datasets need to be loaded using Pandas

        Returns:
            datasets (dict): Three datasets, two pandas dataframe and one numpy representation
        """
        if use_uci:
            logging.info("[*] Creating three datasets using pandas")
            dataset_dictionary = self._load_uci()
        if use_pandas:
            logging.info("[*] Creating three datasets using ucimlrepo")
            dataset_dictionary = self._load_pandas()

        return dataset_dictionary

    def reset_datasets (self, dataset_dict: Dict[str, Union[numpy.ndarray, pandas.DataFrame]] = None):
        """
        Method will reset the datasets dictionary should the datasets dictionary gets messed up

        Parameters:
            dataset_dict (dict): Dictionary that contains the three original datasets

        Returns:
            dataset_dict (dict): Resetted dictionary
        """
        dataset_dict = self.datasets
        return dataset_dict


In [28]:
loader_params = {
    "uci_id": 53,
    "load_method": "uci"
}

loader = LoadDataset(**loader_params)
datasets = loader.load(use_uci=True)

INFO:root:[*] Creating three datasets using pandas


In [29]:
datasets.get("main_df")

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [40]:

# WARNING: DO NOT COPY PASTE THE IMPORTS ABOVE CLASS. USE DEPENDENCY IMPORTER
from typing import *
import re
import logging
import numpy
import pandas
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler

logger = logging.getLogger()
logger.setLevel(logging.INFO)

class ScaleColumns (BaseEstimator, TransformerMixin):
    def __init__ (
        self, 
        columns_to_preprocess: Union[str, List[str]] = None, 
        scaler_parameters: Dict[str, Union[str, float, numpy.ndarray, pandas.DataFrame]] = None, 
        scaling_preprocessing_type: str = None, 
        numpy_output: bool = False, 
        pandas_output: bool = False,
    ):
        self.columns = columns_to_preprocess
        self.scaler_type = scaling_preprocessing_type
        self.scaler_parameters = scaler_parameters
        self.numpy_output = numpy_output
        self.pandas_output = pandas_output

    def _is_correct_datatype (self, dataset: Union[numpy.ndarray, pandas.DataFrame] = None):
        numpy_datatypes = (numpy.int8, numpy.int16, numpy.int32, numpy.int64, numpy.float16, numpy.float32, numpy.float64)

        try:
            if isinstance(dataset, numpy.ndarray) and re.findall("(^int\.[0-9][0-9]|^float\.[0-9][0-9][0-9])", dataset.dtypes):
                logging.info("[*] Numpy dataset and samples type is correct")
                return True
            else:
                raise ValueError("Numpy dataset or Numpy samples datatypes is incorrect")

            if isinstance(dataset, pandas.DataFrame) and dataset[[self.columns]].dtypes.isin(numpy_datatypes).all():
                logging.info("[*] Pandas dataset and samples type is correct")
                return True
            else:
                raise ValueError("Pandas dataset or Pandas samples datatypes is incorrect")
        except ValueError as incorrect_datatypes_error:
            logging.error(incorrect_datatypes_error)

    def _transform_dataset (
        self, 
        retain_numpy: bool = None, 
        retain_pandas: bool = None, 
        scaler: TransformerMixin = None, 
        dataset: Union[numpy.ndarray, pandas.DataFrame] = None
    ):
        log_message = "[*] Scaler: {}\n[*] Dataset: {}\n[*] Columns: {}"

        if retain_numpy:
            logging.info(log_message.format(scaler.__class__.__name__, dataset, self.columns))
            return scaler.fit_transform(dataset)

        if retain_pandas:
            if scaler.__class__.__name__ == "Normalizer":
                logging.info(log_message.format("Normalizer", dataset, self.columns))
                dataset = pandas.DataFrame(
                    scaler.fit_transform(dataset.values), dataset.index, dataset.columns
                )
            else:
                logging.info(log_message.format(scaler.__class__.__name__, dataset, self.columns))
                dataset[[self.columns]] = pandas.DataFrame(
                    scaler.fit_transform(dataset[[self.columns]]), dataset[[self.columns]].columns
                )
            return dataset

    def fit_transform (self, X, y=None):
        scaler_instances = {
            "standard" : StandardScaler(**(self.scaler_parameters or {})),
            "minmax" : MinMaxScaler(**(self.scaler_parameters or {})),
            "maxabs" : MaxAbsScaler(**(self.scaler_parameters or {})),
            "normalizer" : Normalizer(**(self.scaler_parameters or {}))
        }

        if self.scaler_type in scaler_instances and self._is_correct_datatype(X):
            logging.info("[*] Passing dataset and other parameters to scaler function...")
            transformed_dataset = self._transform_dataset(
                self.numpy_output, self.pandas_output, scaledatasets.get("main_df")[["sepal length", "sepal width"]].dtypes
            )
            return transformed_dataset
        else:
            logging.error("Either scaling_preprocessing_type argument doesn't match or dataset type is incorrect")

scaler_params = {
    "columns_to_preprocess": ["sepal length", "sepal width"],
    "scaling_preprocessing_type": "standard",
    "pandas_output": True
}

scaler = ScaleColumns(**scaler_params)
datasets.get("main_df")[["sepal length"]] = scaler.fit_transform(datasets.get("main_df"))


  if isinstance(dataset, numpy.ndarray) and re.findall("(^int\.[0-9][0-9]|^float\.[0-9][0-9][0-9])", dataset.dtypes):
ERROR:root:Numpy dataset or Numpy samples datatypes is incorrect
ERROR:root:Either scaling_preprocessing_type argument doesn't match or dataset type is incorrect
