In [17]:
#!pip install polaris-ml
#!pip install fets
#!pip install mlflow

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
import xgboost as xgb

print("Ok")


Ok


In [2]:
"""
Cross Correlation module
"""

import logging
import warnings

import enlighten
import numpy as np
import pandas as pd
from polaris.feature.cleaner import Cleaner
# Used for tracking ML process results
from mlflow import log_metric, log_param, log_params, start_run
# Used for the pipeline interface of scikit learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

# eXtreme Gradient Boost algorithm
from xgboost import XGBRegressor

#RandomForest
from sklearn.ensemble import RandomForestRegressor

#Extratrees regressor
from sklearn.ensemble import ExtraTreesRegressor

#AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

#GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

#HistGradientBoostingRegressor
#from sklearn.ensemble import HistGradientBoostingRegressor

#VotingRegressor
from sklearn.ensemble import VotingRegressor



In [3]:


# Module-level logger shared by the cross-correlation code below.
LOGGER = logging.getLogger(__name__)

# Silence FutureWarning messages globally.
warnings.simplefilter('ignore', FutureWarning)

# Remove this line when feature engineering is in place
np.seterr(invalid='ignore', divide='ignore')


class XCorr(BaseEstimator, TransformerMixin):
    """ Cross Correlation predictor class

        Every (non-excluded) column of the input dataframe is used in turn
        as a prediction target, with all the other columns as features.
        The feature importances of each fitted model are stored as one row
        of ``importances_map``.
    """

    # Supported regressor names mapped to their classes. Classes (not
    # instances) are stored so that a fresh model is built for every target
    # column and only the selected regressor is ever instantiated.
    # VotingRegressor is intentionally absent: it needs an explicit list of
    # estimators and exposes no ``feature_importances_`` attribute.
    _REGRESSOR_CLASSES = {
        "XGB": XGBRegressor,
        "RandomForest": RandomForestRegressor,
        "AdaBoost": AdaBoostRegressor,
        "ExtraTrees": ExtraTreesRegressor,
        "GradientBoosting": GradientBoostingRegressor,
    }

    def __init__(self, dataset_metadata, cross_correlation_params, regressor):
        """ Initialize an XCorr object

            :param dataset_metadata: The metadata of the dataset
            :type dataset_metadata: PolarisMetadata
            :param cross_correlation_params: XCorr parameters
            :type cross_correlation_params: CrossCorrelationParameters
            :param regressor: Name of the regressor to use; one of the keys
                of ``_REGRESSOR_CLASSES`` ("XGB", "RandomForest", "AdaBoost",
                "ExtraTrees", "GradientBoosting")
            :type regressor: str
        """
        self._regressor = regressor
        self.models = None
        self._importances_map = None
        self._feature_cleaner = Cleaner(
            dataset_metadata, cross_correlation_params.dataset_cleaning_params)
        self.xcorr_params = {
            "random_state": cross_correlation_params.random_state,
            "test_size": cross_correlation_params.test_size,
            "gridsearch_scoring": cross_correlation_params.gridsearch_scoring,
            "gridsearch_n_splits":
            cross_correlation_params.gridsearch_n_splits,
        }
        # If we're importing from CSV, the dataset_metadata may not
        # have the feature_columns key.
        try:
            self.xcorr_params['feature_columns'] = dataset_metadata[
                'analysis']['feature_columns']
        except KeyError:
            LOGGER.info(
                "No feature_columns entry in metatdata, setting to empty array"
            )
            self.xcorr_params['feature_columns'] = []

        # Select the fitting strategy and its matching mlflow logger once,
        # so fit() does not have to branch on use_gridsearch.
        if cross_correlation_params.use_gridsearch:
            self.method = self.gridsearch
            self.mlf_logging = self.gridsearch_mlf_logging
        else:
            self.method = self.regression
            self.mlf_logging = self.regression_mlf_logging

        # "current" is what gets passed to the fitting method; "cpu" is the
        # fallback used by fit() when a GPU run fails.
        self.model_params = {
            "current": cross_correlation_params.model_params,
            "cpu": cross_correlation_params.model_cpu_params
        }

    @property
    def regressor(self):
        """
        Return the name of the regressor used by this predictor.
        """
        return self._regressor

    @regressor.setter
    def regressor(self, regressor):
        self._regressor = regressor

    @property
    def importances_map(self):
        """
        Return the importances_map value as Pandas Dataframe.

        """

        return self._importances_map

    @importances_map.setter
    def importances_map(self, importances_map):
        self._importances_map = importances_map

    def fit(self, X):
        """ Train on a dataframe

            The input dataframe will be split column by column
            considering each one as a prediction target.

            If fitting a column fails while the current model parameters
            request the GPU predictor, the CPU parameters become the current
            ones and the same column is fitted again.

            :param X: Input dataframe
            :type X: pd.DataFrame
            :raises TypeError: If X is not a Pandas DataFrame
            :raises Exception: If encountered any unhandled error
                during model fitting
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input data should be a DataFrame")

        if self.models is None:
            self.models = []

        manager = enlighten.get_manager()

        LOGGER.info("Clearing Data. Removing unnecessary columns")
        X = self._feature_cleaner.drop_constant_values(X)
        X = self._feature_cleaner.drop_non_numeric_values(X)
        X = self._feature_cleaner.handle_missing_values(X)

        self.reset_importance_map(X.columns)

        parameters = self.__build_parameters(X)

        pbar = manager.counter(total=len(parameters),
                               desc="Columns",
                               unit="columns")

        with start_run(run_name='cross_correlate', nested=True):
            self.mlf_logging()
            for column in parameters:
                LOGGER.info(column)
                features = X.drop([column], axis=1)
                target = X[column]
                try:
                    model = self.method(features, target,
                                        self.model_params['current'])
                except Exception:  # pylint: disable-msg=broad-except
                    if self.model_params['current'].get(
                            "predictor") != "gpu_predictor":
                        raise
                    LOGGER.info(" ".join([
                        "Encountered error using GPU.",
                        "Trying with CPU parameters now!"
                    ]))
                    self.model_params['current'] = self.model_params['cpu']
                    # Retry the same column with the CPU parameters so that
                    # it is not silently left out of the models and of the
                    # importances map.
                    model = self.method(features, target,
                                        self.model_params['current'])
                self.models.append(model)
                pbar.update()

    def transform(self):
        """ Unused method in this predictor """
        return self

    def _build_regressor(self, model_params):
        """ Instantiate the regressor selected by name.

            :param model_params: Keyword arguments for XGBRegressor. They are
                only used when the selected regressor is "XGB"
            :type model_params: dict
            :raises ValueError: If the regressor name is not supported
            :return: A new, unfitted regressor
        """
        try:
            regressor_cls = self._REGRESSOR_CLASSES[self._regressor]
        except KeyError:
            raise ValueError(
                "Unsupported regressor %r, expected one of: %s" %
                (self._regressor, ", ".join(self._REGRESSOR_CLASSES))
            ) from None

        if regressor_cls is XGBRegressor:
            return XGBRegressor(**model_params)

        # The scikit-learn ensembles do not understand the XGBoost
        # parameters; they only receive the configured random state so that
        # runs are reproducible.
        return regressor_cls(random_state=self.xcorr_params['random_state'])

    def regression(self, df_in, target_series, model_params):
        """ Fit a model to predict target_series with df_in features/columns
            and retain the features importances in the dependency matrix.

            :param df_in: Input dataframe representing the context, predictors
            :type df_in: pd.DataFrame
            :param target_series: pandas series of the target variable. Share
                the same indexes as the df_in dataframe
            :type target_series: pd.Series
            :param model_params: Parameters for the XGB model (ignored by the
                other regressors)
            :type model_params: dict
            :raises ValueError: If the configured regressor name is not
                supported
            :return: The fitted regressor
        """
        # Split df_in and target to train and test dataset, using the
        # configured test size (the same value that is logged to mlflow).
        df_in_train, df_in_test, target_train, target_test = train_test_split(
            df_in,
            target_series,
            test_size=self.xcorr_params['test_size'],
            random_state=self.xcorr_params['random_state'])

        regr_m = self._build_regressor(model_params)
        regr_m.fit(df_in_train, target_train)

        # Make predictions
        target_series_predict = regr_m.predict(df_in_test)

        try:
            rmse = np.sqrt(
                mean_squared_error(target_test, target_series_predict))
            log_metric(target_series.name, rmse)
            LOGGER.info('Making predictions for : %s', target_series.name)
            LOGGER.info('Root Mean Square Error : %s', str(rmse))
        except Exception:  # pylint: disable-msg=broad-except
            # Because of large (close to infinite values) or nans
            LOGGER.error('Cannot find RMS Error for %s', target_series.name)
            LOGGER.debug('Expected %s, Predicted %s', str(target_test),
                         str(target_series_predict))

        # After the model is trained: one importance value per feature column
        new_row = {}
        for column, feat_imp in zip(df_in.columns,
                                    regr_m.feature_importances_):
            new_row[column] = [feat_imp]

        # Current target is not in df_in, so manually adding it
        new_row[target_series.name] = [0.0]

        # Sorting new_row to avoid concatenation warnings
        new_row = dict(sorted(new_row.items()))

        # Concatenating new information about feature importances
        if self._importances_map is not None:
            self._importances_map = pd.concat([
                self._importances_map,
                pd.DataFrame(index=[target_series.name], data=new_row)
            ])
        return regr_m

    def gridsearch(self, df_in, target_series, params):
        """ Apply grid search to fine-tune XGBoost hyperparameters
            and then call the regression method with the best grid
            search parameters.

            The grid search itself always runs on an XGBRegressor; the best
            parameters are then handed to regression(), which only applies
            them when the selected regressor is "XGB".

            :param df_in: Input dataframe representing the context, predictors
            :type df_in: pd.DataFrame
            :param target_series: Pandas series of the target variable. Share
                the same indexes as the df_in dataframe
            :type target_series: pd.Series
            :param params: The hyperparameters to use on the gridsearch
                method
            :type params: dict
            :raises TypeError: If df_in is not Pandas DataFrame
            :return: The fitted regressor returned by regression()
        """
        if not isinstance(df_in, pd.DataFrame):
            LOGGER.error("Expected %s got %s for df_in in gridsearch",
                         pd.DataFrame, type(df_in))
            raise TypeError("df_in should be a DataFrame")

        random_state = self.xcorr_params['random_state']
        kfolds = KFold(n_splits=self.xcorr_params['gridsearch_n_splits'],
                       shuffle=True,
                       random_state=random_state)
        regr_m = XGBRegressor(random_state=random_state,
                              predictor="cpu_predictor",
                              tree_method="auto",
                              n_jobs=-1)

        gs_regr = GridSearchCV(regr_m,
                               param_grid=params,
                               cv=kfolds,
                               scoring=self.xcorr_params['gridsearch_scoring'],
                               n_jobs=-1,
                               verbose=1)
        gs_regr.fit(df_in, target_series)

        log_param(target_series.name + ' best estimator', gs_regr.best_params_)
        LOGGER.info("%s best estimator : %s", target_series.name,
                    str(gs_regr.best_estimator_))
        return self.regression(df_in, target_series, gs_regr.best_params_)

    def reset_importance_map(self, columns):
        """
        Create an empty importance map if none exists yet.

        An already existing importance map is left untouched.

        :param columns: List of column names for the importance map
        :type columns: pd.Index or array-like
        """
        if self._importances_map is None:
            self._importances_map = pd.DataFrame(data={}, columns=columns)

    def common_mlf_logging(self):
        """ Log the parameters used for gridsearch and regression
            in mlflow
        """
        log_param('Test size', self.xcorr_params['test_size'])
        # Log the class name of the regressor that is actually selected
        # ("XGB" still logs 'XGBRegressor'); fall back to the raw name when
        # it is not a supported one.
        regressor_cls = self._REGRESSOR_CLASSES.get(self._regressor)
        if regressor_cls is None:
            model_name = str(self._regressor)
        else:
            model_name = regressor_cls.__name__
        log_param('Model', model_name)

    def gridsearch_mlf_logging(self):
        """ Log the parameters used for gridsearch
            in mlflow
        """
        log_param('Gridsearch scoring',
                  self.xcorr_params['gridsearch_scoring'])
        log_param('Gridsearch parameters', self.model_params)
        self.common_mlf_logging()

    def regression_mlf_logging(self):
        """ Log the parameters used for regression
            in mlflow.
        """
        self.common_mlf_logging()
        log_params(self.model_params)

    def __build_parameters(self, X):
        """ Remove features only from
            being predicted.

            The columns listed in the 'feature_columns' parameter are kept
            as predictors but are not used as prediction targets.

            :param X: The dataset
            :type X: pd.DataFrame
            :return: List of remaining features that are not removed
            :rtype: list
        """
        if self.xcorr_params['feature_columns'] is None:
            return list(X.columns)

        LOGGER.info('Removing features from the parameters : %s',
                    self.xcorr_params['feature_columns'])
        feature_to_remove = set(self.xcorr_params['feature_columns'])

        return [x for x in list(X.columns) if x not in feature_to_remove]
    
    
    
print("Ok")


Ok


In [4]:
"""
Module to launch different data analysis.
"""
import logging

from fets.math import TSIntegrale
from mlflow import set_experiment
import polaris
from polaris.data.graph import PolarisGraph
from polaris.data.readers import read_polaris_data
from polaris.dataset.metadata import PolarisMetadata
from polaris.learn.feature.extraction import create_list_of_transformers, \
    extract_best_features
from polaris.learn.predictor.cross_correlation_configurator import \
    CrossCorrelationConfigurator

LOGGER = logging.getLogger(__name__)


class NoFramesInInputFile(Exception):
    """Error signalling that the frames dataframe read from the input
    file is empty, so there is nothing to learn from.
    """


def feature_extraction(input_file, param_col):
    """
    Run feature extraction on a CSV file and print the best features.

        :param input_file: Path of a CSV file that will be
            converted to a dataframe
        :type input_file: str
        :param param_col: Target column name
        :type param_col: str
    """
    # Two window sizes give two transformers, hence two different pipelines
    window_sizes = ["5min", "15min"]
    pipelines = create_list_of_transformers(window_sizes, TSIntegrale)

    # Extract the best features of the two pipelines
    results = extract_best_features(input_file,
                                    pipelines,
                                    target_column=param_col,
                                    time_unit="ms")

    # The first result is the FeatureImportanceOptimization object
    # from polaris.learn.feature.selection
    # pylint: disable=E1101
    best_result = results[0]
    print(best_result.best_features)


# pylint: disable-msg=too-many-arguments
def cross_correlate(input_file,
                    regressor="XGB",
                    output_graph_file=None,
                    xcorr_configuration_file=None,
                    graph_link_threshold=0.1,
                    use_gridsearch=False,
                    csv_sep=',',
                    force_cpu=False):
    """
    Catch linear and non-linear correlations between all columns of the
    input data and write the resulting dependency graph as JSON.

        :param input_file: CSV or JSON file path that will be
            converted to a dataframe
        :type input_file: str
        :param regressor: Name of the regressor handed to XCorr,
            defaults to "XGB"
        :type regressor: str, optional
        :param output_graph_file: Output file path for the generated graph.
            It will overwrite if the file already exists. Defaults to None,
            which is '/tmp/polaris_graph_<regressor>.json'
        :type output_graph_file: str, optional
        :param xcorr_configuration_file: XCorr configuration file path,
            defaults to None. Refer to CrossCorrelationConfigurator for
            the default parameters
        :type xcorr_configuration_file: str, optional
        :param graph_link_threshold: Minimum link value to be considered
            as a link between two nodes
        :type graph_link_threshold: float, optional
        :param use_gridsearch: Use grid search for the cross correlation.
            If this is set to False, then it will just use regression.
            Defaults to False
        :type use_gridsearch: bool, optional
        :param csv_sep: The character that separates the columns inside of
            the CSV file, defaults to ','
        :type csv_sep: str, optional
        :param force_cpu: Force CPU for cross correlation, defaults to False
        :type force_cpu: bool, optional
        :raises NoFramesInInputFile: If there are no frames in the converted
            dataframe
    """
    # Reading input file - index is considered on first column
    metadata, frames = read_polaris_data(input_file, csv_sep)

    # Guard clause: nothing can be learnt from an empty dataframe
    if frames.empty:
        LOGGER.error("Empty list of frames -- nothing to learn from!")
        raise NoFramesInInputFile

    input_data = normalize_dataframe(frames)
    source = metadata['satellite_name']

    # The mlflow experiment is named after the satellite
    set_experiment(experiment_name=source)

    configurator = CrossCorrelationConfigurator(
        xcorr_configuration_file=xcorr_configuration_file,
        use_gridsearch=use_gridsearch,
        force_cpu=force_cpu)
    configuration = configurator.get_configuration()

    # Creating and fitting cross-correlator
    xcorr = XCorr(metadata, configuration, regressor)
    xcorr.fit(input_data)

    # Default output path embeds the regressor name
    if output_graph_file is None:
        output_graph_file = "".join(
            ["/tmp/polaris_graph_", regressor, ".json"])

    # The graph only carries the satellite name as metadata
    graph = PolarisGraph(metadata=PolarisMetadata({"satellite_name": source}))
    graph.from_heatmap(xcorr.importances_map, graph_link_threshold)

    with open(output_graph_file, 'w') as graph_file:
        graph_file.write(graph.to_json())


def normalize_dataframe(dataframe):
    """
        Apply dataframe modification so it's compatible
        with the learn module. The time column (``ut_ms``) becomes
        the index of the returned dataframe and is removed from
        its columns.

        The input dataframe is left untouched: a new dataframe is
        returned, so calling this function again on the same input
        (e.g. when re-running a notebook cell) keeps working.

        :param dataframe: The pandas dataframe to normalize
        :type dataframe: pd.DataFrame
        :raises KeyError: If the dataframe has no ``ut_ms`` column
        :return: Pandas dataframe normalized
        :rtype: pd.DataFrame
    """
    # set_index drops the column from the data and returns a new frame,
    # which replaces the former in-place index assignment + drop.
    return dataframe.set_index('ut_ms')

print("Ok")

Ok


In [6]:
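# Generate the graphs (one cross_correlate run per regressor).
# NOTE(review): this cell reads "marsexpress_dataset.csv" while the cells
# below read "marsexpress_dataset_medium.csv" -- confirm this is intended.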

print("XGBoosting")
cross_correlate("marsexpress_dataset.csv", "XGB")


XGBoosting


In [23]:

print("AdaBoost")
cross_correlate("marsexpress_dataset_medium.csv", "AdaBoost")


AdaBoost


In [24]:

print("Gradient boosting")
cross_correlate("marsexpress_dataset_medium.csv", "GradientBoosting")

Gradient boosting


In [25]:
print("Random Forest")
cross_correlate("marsexpress_dataset_medium.csv", "RandomForest")

Random Forest


In [26]:

print("Extra trees")
cross_correlate("marsexpress_dataset_medium.csv", "ExtraTrees")

Extra trees


In [6]:
from graph_to_object import getObjectFromGraph
import os
import json
import pandas as pd

def count_graph_nodes_and_links(file):
    """Load a graph JSON file and return its links and node ids.

    :param file: Path of the graph file WITHOUT the ".json" extension
    :type file: str
    :return: A dataframe with one row per link and the list of node ids
    :rtype: tuple(pd.DataFrame, list)
    """
    with open(file + ".json") as graph_file:
        content = json.load(graph_file)

    graph = content["graph"]
    node_ids = [node["id"] for node in graph["nodes"]]
    links = pd.DataFrame(list(graph["links"]))
    return links, node_ids


def get_free_nodes(df_links, list_nodes):
    """Return the nodes that take part in no link at all.

    A node is "free" when it appears neither in the "source" nor in the
    "target" column of the links dataframe. A dataframe without those
    columns (e.g. built from a graph with no links) makes every node free.

    :param df_links: Links dataframe with "source" and "target" columns
    :type df_links: pd.DataFrame
    :param list_nodes: Node ids to check
    :type list_nodes: list
    :return: The free nodes, in the order they appear in list_nodes
    :rtype: list
    """
    # Build the set of linked nodes once instead of converting both columns
    # to lists for every single node.
    linked_nodes = set()
    for endpoint in ("source", "target"):
        if endpoint in df_links.columns:
            linked_nodes.update(df_links[endpoint])

    return [node for node in list_nodes if node not in linked_nodes]

print("Ok")

Ok


In [7]:
# Load each exported graph as (links dataframe, list of node ids).
# NOTE(review): the XGB graph is read from "lightsail_graphs" while every
# other graph comes from "marsexpress_graphs" -- confirm the summary below
# is meant to compare graphs built from different datasets.
df_XGB_links, list_XGB_nodes = count_graph_nodes_and_links(
    "lightsail_graphs/polaris_graph_XGB")
df_randomforest_links, list_randomforest_nodes = count_graph_nodes_and_links(
    "marsexpress_graphs/polaris_graph_randomforest")
df_adaboost_links, list_adaboost_nodes = count_graph_nodes_and_links(
    "marsexpress_graphs/polaris_graph_adaboost")
df_extratrees_links, list_extratrees_nodes = count_graph_nodes_and_links(
    "marsexpress_graphs/polaris_graph_extratrees")
df_gradientboosting_links, list_gradientboosting_nodes = count_graph_nodes_and_links(
    "marsexpress_graphs/polaris_graph_GradientBoosting")

# Same order as the "Método" column of the summary table.
graph_summaries = [
    (df_XGB_links, list_XGB_nodes),
    (df_randomforest_links, list_randomforest_nodes),
    (df_adaboost_links, list_adaboost_nodes),
    (df_extratrees_links, list_extratrees_nodes),
    (df_gradientboosting_links, list_gradientboosting_nodes),
]

# One summary row per method: node count, link count, number of nodes
# without any link and mean link value.
todos = {
    "Método": ["XGB", "RandomForest", "AdaBoost", "ExtraTrees", "GradientBoosting"],
    "NumNodos": [len(nodes) for _, nodes in graph_summaries],
    "Num Links": [links.shape[0] for links, _ in graph_summaries],
    "Num Nodos Libres": [len(get_free_nodes(links, nodes))
                         for links, nodes in graph_summaries],
    "Intensidad De Relación Media": [links["value"].mean()
                                     for links, _ in graph_summaries],
}

display(pd.DataFrame(todos))
display(list_extratrees_nodes[:10])
display(df_extratrees_links.head(10))

Unnamed: 0,Método,NumNodos,Num Links,Num Nodos Libres,Intensidad De Relación Media
0,XGB,170,279,37,0.417207
1,RandomForest,70,129,3,0.286504
2,AdaBoost,70,158,2,0.215193
3,ExtraTrees,70,122,7,0.220408
4,GradientBoosting,70,158,0,0.29655


['NPWD2372',
 'NPWD2401',
 'NPWD2402',
 'NPWD2451',
 'NPWD2471',
 'NPWD2472',
 'NPWD2481',
 'NPWD2482',
 'NPWD2491',
 'NPWD2501']

Unnamed: 0,source,target,value
0,NPWD2372,NPWD2491,0.214113
1,NPWD2372,NPWD2791,0.170794
2,NPWD2372,PSF,0.263657
3,NPWD2401,NPWD2482,0.827711
4,NPWD2401,NPWD2722,0.111661
5,NPWD2471,NPWD2871,0.100953
6,NPWD2472,NPWD2792,0.746384
7,NPWD2482,NPWD2401,0.846373
8,NPWD2482,NPWD2722,0.132483
9,NPWD2491,NPWD2372,0.220936
