diff --git a/README.md b/README.md index 45e41ce..af7ab89 100644 --- a/README.md +++ b/README.md @@ -233,8 +233,14 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail() * add whatever you need for yourself and share it with us ## Change Log + +### 0.0.27 +* made my life easier: now I just do "from pandas_ml_utils import pd, np, FeaturesAndLabels, ..." +* features and labels can now be multi-dimensional, as a cell may contain a nested numpy array +* introduced sample weights which can be passed to the fit function, e.g. for the Keras fit + ### 0.0.25 / 26 -* refactored how traing and test data sets are split +* refactored how training and test data sets are split * allow to control the amount of young test data being used (useful for time series) * add sample weights i.e. to penalize loss per sample in a keras model diff --git a/pandas_ml_utils/__init__.py b/pandas_ml_utils/__init__.py index c9555a7..cf00b02 100644 --- a/pandas_ml_utils/__init__.py +++ b/pandas_ml_utils/__init__.py @@ -1,56 +1,56 @@ """Augment pandas DataFrame with methods for machine learning""" -__version__ = '0.0.26' +__version__ = '0.0.27' import logging -import pandas as pd -# imports to provide functionality via root import like import pandas_ml_utils as pmu; pmu.XY +from pandas.core.base import PandasObject as _PandasObject + +import numpy as np +import pandas as pd +import pandas_ml_utils.pandas_utils_extension as _df_ext +from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix as _plot_correlation_matrix +from pandas_ml_utils.analysis.selection import feature_selection as _feature_selection +from pandas_ml_utils.datafetching import fetch_cryptocompare_hourly as _fetch_cryptocompare_hourly, \ + fetch_cryptocompare_daily as _fetch_cryptocompare_daily, fetch_yahoo as _fetch_yahoo +from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels +from pandas_ml_utils.model.fitting.fitter import fit as _fit, predict as _predict, backtest as _backtest, \ + features_and_label_extractor as _features_and_label_extractor from pandas_ml_utils.model.models import Model, SkModel, KerasModel, MultiModel from pandas_ml_utils.wrappers.lazy_dataframe import LazyDataFrame -from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels - -# imports only used to augment pandas classes -from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, \ - extend_forecast, cloc2 -from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix -from pandas_ml_utils.datafetching.fetch_yahoo import fetch_yahoo -from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest, features_and_label_extractor -from pandas_ml_utils.analysis.selection import feature_selection -from pandas.core.base import PandasObject -from pandas_ml_utils.datafetching.fetch_cryptocompare import fetch_cryptocompare_daily, fetch_cryptocompare_hourly - # log provided classes _log = logging.getLogger(__name__) _log.debug(f"available {Model} classes {[SkModel, KerasModel, MultiModel]}") _log.debug(f"available other classes {[LazyDataFrame, FeaturesAndLabels]}") +_log.debug(f"numpy version {np.__version__}") +_log.debug(f"pandas version {pd.__version__}") # add functions to pandas # general utility functions -PandasObject.cloc2 = cloc2 -PandasObject.inner_join = inner_join -PandasObject.drop_re = drop_re -PandasObject.drop_zero_or_nan = drop_zero_or_nan -PandasObject.add_apply = add_apply
-PandasObject.shift_inplace = shift_inplace -PandasObject.extend_forecast = extend_forecast +_PandasObject.cloc2 = _df_ext.cloc2 +_PandasObject.inner_join = _df_ext.inner_join +_PandasObject.drop_re = _df_ext.drop_re +_PandasObject.drop_zero_or_nan = _df_ext.drop_zero_or_nan +_PandasObject.add_apply = _df_ext.add_apply +_PandasObject.shift_inplace = _df_ext.shift_inplace +_PandasObject.extend_forecast = _df_ext.extend_forecast # feature selection -PandasObject.plot_correlation_matrix = plot_correlation_matrix -PandasObject.feature_selection = feature_selection +_PandasObject.plot_correlation_matrix = _plot_correlation_matrix +_PandasObject.feature_selection = _feature_selection # provide fit, predict and backtest method -PandasObject.fit = fit -PandasObject.predict = predict -PandasObject.backtest = backtest +_PandasObject.fit = _fit +_PandasObject.predict = _predict +_PandasObject.backtest = _backtest # also provide the plan features and labels extractor -PandasObject.features_and_label_extractor = features_and_label_extractor +_PandasObject.features_and_label_extractor = _features_and_label_extractor # data fetcher -setattr(pd, 'fetch_yahoo', fetch_yahoo) -setattr(pd, 'fetch_cryptocompare_daily', fetch_cryptocompare_daily) -setattr(pd, 'fetch_cryptocompare_hourly', fetch_cryptocompare_hourly) +setattr(pd, 'fetch_yahoo', _fetch_yahoo) +setattr(pd, 'fetch_cryptocompare_daily', _fetch_cryptocompare_daily) +setattr(pd, 'fetch_cryptocompare_hourly', _fetch_cryptocompare_hourly) __doc__ = """ The main concept is to extend pandas DataFrame objects such that you can apply any statistical or machine learning diff --git a/pandas_ml_utils/analysis/correlation_analysis.py b/pandas_ml_utils/analysis/correlation_analysis.py index 06b0e35..630f604 100644 --- a/pandas_ml_utils/analysis/correlation_analysis.py +++ b/pandas_ml_utils/analysis/correlation_analysis.py @@ -1,4 +1,4 @@ -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from typing import Tuple diff --git a/pandas_ml_utils/analysis/selection.py b/pandas_ml_utils/analysis/selection.py index 966f328..5092445 100644 --- a/pandas_ml_utils/analysis/selection.py +++ b/pandas_ml_utils/analysis/selection.py @@ -1,6 +1,6 @@ import logging import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from typing import List, Iterable, Union, Tuple from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix, _sort_correlation, _plot_heatmap diff --git a/pandas_ml_utils/datafetching/__init__.py b/pandas_ml_utils/datafetching/__init__.py index e69de29..de27b42 100644 --- a/pandas_ml_utils/datafetching/__init__.py +++ b/pandas_ml_utils/datafetching/__init__.py @@ -0,0 +1,2 @@ +from .fetch_cryptocompare import * +from .fetch_yahoo import * \ No newline at end of file diff --git a/pandas_ml_utils/datafetching/fetch_cryptocompare.py b/pandas_ml_utils/datafetching/fetch_cryptocompare.py index 32a439f..3435344 100644 --- a/pandas_ml_utils/datafetching/fetch_cryptocompare.py +++ b/pandas_ml_utils/datafetching/fetch_cryptocompare.py @@ -1,7 +1,7 @@ import datetime import cachetools -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import pytz from pandas_ml_utils.extern.cryptocompare import CURR, LIMIT, TIME, get_historical_price_day, get_historical_price_hour diff --git a/pandas_ml_utils/datafetching/fetch_yahoo.py b/pandas_ml_utils/datafetching/fetch_yahoo.py index 24fd4e2..86a2c22 100644 
--- a/pandas_ml_utils/datafetching/fetch_yahoo.py +++ b/pandas_ml_utils/datafetching/fetch_yahoo.py @@ -2,7 +2,7 @@ import traceback import cachetools.func -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from ..pandas_utils_extension import inner_join from ..utils.functions import join_kwargs diff --git a/pandas_ml_utils/extern/cryptocompare.py b/pandas_ml_utils/extern/cryptocompare.py index 8659f4c..ca33130 100644 --- a/pandas_ml_utils/extern/cryptocompare.py +++ b/pandas_ml_utils/extern/cryptocompare.py @@ -4,7 +4,7 @@ import logging import time -import cachetools +import cachetools.func import requests _log = logging.getLogger(__name__) diff --git a/pandas_ml_utils/model/features_and_labels/features_and_labels.py b/pandas_ml_utils/model/features_and_labels/features_and_labels.py index 6658a8c..25b43c1 100644 --- a/pandas_ml_utils/model/features_and_labels/features_and_labels.py +++ b/pandas_ml_utils/model/features_and_labels/features_and_labels.py @@ -4,7 +4,7 @@ from typing import List, Callable, Iterable, Dict, Type, Tuple, Union, Any import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder from pandas_ml_utils.utils.functions import join_kwargs @@ -27,6 +27,7 @@ def __init__(self, features: List[str], labels: _LABELS, label_type: Type = None, + sample_weights: Union[Dict[str, str], str] = None, gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None, targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None, feature_lags: Iterable[int] = None, @@ -43,6 +44,8 @@ def __init__(self, the average was. It is also possible to provide a Callable[[df, ...magic], labels] which returns the expected data structure. :param label_type: whether to treat a label as int, float, bool + :param sample_weights: sample weights are passed to the model's fit function. In Keras, for example, this can be + used to handle imbalanced classes :param gross_loss: expects a callable[[df, target, ...magic], df] which receives the source data frame and a target (or None) and should return a series or data frame. Let's say you want to classify whether a printer is jamming the next page or not. 
Halting and servicing the printer costs @@ -68,6 +71,7 @@ """ self._features = features self._labels = labels + self._weights = sample_weights self._targets = targets self._gross_loss = gross_loss self.label_type = label_type @@ -89,6 +93,10 @@ def features(self): def labels(self): return self._labels + @property + def weights(self): + return self._weights + @property def targets(self): return self._targets diff --git a/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py b/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py index 2d602b2..9b682ab 100644 --- a/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py +++ b/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py @@ -4,7 +4,7 @@ from typing import Tuple, Dict, Union, List import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from sortedcontainers import SortedDict from pandas_ml_utils.constants import * @@ -24,6 +24,7 @@ class FeatureTargetLabelExtractor(object): def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs): # prepare fields labels = features_and_labels.labels + weights = features_and_labels.weights encoder = lambda frame, **kwargs: frame label_columns = None joined_kwargs = join_kwargs(features_and_labels.kwargs, kwargs) @@ -49,12 +50,18 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **k t: l if isinstance(l, TargetLabelEncoder) else IdentityEncoder(l) for t, l in labels.items() }).encode + # flatten weights for multi models + if isinstance(weights, Dict): + weights = [l for t in labels.keys() for l in weights[t]] + # assign all fields self._features_and_labels = features_and_labels # depricated copy all fields here self._features = features_and_labels.features self._labels_columns = label_columns + self._labels = labels self._label_type = features_and_labels.label_type + self._weight_columns = weights self._targets = features_and_labels.targets self._gross_loss = features_and_labels.gross_loss self._encoder = encoder @@ -133,19 +140,17 @@ def prediction_to_frame(self, def training_and_test_data(self, test_size: float = 0.4, youngest_size: float = None, - seed: int = 42) -> Tuple[Tuple[np.ndarray,...], Tuple[np.ndarray,...]]: + seed: int = 42) -> Tuple[Tuple[pd.DataFrame,...], Tuple[pd.DataFrame,...]]: features, labels, weights = self.features_labels_weights_df train_ix, test_ix = train_test_split(features.index, test_size, youngest_size, seed=seed) return ( - (train_ix, - features.loc[train_ix].values, - integrate_nested_arrays(labels.loc[train_ix].values), - weights.loc[train_ix].values if weights is not None else None), - (test_ix, - features.loc[test_ix].values, - integrate_nested_arrays(labels.loc[test_ix].values), - weights.loc[test_ix].values if weights is not None else None) + (features.loc[train_ix], + labels.loc[train_ix], + weights.loc[train_ix] if weights is not None else None), + (features.loc[test_ix], + labels.loc[test_ix], + weights.loc[test_ix] if weights is not None else None) ) @property @@ -155,11 +160,15 @@ def features_labels_weights_df(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Dat df_labels = self.labels_df index_intersect = df_features.index.intersection(df_labels.index) + # engineer sample weights + df_weights = self.weights_df + if df_weights is not None: + index_intersect = index_intersect.intersection(df_weights.index) + # select only joining index values df_features = 
df_features.loc[index_intersect] df_labels = df_labels.loc[index_intersect] - # TODO add proper label weights - df_weights = None #pd.DataFrame(np.ones(len(df_labels)), index=df_labels.index) + df_weights = None if df_weights is None else df_weights.loc[index_intersect] # sanity check if not len(df_features) == len(df_labels): @@ -177,7 +186,10 @@ def features_df(self) -> pd.DataFrame: feature_rescaling = self._features_and_labels.feature_rescaling # drop nan's and copy frame - df = self._df[features].dropna().copy() + try: + df = self._df[features].dropna().copy() + except KeyError: + raise KeyError(f'one of the keys >{features}< is not in: {self._df.columns}') # generate feature matrix if feature_lags is None: @@ -221,9 +233,6 @@ def features_df(self) -> pd.DataFrame: dff[col] = tmp[col] _log.info(f" make features ... done in {pc() - start_pc: .2f} sec!") - - # finally patch the "values" property for features data frame and return - dff.__class__ = _RNNShapedValuesDataFrame return dff @property @@ -245,12 +254,29 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]: return labels if level_above is None else [(level_above, col) for col in labels] @property + @lru_cache(maxsize=1) def labels_df(self) -> pd.DataFrame: # here we can do all sorts of tricks and encodings ... # joined_kwargs(self._features_and_labels.kwargs, self.) - df = self._encoder(self._df[self._labels_columns], **self._joined_kwargs).dropna().copy() + try: + df = self._df[self._labels_columns].dropna() + except KeyError: + raise KeyError(f'one of the keys >{self._labels_columns}< is not in: {self._df.columns}') + + df = self._encoder(df, **self._joined_kwargs).dropna().copy() return df if self._label_type is None else df.astype(self._label_type) + @property + def weights_df(self) -> pd.DataFrame: + if self._weight_columns is not None: + try: + return self._df[self._weight_columns].dropna().copy() + except KeyError: + raise KeyError(f'one of the keys >{self._weight_columns}< is not in: {self._df.columns}') + + else: + return None + @property def source_df(self): df = self._df.copy() @@ -306,51 +332,6 @@ def target_df(self): return df - def _fix_shape(self, df_features): - # features eventually are in [feature, row, time_step] - # but need to be in RNN shape which is [row, time_step, feature] - feature_arr = df_features.values if self._features_and_labels.feature_lags is None else \ - np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2) - - if len(feature_arr) <= 0: - _log.warning("empty feature array!") - - return feature_arr def __str__(self): return f'min required data = {self.min_required_samples}' - -class _RNNShapedValuesDataFrame(pd.DataFrame): - - class Loc(): - def __init__(self, df): - self.df = df - - def __getitem__(self, item): - res = self.df.loc[item] - res.__class__ = _RNNShapedValuesDataFrame - return res - - @property - def loc(self): - return _RNNShapedValuesDataFrame.Loc(super(pd.DataFrame, self)) - - @property - def values(self): - top_level_columns = unique_top_level_columns(self) - - # we need to do a sneaky trick here to get a proper "super" object as super() does not work as expected - # so we simply rename with an empty dict - df = self.rename({}) - - # features eventually are in [feature, row, time_step] - # but need to be in RNN shape which is [row, time_step, feature] - feature_arr = df.values if top_level_columns is None else \ - np.array([df[feature].values for feature in top_level_columns], 
ndmin=3).swapaxes(0, 1).swapaxes(1, 2) - - if len(feature_arr) <= 0: - _log.warning("empty feature array!") - - return feature_arr diff --git a/pandas_ml_utils/model/features_and_labels/target_encoder.py b/pandas_ml_utils/model/features_and_labels/target_encoder.py index 9a779e2..d222224 100644 --- a/pandas_ml_utils/model/features_and_labels/target_encoder.py +++ b/pandas_ml_utils/model/features_and_labels/target_encoder.py @@ -1,6 +1,6 @@ from copy import deepcopy -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import numpy as np from typing import Iterable, List, Dict, Union, Callable diff --git a/pandas_ml_utils/model/fitting/fit.py b/pandas_ml_utils/model/fitting/fit.py index d7e24e1..9e995a0 100644 --- a/pandas_ml_utils/model/fitting/fit.py +++ b/pandas_ml_utils/model/fitting/fit.py @@ -1,6 +1,6 @@ from typing import Any -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import os from pandas_ml_utils.model.models import Model from pandas_ml_utils.summary.summary import Summary diff --git a/pandas_ml_utils/model/fitting/fitter.py b/pandas_ml_utils/model/fitting/fitter.py index 51d6eed..38d81a9 100644 --- a/pandas_ml_utils/model/fitting/fitter.py +++ b/pandas_ml_utils/model/fitting/fitter.py @@ -5,11 +5,10 @@ from typing import Callable, Tuple, Dict, TYPE_CHECKING import numpy as np -import pandas as pd from sklearn.exceptions import ConvergenceWarning -from sklearn.model_selection import train_test_split as sk_train_test_split from sklearn.utils.testing import ignore_warnings +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.model.features_and_labels.features_and_labels_extractor import FeatureTargetLabelExtractor from pandas_ml_utils.model.fitting.fit import Fit from pandas_ml_utils.model.models import Model @@ -79,8 +78,8 @@ def fit(df: pd.DataFrame, _log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!") # assemble result objects - df_train = features_and_labels.prediction_to_frame(model.predict(train[1]), index=train[0], inclusive_labels=True) - df_test = features_and_labels.prediction_to_frame(model.predict(test[1]), index=test[0], inclusive_labels=True) \ + df_train = features_and_labels.prediction_to_frame(model.predict(train[0].feature_values), index=train[0].index, inclusive_labels=True) + df_test = features_and_labels.prediction_to_frame(model.predict(test[0].feature_values), index=test[0].index, inclusive_labels=True) \ if len(test[0]) > 0 else None # update minimum required samples @@ -91,8 +90,8 @@ def fit(df: pd.DataFrame, def __train_loop(model, cross_validation, train, test): - x_train, y_train, w_train = train[1], train[2], train[3] - x_test, y_test, w_test = test[1], test[2], test[3] + x_train, y_train, w_train = train[0].feature_values, train[1].label_values, train[2].values if train[2] is not None else None + x_test, y_test, w_test = test[0].feature_values, test[1].label_values, test[2].values if test[2] is not None else None # apply cross validation if cross_validation is not None and isinstance(cross_validation, Tuple) and callable(cross_validation[1]): @@ -157,7 +156,7 @@ def predict(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame: features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels, **model.kwargs) x = features_and_labels.features_df - y_hat = model.predict(x.values) + y_hat = model.predict(x.feature_values) return features_and_labels.prediction_to_frame(y_hat, index=x.index, inclusive_labels=False) @@ 
-167,7 +166,7 @@ def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.Data # make training and test data sets x = features_and_labels.features_df - y_hat = model.predict(x.values) + y_hat = model.predict(x.feature_values) df_backtest = features_and_labels.prediction_to_frame(y_hat, index=x.index, inclusive_labels=True, inclusive_source=True) return (summary_provider or model.summary_provider)(df_backtest) diff --git a/pandas_ml_utils/model/fitting/splitting.py b/pandas_ml_utils/model/fitting/splitting.py index cc195b6..34f4c08 100644 --- a/pandas_ml_utils/model/fitting/splitting.py +++ b/pandas_ml_utils/model/fitting/splitting.py @@ -4,7 +4,7 @@ from typing import Tuple import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from sklearn.model_selection import train_test_split as sk_train_test_split _log = logging.getLogger(__name__) diff --git a/pandas_ml_utils/model/models.py b/pandas_ml_utils/model/models.py index 40b7853..e9ba983 100644 --- a/pandas_ml_utils/model/models.py +++ b/pandas_ml_utils/model/models.py @@ -10,7 +10,7 @@ import dill as pickle import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from sklearn.linear_model import LogisticRegression from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels @@ -432,12 +432,12 @@ def fit(self, losses = [] pos = 0 - for target, labels in self.features_and_labels.labels.items(): + for i, (target, labels) in enumerate(self.features_and_labels.labels.items()): index = range(pos, pos + len(labels)) - target_y = y[:,index] - target_y_val = y_val[:,index] - target_w = sample_weight_train[:,index] if sample_weight_train is not None else None - target_w_val = sample_weight_test[:,index] if sample_weight_test is not None else None + target_y = y[:, index] + target_y_val = y_val[:, index] + target_w = sample_weight_train[:, i] if sample_weight_train is not None else None + target_w_val = sample_weight_test[:, i] if sample_weight_test is not None else None _log.info(f"fit model for target {target}") losses.append(self.models[target].fit(x, target_y, x_val, target_y_val, target_w, target_w_val)) pos += len(labels) diff --git a/pandas_ml_utils/monkey_patched_dataframe.py b/pandas_ml_utils/monkey_patched_dataframe.py new file mode 100644 index 0000000..2a78e67 --- /dev/null +++ b/pandas_ml_utils/monkey_patched_dataframe.py @@ -0,0 +1,41 @@ +import logging + +import numpy as _np +import pandas as _pd +from pandas import * + +from pandas_ml_utils.utils.functions import unique_top_level_columns, integrate_nested_arrays + +_log = logging.getLogger(__name__) +__version__ = f'patched: {_pd.__version__}' + + +def _values2D(self): + values = self.values + return integrate_nested_arrays(values) + + +def _values3D(self): + values = integrate_nested_arrays(self.values) + + if isinstance(self.columns, MultiIndex): + top_level_columns = unique_top_level_columns(self) + + # features may arrive as [feature, row, time_step] + # but need to be in RNN shape which is [row, time_step, feature] + values3D = values if top_level_columns is None else \ + _np.array([self[top_level_col].values for top_level_col in top_level_columns], + ndmin=3).swapaxes(0, 1).swapaxes(1, 2) + + if len(values3D) <= 0: + _log.warning("empty values array!") + + return values3D + else: + # a normal data frame already has the 2D shape [row, time_steps] and is returned as is; + # the commented reshape would make the single-feature RNN shape [row, time_step, 1] explicit + return values #values.reshape((*values.shape, 1)) 
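+ +# Illustrative usage sketch (mirrors the new test__monkey_patch.py; the frame and the column names below are made up for this example): +# with MultiIndex columns whose first level keys the features and whose second level keys the time steps, +# df = DataFrame({("f1", "t-0"): [1, 2], ("f1", "t-1"): [2, 3], ("f2", "t-0"): [0, 1], ("f2", "t-1"): [1, 0]}) +# df.columns = MultiIndex.from_tuples(df.columns.tolist()) +# df.feature_values.shape # -> (2, 2, 2), i.e. [row, time_step, feature] +# df.label_values # plain 2D values with nested cell arrays integrated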
+ + +DataFrame.label_values = property(_values2D) +DataFrame.feature_values = property(_values3D) diff --git a/pandas_ml_utils/summary/binary_classification_summary.py b/pandas_ml_utils/summary/binary_classification_summary.py index 55b8ba6..bc5b562 100644 --- a/pandas_ml_utils/summary/binary_classification_summary.py +++ b/pandas_ml_utils/summary/binary_classification_summary.py @@ -4,7 +4,7 @@ from typing import Dict import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.constants import * from pandas_ml_utils.summary.summary import Summary diff --git a/pandas_ml_utils/summary/summary.py b/pandas_ml_utils/summary/summary.py index 018cbe2..5855fd9 100644 --- a/pandas_ml_utils/summary/summary.py +++ b/pandas_ml_utils/summary/summary.py @@ -1,4 +1,4 @@ -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import os diff --git a/pandas_ml_utils/utils/functions.py b/pandas_ml_utils/utils/functions.py index 38713d9..af56b64 100644 --- a/pandas_ml_utils/utils/functions.py +++ b/pandas_ml_utils/utils/functions.py @@ -6,7 +6,7 @@ from typing import Callable, Dict, Iterable, Any, List import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd def join_kwargs(*dicts) -> Dict: @@ -33,7 +33,11 @@ def unfold_parameter_space(parameter_space: Dict[str, Iterable], parameters: Dic def unique_top_level_columns(df: pd.DataFrame): - return unique(df.columns.get_level_values(0)) if isinstance(df.columns, pd.MultiIndex) else None + return unique(df.columns.get_level_values(0)) if isinstance(df.columns, pd.MultiIndex) else df.columns + + +def unique_top_level_rows(df: pd.DataFrame): + return unique(df.index.get_level_values(0)) if isinstance(df.index, pd.MultiIndex) else df.index def unique(items): @@ -53,7 +57,7 @@ def one_hot(index: int, number_of_classes: int): vec = np.zeros(number_of_classes) if index >= 0: - vec[index] = 1 + vec[int(index)] = 1 return vec diff --git a/pandas_ml_utils/wrappers/hashable_dataframe.py b/pandas_ml_utils/wrappers/hashable_dataframe.py index 20f7220..70a18d7 100644 --- a/pandas_ml_utils/wrappers/hashable_dataframe.py +++ b/pandas_ml_utils/wrappers/hashable_dataframe.py @@ -1,4 +1,4 @@ -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd # DEPRECATED diff --git a/pandas_ml_utils/wrappers/lazy_dataframe.py b/pandas_ml_utils/wrappers/lazy_dataframe.py index ef16b19..e5b1dec 100644 --- a/pandas_ml_utils/wrappers/lazy_dataframe.py +++ b/pandas_ml_utils/wrappers/lazy_dataframe.py @@ -2,7 +2,7 @@ from functools import lru_cache from typing import Callable, Union -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest diff --git a/requirements.txt b/requirements.txt index 141d103..b1b4de1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,9 @@ statsmodels tensorflow==1.14.0 matplotlib cachetools yfinance hyperopt +pymongo keras-rl requests seaborn diff --git a/test/extern/test__cryptocompare.py b/test/extern/test__cryptocompare.py index 90ce2ac..ed7aed1 100644 --- a/test/extern/test__cryptocompare.py +++ b/test/extern/test__cryptocompare.py @@ -1,6 +1,6 @@ import unittest import logging -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import pytest import pandas_ml_utils.extern.cryptocompare as cc diff --git a/test/unit_tests/analysis/test__feature_selection.py 
b/test/unit_tests/analysis/test__feature_selection.py index c27d702..550dd5a 100644 --- a/test/unit_tests/analysis/test__feature_selection.py +++ b/test/unit_tests/analysis/test__feature_selection.py @@ -1,10 +1,6 @@ import unittest -import pandas as pd - -import pandas_ml_utils as pmu - -print(pmu.__version__) +from pandas_ml_utils import pd class TestFeatureSelection(unittest.TestCase): diff --git a/test/unit_tests/gerneral/test__fetch_yahoo.py b/test/unit_tests/gerneral/test__fetch_yahoo.py index e64b7bc..915e9b8 100644 --- a/test/unit_tests/gerneral/test__fetch_yahoo.py +++ b/test/unit_tests/gerneral/test__fetch_yahoo.py @@ -1,12 +1,10 @@ import logging from unittest import TestCase -import pandas as pd -import pandas_ml_utils as pmu +from pandas_ml_utils import pd logger = logging.getLogger() logger.setLevel(logging.DEBUG) -logging.info(f"{pmu.__version__}") class TestFetchYahoo(TestCase): diff --git a/test/unit_tests/gerneral/test__hashable_dataframe.py b/test/unit_tests/gerneral/test__hashable_dataframe.py index 8aeee9f..cb72c6a 100644 --- a/test/unit_tests/gerneral/test__hashable_dataframe.py +++ b/test/unit_tests/gerneral/test__hashable_dataframe.py @@ -1,7 +1,7 @@ import math import unittest -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.wrappers.hashable_dataframe import HashableDataFrame diff --git a/test/unit_tests/gerneral/test__lazy_dataframe.py b/test/unit_tests/gerneral/test__lazy_dataframe.py index 7e19d65..aa6c6e6 100644 --- a/test/unit_tests/gerneral/test__lazy_dataframe.py +++ b/test/unit_tests/gerneral/test__lazy_dataframe.py @@ -1,6 +1,6 @@ import unittest -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from copy import deepcopy from pandas_ml_utils.wrappers.lazy_dataframe import LazyDataFrame diff --git a/test/unit_tests/model/fitting/test__make_train_test_data.py b/test/unit_tests/model/fitting/test__make_train_test_data.py index 629ad7b..1c84cdb 100644 --- a/test/unit_tests/model/fitting/test__make_train_test_data.py +++ b/test/unit_tests/model/fitting/test__make_train_test_data.py @@ -1,9 +1,8 @@ import unittest import numpy as np -import pandas as pd +from pandas_ml_utils import pd, FeaturesAndLabels -import pandas_ml_utils as pdu from pandas_ml_utils.model.features_and_labels.features_and_labels_extractor import FeatureTargetLabelExtractor from pandas_ml_utils.model.fitting.splitting import train_test_split @@ -42,7 +41,7 @@ def test_make_youngest_training_data(self): """when""" normal_train, normal_test = train_test_split(df.index, test_size=0.5, seed='youngest') lagged_train, lagged_test = train_test_split( - FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[0, 1])).features_labels_weights_df[0].index, + FeatureTargetLabelExtractor(df, FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[0, 1])).features_labels_weights_df[0].index, test_size=0.5, seed='youngest') @@ -71,8 +70,8 @@ def test_lag_smoothing_nan(self): "labelB": [5, 4, 3, 2, 1, 0, 1, 2, 3, None]}) """when lag smoothing is enabled using shift (which is introducing nan into the data frame)""" - fl = pdu.FeaturesAndLabels(["featureA"], ["labelB"], feature_lags=[0, 1], - lag_smoothing={1: lambda df: df["featureA"].shift(2)}) + fl = FeaturesAndLabels(["featureA"], ["labelB"], feature_lags=[0, 1], + lag_smoothing={1: lambda df: df["featureA"].shift(2)}) f, l, _ = FeatureTargetLabelExtractor(df, fl).features_labels_weights_df len_features = 10 - 1 - 2 @@ -85,13 +84,13 @@ def 
test_lag_smoothing_nan(self): def test_hashable_features_and_labels(self): """given""" - a = pdu.FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4], - lag_smoothing={2: lambda df: df[["featureA"]] * 2, - 4: lambda df: df[["featureA"]] * 4}) + a = FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4], + lag_smoothing={2: lambda df: df[["featureA"]] * 2, + 4: lambda df: df[["featureA"]] * 4}) - b = pdu.FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4], - lag_smoothing={2: lambda df: df[["featureA"]] * 2, - 4: lambda df: df[["featureA"]] * 4}) + b = FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4], + lag_smoothing={2: lambda df: df[["featureA"]] * 2, + 4: lambda df: df[["featureA"]] * 4}) """expect""" self.assertEqual(hash(a), hash(a)) @@ -109,12 +108,12 @@ def test_feature_scaling_3d(self): "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) """when""" - fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"], - ["labelA"], - feature_lags=[1, 2], - feature_rescaling={("featureA", "featureC"): (-1, 1)}) + fl = FeaturesAndLabels(["featureA", "featureB", "featureC"], + ["labelA"], + feature_lags=[1, 2], + feature_rescaling={("featureA", "featureC"): (-1, 1)}) - f = FeatureTargetLabelExtractor(df, fl).features_df.values + f = FeatureTargetLabelExtractor(df, fl).features_df.feature_values """then""" self.assertEqual((8, 2, 3), f.shape) @@ -130,11 +129,11 @@ def test_feature_scaling_2d(self): "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) """when""" - fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"], - ["labelA"], - feature_rescaling={("featureA", "featureC"): (-1, 1)}) + fl = FeaturesAndLabels(["featureA", "featureB", "featureC"], + ["labelA"], + feature_rescaling={("featureA", "featureC"): (-1, 1)}) - f = FeatureTargetLabelExtractor(df, fl).features_df.values + f = FeatureTargetLabelExtractor(df, fl).features_df.feature_values "then" np.testing.assert_array_almost_equal(f[0], np.array([-1, 0.1, 1])) @@ -156,11 +155,11 @@ def test_lagging(self): "labelA": range(20)}) """when""" - fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"], - ["labelA"], - feature_lags=[0,1,2,3,4]) + fl = FeaturesAndLabels(["featureA", "featureB", "featureC"], + ["labelA"], + feature_lags=[0,1,2,3,4]) - f = FeatureTargetLabelExtractor(df, fl).features_df.values + f = FeatureTargetLabelExtractor(df, fl).features_df.feature_values """then""" self.assertEqual(len(f), len(df) - 4) @@ -177,8 +176,8 @@ def test_make_prediction_data(self): "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) """when""" - fl = pdu.FeaturesAndLabels(["featureA"], ["labelA"]) - f = FeatureTargetLabelExtractor(df, fl).features_df.values + fl = FeaturesAndLabels(["featureA"], ["labelA"]) + f = FeatureTargetLabelExtractor(df, fl).features_df.feature_values """then""" self.assertEqual((10, 1), f.shape) diff --git a/test/unit_tests/model/test__encoders.py b/test/unit_tests/model/test__encoders.py index 35ed172..704d553 100644 --- a/test/unit_tests/model/test__encoders.py +++ b/test/unit_tests/model/test__encoders.py @@ -1,5 +1,5 @@ from unittest import TestCase -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd import numpy as np from pandas._libs.interval import Interval diff --git a/test/unit_tests/model/test__features_and_labels_extraction.py b/test/unit_tests/model/test__features_and_labels_extraction.py index 6cdea70..0a54859 100644 --- a/test/unit_tests/model/test__features_and_labels_extraction.py +++ 
b/test/unit_tests/model/test__features_and_labels_extraction.py @@ -1,7 +1,7 @@ from unittest import TestCase import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels from pandas_ml_utils.model.features_and_labels.features_and_labels_extractor import FeatureTargetLabelExtractor @@ -39,9 +39,9 @@ def test_simple(self): f, l, _ = FeatureTargetLabelExtractor(DF, fl).features_labels_weights_df """then""" - np.testing.assert_array_almost_equal(f.values, np.array([[[3], [2], [1]], - [[4], [3], [2]], - [[5], [4], [3]]])) + np.testing.assert_array_almost_equal(f.feature_values, np.array([[[3], [2], [1]], + [[4], [3], [2]], + [[5], [4], [3]]])) def test_pre_processor(self): """given""" diff --git a/test/unit_tests/model/test__save_load.py b/test/unit_tests/model/test__save_load.py index 55e91ed..e6ddba1 100644 --- a/test/unit_tests/model/test__save_load.py +++ b/test/unit_tests/model/test__save_load.py @@ -1,7 +1,6 @@ from unittest import TestCase import numpy as np -import pandas as pd from keras import Sequential from keras.layers import Dense from sklearn.ensemble import RandomForestClassifier @@ -9,8 +8,7 @@ from sklearn.neural_network import MLPClassifier from sklearn.svm import LinearSVC -import pandas_ml_utils as pmu -from pandas_ml_utils import LazyDataFrame +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels, LazyDataFrame, MultiModel, KerasModel, Model df = pd.DataFrame({"a": [0.5592344, 0.60739384, 0.19994533, 0.56642537, 0.50965677, 0.168989, 0.94080671, 0.76651769, 0.8403563, 0.4003567, @@ -23,7 +21,7 @@ class TestSaveLoad(TestCase): def test_save_load_models(self): """given""" - features_and_labels = pmu.FeaturesAndLabels(["a"], ["b"]) + features_and_labels = FeaturesAndLabels(["a"], ["b"]) def keras_model_provider(optimizer='adam'): model = Sequential() @@ -32,13 +30,13 @@ def keras_model_provider(optimizer='adam'): return model providers = [ - pmu.SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(1, 1), alpha=0.001, random_state=42), - features_and_labels, foo='bar'), - pmu.SkModel(LogisticRegression(), features_and_labels), - pmu.SkModel(LinearSVC(), features_and_labels), - pmu.SkModel(RandomForestClassifier(), features_and_labels), - pmu.KerasModel(keras_model_provider, features_and_labels), - pmu.MultiModel(pmu.SkModel(LogisticRegression(), pmu.FeaturesAndLabels(["a"], {"b": ["b"]}))) + SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(1, 1), alpha=0.001, random_state=42), + features_and_labels, foo='bar'), + SkModel(LogisticRegression(), features_and_labels), + SkModel(LinearSVC(), features_and_labels), + SkModel(RandomForestClassifier(), features_and_labels), + KerasModel(keras_model_provider, features_and_labels), + MultiModel(SkModel(LogisticRegression(), FeaturesAndLabels(["a"], {"b": ["b"]}))) ] """when""" @@ -46,7 +44,7 @@ def keras_model_provider(optimizer='adam'): models = [] for i, f in enumerate(fits): f.save_model(f'/tmp/pandas-ml-utils-unittest-test_model_{i}') - models.append((f.model, pmu.Model.load(f'/tmp/pandas-ml-utils-unittest-test_model_{i}'))) + models.append((f.model, Model.load(f'/tmp/pandas-ml-utils-unittest-test_model_{i}'))) """then""" for i, (fitted_model, restored_model) in enumerate(models): @@ -56,13 +54,13 @@ def keras_model_provider(optimizer='adam'): def test_model_with_LazyDataFrame_copy(self): """given""" - model = pmu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', 
hidden_layer_sizes=(1, 1), alpha=0.001, random_state=42), - pmu.FeaturesAndLabels([], []), foo='bar', ldf=LazyDataFrame(None, foo=lambda _f: 'bar')) + FeaturesAndLabels([], []), foo='bar', ldf=LazyDataFrame(None, foo=lambda _f: 'bar')) """when""" model.save(f'/tmp/pandas-ml-utils-unittest-test_model_LDF') - model2 = pmu.Model.load(f'/tmp/pandas-ml-utils-unittest-test_model_LDF') + model2 = Model.load(f'/tmp/pandas-ml-utils-unittest-test_model_LDF') """then""" self.assertEqual(model.kwargs["ldf"], model2.kwargs["ldf"]) @@ -71,7 +69,7 @@ def test_model_with_LazyDataFrame_copy(self): def test_save_load_keras_custom_loss(self): """given""" - features_and_labels = pmu.FeaturesAndLabels(["a"], ["b"]) + features_and_labels = FeaturesAndLabels(["a"], ["b"]) name = '/tmp/pandas-ml-utils-unittest-test_model_keras_custom_loss' def loss_provider(foo): @@ -90,11 +88,11 @@ def keras_model_provider(): return model, loss_provider("bar") """when""" - fit = df.fit(pmu.KerasModel(keras_model_provider, features_and_labels, optimizer='adam', verbose=0)) + fit = df.fit(KerasModel(keras_model_provider, features_and_labels, optimizer='adam', verbose=0)) fitted_model = fit.model fit.save_model(name) - restored_model = pmu.Model.load(name) + restored_model = Model.load(name) """then""" pd.testing.assert_frame_equal(df.predict(fitted_model), df.predict(restored_model)) diff --git a/test/unit_tests/summary/test__classification_summary.py b/test/unit_tests/summary/test__classification_summary.py index 54f0727..1eb3c95 100644 --- a/test/unit_tests/summary/test__classification_summary.py +++ b/test/unit_tests/summary/test__classification_summary.py @@ -2,7 +2,7 @@ from unittest import TestCase import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from matplotlib.figure import Figure from pandas_ml_utils.summary.binary_classification_summary import BinaryClassificationSummary @@ -71,4 +71,4 @@ def test_html(self): html = cs._repr_html_() """then""" - self.assertEqual(len(html), 154945) \ No newline at end of file + self.assertGreater(len(html), 150000) \ No newline at end of file diff --git a/test/unit_tests/summary/test__summary.py b/test/unit_tests/summary/test__summary.py index 3dc491c..6f8ffc3 100644 --- a/test/unit_tests/summary/test__summary.py +++ b/test/unit_tests/summary/test__summary.py @@ -1,7 +1,7 @@ import logging import unittest -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.summary.summary import Summary diff --git a/test/unit_tests/test__df_extensions.py b/test/unit_tests/test__df_extensions.py index 3e27f49..3d63d54 100644 --- a/test/unit_tests/test__df_extensions.py +++ b/test/unit_tests/test__df_extensions.py @@ -1,11 +1,6 @@ from unittest import TestCase -import numpy as np -import pandas as pd - -import pandas_ml_utils as pml - -print(pml.__version__) +from pandas_ml_utils import pd class TestDfExtensions(TestCase): diff --git a/test/unit_tests/test__monkey_patch.py b/test/unit_tests/test__monkey_patch.py new file mode 100644 index 0000000..566115e --- /dev/null +++ b/test/unit_tests/test__monkey_patch.py @@ -0,0 +1,42 @@ +from unittest import TestCase + +import pandas_ml_utils.monkey_patched_dataframe as pd +import numpy as np + + +class TestMonkeyPatch(TestCase): + + def test_patched_data_frame(self): + """given""" + df = pd.DataFrame( + { + ("A", "a"): [1, 2, 3, 4, 5], + ("A", "b"): [3, 2, 1, 0, 0], + ("A", "c"): [3, 2, 1, 0, 0], + ("B", "a"): [1, 2, 3, 1, 2], + ("B", "b"): [3, 2, 1, 0, 1], + ("B", "c"): [3, 
2, 1, 0, 1], + ("C", "a"): [np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4))], + ("C", "b"): [np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4))], + ("C", "c"): [np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4))], + ("D", "a"): [np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4)), np.ones((2, 4))], + }, + index=[1, 2, 3, 4, 5], + ) + df.columns = pd.MultiIndex.from_tuples(df.columns.tolist()) + + """when""" + print(df) + rnnShape = df[["A"]].feature_values + rnnShape2 = df[["A", "B"]].feature_values + rnnShapeExt = df["C"].feature_values + labelShape = df["D"].label_values + + """then""" + print(rnnShape.shape, rnnShape2.shape, rnnShapeExt.shape, labelShape.shape) + self.assertEqual((5, 3, 1), rnnShape.shape) + self.assertEqual((5, 3, 2), rnnShape2.shape) + self.assertEqual((5, 3, 2, 4), rnnShapeExt.shape) + self.assertEqual((5, 2, 4), labelShape.shape) + + diff --git a/test/unit_tests/utils/test__utils_functions.py b/test/unit_tests/utils/test__utils_functions.py index 01ac380..c17caf9 100644 --- a/test/unit_tests/utils/test__utils_functions.py +++ b/test/unit_tests/utils/test__utils_functions.py @@ -1,7 +1,7 @@ from unittest import TestCase import numpy as np -import pandas as pd +import pandas_ml_utils.monkey_patched_dataframe as pd from pandas_ml_utils.utils.functions import call_callable_dynamic_args, integrate_nested_arrays @@ -39,7 +39,7 @@ def test_call_dynamic_args_kwargs(self): self.assertRaises(Exception, lambda: call_callable_dynamic_args(lambda a, b: True, 1)) self.assertRaises(Exception, lambda: call_callable_dynamic_args(lambda a, b: True, 1, c=1)) - def test_inegrate_nested_array(self): + def test_integrate_nested_array(self): """given""" x = np.array([1, 2]) df = pd.DataFrame({"a": [np.zeros((4, 3)) for _ in range(10)], diff --git a/test/z_component_tests/test__classification.py b/test/z_component_tests/test__classification.py index ed02a15..a9b8886 100644 --- a/test/z_component_tests/test__classification.py +++ b/test/z_component_tests/test__classification.py @@ -2,14 +2,13 @@ import unittest import numpy as np -import pandas as pd from sklearn.neural_network import MLPClassifier +from test.config import TEST_FILE +from test.utils import SMA -import pandas_ml_utils as pdu +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels from pandas_ml_utils.constants import * from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets, OneHotEncodedDiscrete -from test.config import TEST_FILE -from test.utils import SMA logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -24,12 +23,12 @@ def test_binary_classification(self): df["sma"] = SMA(df["spy_Close"]) df["is_above"] = (df["spy_Close"] / df["sma"]) > 1 - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], - labels=["is_above"], - targets=lambda frame: frame["sma"], - gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])) + FeaturesAndLabels(features=['vix_Close'], + labels=["is_above"], + targets=lambda frame: frame["sma"], + gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])) """when""" fit = df.fit(model, test_size=0.4, test_validate_split_seed=42) @@ -58,11 +57,11 @@ def make_targets(frame): res.columns = ["close <0.1", "close <0.05", "close >0", "close >0.05"] return res - model = pdu.SkModel( + model = SkModel( 
MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], - labels=OneHotEncodedTargets("label", np.linspace(-0.1, 0.1, 5, endpoint=True)), - targets=make_targets)) + FeaturesAndLabels(features=['vix_Close'], + labels=OneHotEncodedTargets("label", np.linspace(-0.1, 0.1, 5, endpoint=True)), + targets=make_targets)) """when""" fit = df.fit(model, test_size=0.4, test_validate_split_seed=42,) @@ -71,7 +70,7 @@ def make_targets(frame): predict_df = df.predict(fit.model, tail=1) """then""" - self.assertEqual(len(fit_summary_df), 4023) + self.assertEqual(len(fit_summary_df), 4006) self.assertListEqual(fit_summary_df.columns.tolist(), [(PREDICTION_COLUMN_NAME, '(-inf, -0.05]'), (PREDICTION_COLUMN_NAME, '(-0.05, 0.0]'), (PREDICTION_COLUMN_NAME, '(0.0, 0.05000000000000002]'), (PREDICTION_COLUMN_NAME, '(0.05000000000000002, inf]'), (LABEL_COLUMN_NAME, '(-inf, -0.05]'), (LABEL_COLUMN_NAME, '(-0.05, 0.0]'), (LABEL_COLUMN_NAME, '(0.0, 0.05000000000000002]'), (LABEL_COLUMN_NAME, '(0.05000000000000002, inf]'), @@ -81,7 +80,7 @@ def make_targets(frame): [(PREDICTION_COLUMN_NAME, '(-inf, -0.05]'), (PREDICTION_COLUMN_NAME, '(-0.05, 0.0]'), (PREDICTION_COLUMN_NAME, '(0.0, 0.05000000000000002]'), (PREDICTION_COLUMN_NAME, '(0.05000000000000002, inf]'), (TARGET_COLUMN_NAME, 'close <0.1'), (TARGET_COLUMN_NAME, 'close <0.05'), (TARGET_COLUMN_NAME, 'close >0'), (TARGET_COLUMN_NAME, 'close >0.05')]) - self.assertEqual(bt_summary_df.shape, (6706, 23)) + self.assertEqual(bt_summary_df.shape, (6677, 23)) def test_target_classification(self): """given""" @@ -90,10 +89,10 @@ def test_target_classification(self): df["is_above_1.0"] = (df["spy_Close"] / df["sma"]) > 1 df["is_above_1.2"] = (df["spy_Close"] / df["sma"]) > 1.2 - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], - labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]})) + FeaturesAndLabels(features=['vix_Close'], + labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]})) """when""" fit = df.fit(model, test_size=0.4, test_validate_split_seed=42) @@ -117,11 +116,11 @@ def test_lagged_classification(self): df["sma"] = SMA(df["spy_Close"]) df["is_above"] = (df["spy_Close"] / df["sma"]) > 1 - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], - feature_lags=[0, 1, 2], - labels=["is_above"])) + FeaturesAndLabels(features=['vix_Close'], + feature_lags=[0, 1, 2], + labels=["is_above"])) """when""" @@ -147,10 +146,10 @@ def test_discrete_encoded_classes(self): df["label"] = (((df["spy_Close"] / df["sma"] -1) > 0.02).astype(int) - ((df["spy_Close"] / df["sma"] -1) < -0.02).astype(int)) + 1 - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], - labels=OneHotEncodedDiscrete("label", 3))) + FeaturesAndLabels(features=['vix_Close'], + labels=OneHotEncodedDiscrete("label", 3))) """when""" fit = df.fit(model, test_size=0.4, test_validate_split_seed=42,) diff --git a/test/z_component_tests/test__common.py b/test/z_component_tests/test__common.py index 54d4307..2e79bd6 100644 --- a/test/z_component_tests/test__common.py +++ b/test/z_component_tests/test__common.py @@ -1,7 +1,7 @@ import logging import unittest -import pandas as pd +import 
pandas_ml_utils.monkey_patched_dataframe as pd from test.config import TEST_FILE from pandas_ml_utils.analysis.correlation_analysis import _sort_correlation diff --git a/test/z_component_tests/test__cross_fold.py b/test/z_component_tests/test__cross_fold.py index bb30a44..0e4e7ef 100644 --- a/test/z_component_tests/test__cross_fold.py +++ b/test/z_component_tests/test__cross_fold.py @@ -1,17 +1,12 @@ import logging -import os import unittest -import numpy as np -import pandas as pd from sklearn.model_selection import KFold from sklearn.neural_network import MLPClassifier - -import pandas_ml_utils as pdu -from pandas_ml_utils.summary.binary_classification_summary import BinaryClassificationSummary -from pandas_ml_utils.utils.functions import fig_to_png_base64 from test.config import TEST_FILE +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels + logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -34,10 +29,10 @@ def split(x, y=None, group=None): """when""" fit = df.fit( - pdu.SkModel( + SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], labels=['label'], - gross_loss=lambda df: df["spy_Close"] - df["spy_Open"])), + FeaturesAndLabels(features=['vix_Close'], labels=['label'], + gross_loss=lambda df: df["spy_Close"] - df["spy_Open"])), test_size=0.4, cross_validation=(1, split), test_validate_split_seed=42) diff --git a/test/z_component_tests/test__cross_validation.py b/test/z_component_tests/test__cross_validation.py index 068c146..18ae903 100644 --- a/test/z_component_tests/test__cross_validation.py +++ b/test/z_component_tests/test__cross_validation.py @@ -1,13 +1,12 @@ import logging import unittest -import pandas as pd from sklearn.model_selection import KFold from sklearn.neural_network import MLPClassifier - -import pandas_ml_utils as pdu from test.config import TEST_FILE +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels + logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -24,9 +23,9 @@ def test_cross_validation(self): """when""" fit = df.fit( - pdu.SkModel( + SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42, max_iter=10), - pdu.FeaturesAndLabels(features=['vix_Close'], labels=['label'])), + FeaturesAndLabels(features=['vix_Close'], labels=['label'])), test_size=0.4, cross_validation = (2, cv.split), test_validate_split_seed=42) diff --git a/test/z_component_tests/test__encoding.py b/test/z_component_tests/test__encoding.py index c16288f..809de77 100644 --- a/test/z_component_tests/test__encoding.py +++ b/test/z_component_tests/test__encoding.py @@ -2,17 +2,12 @@ import unittest from typing import List -import pandas as pd import numpy as np -from sklearn.neural_network import MLPClassifier - -import pandas_ml_utils as pdu -from pandas_ml_utils.constants import * from test.config import TEST_FILE - -from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder from test.mocks.mock_model import MockModel +from pandas_ml_utils import pd, FeaturesAndLabels +from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder from pandas_ml_utils.utils.functions import integrate_nested_arrays logger = logging.getLogger() @@ -45,7 +40,7 @@ def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame: return res """when""" - model = MockModel(pdu.FeaturesAndLabels(["spy_Close"], ArrayEncoder(), feature_lags=[0, 1, 2])) + model = 
MockModel(FeaturesAndLabels(["spy_Close"], ArrayEncoder(), feature_lags=[0, 1, 2])) fit = df.fit(model) """then""" diff --git a/test/z_component_tests/test__fetch.py b/test/z_component_tests/test__fetch.py index 54b59f1..669f064 100644 --- a/test/z_component_tests/test__fetch.py +++ b/test/z_component_tests/test__fetch.py @@ -1,13 +1,10 @@ import logging import unittest -import pandas as pd - -import pandas_ml_utils as pdu +from pandas_ml_utils import pd logger = logging.getLogger() logger.setLevel(logging.DEBUG) -print(pdu.__version__) class ComponentTest(unittest.TestCase): diff --git a/test/z_component_tests/test__fnl_extractor.py b/test/z_component_tests/test__fnl_extractor.py index a22557a..4a000df 100644 --- a/test/z_component_tests/test__fnl_extractor.py +++ b/test/z_component_tests/test__fnl_extractor.py @@ -1,12 +1,11 @@ import logging import unittest -import pandas as pd from sklearn.neural_network import MLPClassifier +from test.config import TEST_FILE -import pandas_ml_utils as pdu +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels, LazyDataFrame from pandas_ml_utils.constants import * -from test.config import TEST_FILE logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -20,12 +19,12 @@ def test_extractor(self): """when""" extractor = df.features_and_label_extractor( - pdu.SkModel( + SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42), - pdu.FeaturesAndLabels(features=['feature'], labels=['label'], - gross_loss=lambda df: df["spy_Close"] - df["spy_Open"], - targets=lambda df: df["spy_Close"], - pre_processor=lambda _df: pdu.LazyDataFrame( + FeaturesAndLabels(features=['feature'], labels=['label'], + gross_loss=lambda df: df["spy_Close"] - df["spy_Open"], + targets=lambda df: df["spy_Close"], + pre_processor=lambda _df: LazyDataFrame( _df, feature=lambda f: f["vix_Close"].rolling(2).mean(), label=lambda f: (f["spy_Close"].shift(1) > f["spy_Open"]).shift(-1))))) diff --git a/test/z_component_tests/test__hyper_parameter_tuning.py b/test/z_component_tests/test__hyper_parameter_tuning.py index c2a4d00..a6e78f2 100644 --- a/test/z_component_tests/test__hyper_parameter_tuning.py +++ b/test/z_component_tests/test__hyper_parameter_tuning.py @@ -2,12 +2,11 @@ import unittest import numpy as np -import pandas as pd from sklearn.neural_network import MLPClassifier - -import pandas_ml_utils as pdu from test.config import TEST_FILE +from pandas_ml_utils import pd, SkModel, FeaturesAndLabels + logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -23,10 +22,10 @@ def test_hyper_parameter(self): """when fit with find hyper parameter""" fit = df.fit( - pdu.SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(features=['vix_Close'], labels=['label'], - target_columns=["vix_Open"], - loss_column="spy_Volume")), + SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), + FeaturesAndLabels(features=['vix_Close'], labels=['label'], + target_columns=["vix_Open"], + loss_column="spy_Volume")), test_size=0.4, test_validate_split_seed=42, hyper_parameter_space={'alpha': hp.choice('alpha', [0.0001, 10]), 'early_stopping': True, 'max_iter': 50, diff --git a/test/z_component_tests/test__keras_model_label_weight.py b/test/z_component_tests/test__keras_model_label_weight.py new file mode 100644 index 0000000..472ee19 --- /dev/null +++ b/test/z_component_tests/test__keras_model_label_weight.py @@ -0,0 +1,54 @@ +import os +from unittest import 
TestCase + +from keras.layers import Dense +from keras.models import Sequential +from test.config import TEST_FILE + +from pandas_ml_utils import pd, KerasModel, FeaturesAndLabels + +os.environ["CUDA_VISIBLE_DEVICES"] = "" + + +class TestKerasLossWeight(TestCase): + + def test_single_model(self): + """given""" + df = pd.read_csv(TEST_FILE, index_col='Date') + df["feature"] = df["spy_Close"].pct_change() + df["label"] = df["spy_Close"] > df["spy_Open"] + df["weight"] = df["spy_Close"].pct_change().abs() + + def model_provider(): + model = Sequential() + model.add(Dense(1, input_shape=(1,))) + model.add(Dense(10)) + model.add(Dense(1)) + + model.compile( + loss="mse", + optimizer='adam', + metrics=['accuracy'], + ) + + return model + + model = KerasModel( + model_provider, + FeaturesAndLabels( + features=["feature"], + labels=["label"], + sample_weights="weight" + ) + ) + + """when""" + fit = df.fit(model) + + """then""" + # no exception is thrown + + + def test_multi_model(self): + pass + diff --git a/test/z_component_tests/test__lazy_dataframe.py b/test/z_component_tests/test__lazy_dataframe.py index b86af4a..ae25bb0 100644 --- a/test/z_component_tests/test__lazy_dataframe.py +++ b/test/z_component_tests/test__lazy_dataframe.py @@ -1,12 +1,11 @@ import logging import unittest -import pandas as pd from sklearn.neural_network import MLPClassifier - -import pandas_ml_utils as pdu from test.config import TEST_FILE +from pandas_ml_utils import pd, LazyDataFrame, SkModel, FeaturesAndLabels + logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -16,14 +15,14 @@ class LazyDataFrameTest(unittest.TestCase): def test_fit_and_co(self): """given""" df = pd.read_csv(TEST_FILE, index_col='Date').tail(100) - ldf = pdu.LazyDataFrame( + ldf = LazyDataFrame( df, sma=lambda f: f["vix_Close"].rolling(2).mean(), label=lambda f: f["spy_Close"] > f["spy_Open"] ) - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels(["sma"], ["label"]) + FeaturesAndLabels(["sma"], ["label"]) ) """when""" @@ -39,12 +38,12 @@ def test_fit_and_co(self): def test_pre_process_and_fit_and_co(self): """given""" df = pd.read_csv(TEST_FILE, index_col='Date').tail(100) - model = pdu.SkModel( + model = SkModel( MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42), - pdu.FeaturesAndLabels( + FeaturesAndLabels( ["sma"], ["label"], - pre_processor=lambda _df: pdu.LazyDataFrame( + pre_processor=lambda _df: LazyDataFrame( _df, sma=lambda f: f["vix_Close"].rolling(2).mean(), label=lambda f: f["spy_Close"] > f["spy_Open"] diff --git a/test/z_component_tests/test__multi_model.py b/test/z_component_tests/test__multi_model.py index bacb307..c30e3c2 100644 --- a/test/z_component_tests/test__multi_model.py +++ b/test/z_component_tests/test__multi_model.py @@ -2,10 +2,10 @@ import unittest import numpy as np -import pandas as pd from sklearn.neural_network import MLPClassifier -import pandas_ml_utils as pdu +from pandas_ml_utils import pd, SkModel, MultiModel, FeaturesAndLabels + from pandas_ml_utils.constants import * from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets from test.config import TEST_FILE @@ -21,7 +21,7 @@ class MultiModelTest(unittest.TestCase): def test_invalid_multi_model(self): """expect""" self.assertRaises(ValueError, - lambda: pdu.MultiModel(pdu.MultiModel(pdu.SkModel(MLPClassifier(), pdu.FeaturesAndLabels([], {}))))) + lambda: 
 
     def test_multi_model_binary_classifications(self):
         """given"""
@@ -30,13 +30,13 @@ def test_multi_model_binary_classifications(self):
         df["is_above_1.0"] = (df["spy_Close"] / df["sma"]) > 1
         df["is_above_1.2"] = (df["spy_Close"] / df["sma"]) > 1.2
 
-        model = pdu.MultiModel(
-            pdu.SkModel(
-                MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
-                pdu.FeaturesAndLabels(features=['vix_Close'],
-                                      labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]},
-                                      targets=lambda frame, t: frame["sma"].rename(f"sma {t}"),
-                                      gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])))
+        model = MultiModel(
+            SkModel(
+                MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
+                FeaturesAndLabels(features=['vix_Close'],
+                                  labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]},
+                                  targets=lambda frame, t: frame["sma"].rename(f"sma {t}"),
+                                  gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])))
 
         """when"""
         fit = df.fit(model, test_size=0.4, test_validate_split_seed=42,)
@@ -64,14 +64,14 @@ def test_multi_model_multi_class_classifications(self):
         df["is_above_1.0"] = (df["spy_Close"] / df["sma"]) + 1
         df["is_above_1.2"] = (df["spy_Close"] / df["sma"]) + 2
 
-        model = pdu.MultiModel(
-            pdu.SkModel(
-                MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
-                pdu.FeaturesAndLabels(features=['vix_Close'],
-                                      labels={"1": OneHotEncodedTargets("is_above_1.0", np.linspace(-0.1, 0.1, 5, endpoint=True) + 1),
-                                              "2": OneHotEncodedTargets("is_above_1.2", np.linspace(-0.1, 0.1, 5, endpoint=True) + 2)},
-                                      targets=lambda frame, t: (frame["sma"] + int(t)).rename(f"sma {t}"),
-                                      gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])))
+        model = MultiModel(
+            SkModel(
+                MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
+                FeaturesAndLabels(features=['vix_Close'],
+                                  labels={"1": OneHotEncodedTargets("is_above_1.0", np.linspace(-0.1, 0.1, 5, endpoint=True) + 1),
+                                          "2": OneHotEncodedTargets("is_above_1.2", np.linspace(-0.1, 0.1, 5, endpoint=True) + 2)},
+                                  targets=lambda frame, t: (frame["sma"] + int(t)).rename(f"sma {t}"),
+                                  gross_loss=lambda frame: frame["spy_Close"] - frame["sma"])))
 
         """when"""
         fit = df.fit(model, test_size=0.4, test_validate_split_seed=42,)
@@ -94,5 +94,5 @@
                           ('2', 'prediction', '(-inf, 1.95]'), ('2', 'prediction', '(1.95, 2.0]'),
                           ('2', 'prediction', '(2.0, 2.05]'), ('2', 'prediction', '(2.05, inf]'),
                           ('1', TARGET_COLUMN_NAME, 'sma 1'), ('2', TARGET_COLUMN_NAME, 'sma 2')])
 
-        self.assertEqual(bt_summary_df.shape, (6706, 32))
+        self.assertEqual(bt_summary_df.shape, (6677, 32))
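The tests above pin down the MultiModel contract: nesting one MultiModel inside another raises a ValueError, and the labels-dict key is handed to the `targets` lambda as `t`, apparently so each sub-model can derive its own target column. A condensed sketch of that shape, assuming the same fixture; the `sma` rolling window here is a hypothetical stand-in for whatever the test computes earlier:

```python
# Condensed sketch of the MultiModel usage exercised above; the sma window
# is a hypothetical stand-in, the rest mirrors the test code.
from sklearn.neural_network import MLPClassifier
from test.config import TEST_FILE

from pandas_ml_utils import pd, SkModel, MultiModel, FeaturesAndLabels

df = pd.read_csv(TEST_FILE, index_col='Date')
df["sma"] = df["spy_Close"].rolling(20).mean()  # hypothetical window
df["is_above_1.0"] = (df["spy_Close"] / df["sma"]) > 1
df["is_above_1.2"] = (df["spy_Close"] / df["sma"]) > 1.2

model = MultiModel(                      # presumably one sub-model per labels key
    SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
            FeaturesAndLabels(features=['vix_Close'],
                              labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]},
                              # t is the labels key ("a" or "b") of the sub-model
                              targets=lambda frame, t: frame["sma"].rename(f"sma {t}"))))

fit = df.fit(model, test_size=0.4, test_validate_split_seed=42)
```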
diff --git a/test/z_component_tests/test__pre_processor.py b/test/z_component_tests/test__pre_processor.py
index d012209..ada8920 100644
--- a/test/z_component_tests/test__pre_processor.py
+++ b/test/z_component_tests/test__pre_processor.py
@@ -1,12 +1,11 @@
 import logging
 import unittest
 
-import pandas as pd
 from sklearn.neural_network import MLPClassifier
+from test.config import TEST_FILE
 
-import pandas_ml_utils as pdu
+from pandas_ml_utils import pd, SkModel, FeaturesAndLabels, LazyDataFrame
 from pandas_ml_utils.constants import *
-from test.config import TEST_FILE
 
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
@@ -20,11 +19,11 @@ def test_pre_processor(self):
 
         """when"""
         fit = df.fit(
-            pdu.SkModel(
+            SkModel(
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42),
-                pdu.FeaturesAndLabels(features=['feature'], labels=['label'], label_type=int,
-                                      gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
-                                      pre_processor=lambda _df: pdu.LazyDataFrame(
+                FeaturesAndLabels(features=['feature'], labels=['label'], label_type=int,
+                                  gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
+                                  pre_processor=lambda _df: LazyDataFrame(
                     _df,
                     feature=lambda f: f["vix_Close"].rolling(2).mean(),
                     label=lambda f: (f["spy_Close"].shift(1) > f["spy_Open"]).shift(-1)).to_dataframe())),
diff --git a/test/z_component_tests/test__regression.py b/test/z_component_tests/test__regression.py
index 390a9e5..9e300f1 100644
--- a/test/z_component_tests/test__regression.py
+++ b/test/z_component_tests/test__regression.py
@@ -1,12 +1,11 @@
 import logging
 import unittest
 
-import pandas as pd
 from sklearn.neural_network import MLPRegressor
-
-import pandas_ml_utils as pdu
 from test.config import TEST_FILE
 
+from pandas_ml_utils import pd, SkModel, FeaturesAndLabels
+
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 
@@ -19,9 +18,9 @@ def test_fit_regressor(self):
 
         """when"""
         fit = df.fit(
-            pdu.SkModel(
+            SkModel(
                 MLPRegressor(activation='tanh', hidden_layer_sizes=(4, 3, 2, 1, 2, 3, 4), random_state=42),
-                pdu.FeaturesAndLabels(
+                FeaturesAndLabels(
                     features=['spy_Open', 'spy_High', 'spy_Low', 'spy_Close'],
                     labels=['vix_Open', 'vix_High', 'vix_Low', 'vix_Close'],
                     targets=lambda frame: frame[['vix_Open', 'vix_High', 'vix_Low', 'vix_Close']].add_prefix("tgt_")
@@ -58,9 +57,9 @@ def test_fit_regressor_mutiple_target(self):
 
         """when"""
         fit = df.fit(
-            pdu.SkModel(
+            SkModel(
                 MLPRegressor(activation='tanh', hidden_layer_sizes=(4, 3, 2, 1, 2, 3, 4), random_state=42),
-                pdu.FeaturesAndLabels(
+                FeaturesAndLabels(
                     features=['spy_Open', 'spy_High', 'spy_Low', 'spy_Close'],
                     labels={"a": ['vix_Open'], "b": ['vix_High', 'vix_Low', 'vix_Close']},
                     targets=lambda frame, t: frame[['vix_High', 'vix_Low']].add_prefix(f"{t}_")
diff --git a/test/z_component_tests/test__summary.py b/test/z_component_tests/test__summary.py
index bcc2d4d..4c1c648 100644
--- a/test/z_component_tests/test__summary.py
+++ b/test/z_component_tests/test__summary.py
@@ -3,10 +3,10 @@
 import unittest
 
 import numpy as np
-import pandas as pd
 from sklearn.neural_network import MLPClassifier
 
-import pandas_ml_utils as pdu
+from pandas_ml_utils import pd, SkModel, FeaturesAndLabels
+
 from pandas_ml_utils.summary.binary_classification_summary import BinaryClassificationSummary
 from pandas_ml_utils.utils.functions import fig_to_png_base64
 from test.config import TEST_FILE
@@ -24,10 +24,10 @@ def test_binary_classification_summary(self):
 
         """when"""
         fit = df.fit(
-            pdu.SkModel(
+            SkModel(
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42),
-                pdu.FeaturesAndLabels(features=['vix_Close'], labels=['label'],
-                                      gross_loss=lambda df: df["spy_Close"] - df["spy_Open"]),
+                FeaturesAndLabels(features=['vix_Close'], labels=['label'],
+                                  gross_loss=lambda df: df["spy_Close"] - df["spy_Open"]),
                 BinaryClassificationSummary),
             test_size=0.4, test_validate_split_seed=42)
 
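The recurring change across all of these test files is the import style: instead of importing pandas and the package separately (`import pandas as pd` plus `import pandas_ml_utils as pdu`), each test now pulls the already monkey-patched `pd` module and the model classes from one place, so DataFrames expose `fit`, `predict` and `backtest` directly. A minimal sketch of that idiom, with "prices.csv" as a hypothetical input file:

```python
# Sketch of the import idiom the tests migrate to; one import yields the
# monkey-patched pandas module plus the model classes.
from sklearn.neural_network import MLPClassifier

from pandas_ml_utils import pd, SkModel, FeaturesAndLabels

df = pd.read_csv("prices.csv", index_col='Date')  # hypothetical file
df["label"] = df["spy_Close"] > df["spy_Open"]

# DataFrames read through this pd carry the .fit extension method
fit = df.fit(
    SkModel(MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
            FeaturesAndLabels(features=['vix_Close'], labels=['label'])),
    test_size=0.4,
    test_validate_split_seed=42)
```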