In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

## Librerias

In [2]:
from typing import Union, Tuple, Optional, Callable
import warnings
import logging
import sys
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from copy import copy, deepcopy
import inspect

import skforecast
from skforecast.ForecasterBase import ForecasterBase
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.exceptions import MissingValuesExogWarning
from skforecast.utils import initialize_lags
from skforecast.utils import initialize_weights
from skforecast.utils import check_select_fit_kwargs
from skforecast.utils import check_exog
from skforecast.utils import get_exog_dtypes
from skforecast.utils import check_exog_dtypes
from skforecast.utils import check_interval
from skforecast.utils import check_predict_input
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import transform_series
from skforecast.utils import transform_dataframe

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from sklearn.linear_model import LinearRegression

## Data

In [3]:
from skforecast.ForecasterAutoregMultiSeries.tests.fixtures_ForecasterAutoregMultiSeries import series

In [4]:
print(series.shape)
series.columns = ['l1', 'l2']
series.head(2)

(50, 2)


Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [5]:
# Exog for series l1
exog_l1 = pd.DataFrame({
              'exog1': np.arange(50),
              'exog2': np.arange(50, 100),
          })


# Exog for series l2
exog_l2 = pd.Series({'exog1': np.arange(100, 150)})


# Dictionary with exog for each series
exog = {
    'l1': exog_l1,
    'l2': exog_l2
}

## Create train X y

In [10]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [11]:
# Forecaster attributes needed
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog

print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None None


In [12]:
def _create_lags(
    y: np.ndarray, 
    series_name: str
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Transforms a 1d array into a 2d array (X) and a 1d array (y). Each row
    in X is associated with a value of y and it represents the lags that
    precede it.
    
    Notice that, the returned matrix X_data, contains the lag 1 in the first
    column, the lag 2 in the second column and so on.
    
    Parameters
    ----------
    y : numpy ndarray
        1d numpy ndarray Training time series.
    series_name : str
        Name of the series.

    Returns
    -------
    X_data : numpy ndarray
        2d numpy ndarray with the lagged values (predictors). 
        Shape: (samples - max(self.lags), len(self.lags))
    y_data : numpy ndarray
        1d numpy ndarray with the values of the time series related to each 
        row of `X_data`. 
        Shape: (samples - max(self.lags), )
    
    """
        
    n_splits = len(y) - max_lag
    if n_splits <= 0:
        raise ValueError(
            (f"The maximum lag ({max_lag}) must be less than the length "
                f"of the series '{series_name}', ({len(y)}).")
        )
    
    X_data = np.full(shape=(n_splits, len(lags)), fill_value=np.nan, dtype=float)

    for i, lag in enumerate(lags):
        X_data[:, i] = y[max_lag - lag: -lag]

    y_data = y[max_lag:]
        
    return X_data, y_data


def check_exog_create_train_X_y(series, exog, key=None):
    """
    """
    if len(exog) != len(series):
        if key:
            raise ValueError(
                (f"`exog` must have same number of samples as `series`.\n"
                 f"    length `exog`: {len(exog)} for series {key}\n"
                 f"    length `series`: {len(series)}")
            )
        else:
            raise ValueError(
                (f"`exog` must have same number of samples as `series`. "
                 f"length `exog`: ({len(exog)}), length `series`: ({len(series)})")
            )
    
    check_exog(exog=exog, allow_nan=True)

    if isinstance(exog, pd.Series):
        # Needed for pandas concat
        exog = exog.to_frame()
    
    exog = transform_dataframe(
                df                = exog,
                transformer       = transformer_exog,
                fit               = True,
                inverse_transform = False
            )

    check_exog(exog=exog, allow_nan=False)
    check_exog_dtypes(exog)

    _, _ = preprocess_exog(exog=exog, return_values=False)
    if not (exog.index[:len(series)] == series.index).all():
        if key:
            raise ValueError(
                (f"Different index for `series` and `exog` for series {key}. "
                 f"They must be equal to ensure the correct alignment of values.")
            )
        else:
            raise ValueError(
                ("Different index for `series` and `exog`. They must be equal "
                 "to ensure the correct alignment of values.")
            )
        
    return exog


def create_train_X_y(
    series: pd.DataFrame,
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """
    Create training matrices from multiple time series and exogenous
    variables.
    
    Parameters
    ----------
    series : pandas DataFrame
        Training time series.
    exog : pandas Series, pandas DataFrame, dict, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `series` and their indexes must be aligned.

    Returns
    -------
    X_train : pandas DataFrame
        Training values (predictors).
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
        Shape: (len(series) - self.max_lag, )
    y_index : pandas Index
        Index of `series`.
    y_train_index: pandas Index
        Index of `y_train`.
    
    """

    if not isinstance(series, pd.DataFrame):
        raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}.")

    series_col_names = list(series.columns)

    if transformer_series is None:
        transformer_series_ = {serie: None for serie in series_col_names}
    elif not isinstance(transformer_series, dict):
        transformer_series_ = {serie: clone(transformer_series) 
                               for serie in series_col_names}
    else:
        transformer_series_ = {serie: None for serie in series_col_names}
        # Only elements already present in transformer_series_ are updated
        transformer_series_.update(
            (k, v) for k, v in deepcopy(transformer_series).items() 
            if k in transformer_series_
        )
        series_not_in_transformer_series = set(series.columns) - set(transformer_series.keys())
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                 f" No transformation is applied to these series."),
                 IgnoredArgumentWarning
            )
    
    if exog is not None:

        if isinstance(exog, dict):
            
            exog_ = {serie: None for serie in series_col_names}
            # Only elements already present in exog_ are updated
            exog_.update(
                (k, v) for k, v in deepcopy(exog).items() 
                if k in exog_
            )
            series_not_in_exog = set(series.columns) - set(exog.keys())
            if series_not_in_exog:
                # TODO: review warning
                warnings.warn(
                    (f"{series_not_in_exog} not present in `exog`."
                     f" No transformation is applied to these series."),
                     IgnoredArgumentWarning
                )

            for k, v in exog.items():
                exog[k] = check_exog_create_train_X_y(series=series, exog=v, key=k)
        else:   
            exog = check_exog_create_train_X_y(series=series, exog=exog, key=None)
            
        # TODO: adapt get_exog_dtypes when dict
        # exog_dtypes = get_exog_dtypes(exog=exog)
    
    X_levels = []
    len_series = []
    X_train_col_names = [f"lag_{lag}" for lag in lags]

    for i, serie in enumerate(series.columns):

        y = series[serie]
        y_values = y.to_numpy()

        if np.isnan(y_values).all():
            raise ValueError(f"All values of series '{serie}' are NaN.")
        
        first_no_nan_idx = np.argmax(~np.isnan(y_values))
        y_values = y_values[first_no_nan_idx:]

        if np.isnan(y_values).any():
            raise ValueError(
                (f"'{serie}' Time series has missing values in between or "
                    f"at the end of the time series. When working with series "
                    f"of different lengths, all series must be complete after "
                    f"the first non-null value.")
            )
        
        y = transform_series(
                series            = y.iloc[first_no_nan_idx:],
                transformer       = transformer_series_[serie],
                fit               = True,
                inverse_transform = False
            )

        y_values = y.to_numpy()
        X_train_values, y_train_values = _create_lags(y=y_values, series_name=serie)

        if i == 0:
            X_train = X_train_values
            y_train = y_train_values
        else:
            X_train = np.concatenate((X_train, X_train_values), axis=0)
            y_train = np.concatenate((y_train, y_train_values), axis=0)

        X_level = [serie]*len(X_train_values)
        X_levels.extend(X_level)
        len_series.append(len(y_train_values))
    
    X_levels = pd.Series(X_levels)
    X_levels = pd.get_dummies(X_levels, dtype=float)

    X_train = pd.DataFrame(
                  data    = X_train,
                  columns = X_train_col_names
              )

    if exog is not None:
        # The first `self.max_lag` positions have to be removed from exog
        # since they are not in X_train. Then Exog is cloned as many times 
        # as there are series, taking into account the length of the series.
        if isinstance(exog, dict):
            exog_to_train = []
            for i, key in enumerate(exog):
                exog_to_train.append(exog[key].iloc[-len_series[i]:, ])
        else:
            exog_to_train = [exog.iloc[-length:, ] for length in len_series]

        exog_to_train = pd.concat(exog_to_train).reset_index(drop=True)
    else:
        exog_to_train = None
    
    X_train = pd.concat([X_train, exog_to_train, X_levels], axis=1)
    X_train_col_names = X_train.columns.to_list()

    y_train = pd.Series(
                  data = y_train,
                  name = 'y'
              )

    _, y_index = preprocess_y(y=series, return_values=False)

    y_index_numpy = y_index.to_numpy()
    y_train_index = pd.Index(
                        np.concatenate(
                            [y_index_numpy[-length:, ] 
                             for length in len_series]
                        )
                    )

    return X_train, y_train, y_index, y_train_index

In [13]:
series.head(2)

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [14]:
exog.keys()

dict_keys(['l1', 'l2'])

In [11]:
len_series = [50, 50]

exog_to_train = []
for i, key in enumerate(exog):

    if isinstance(exog[key], pd.Series):
        exog_ = exog[key].copy().to_frame()
    else:
        exog_ = exog[key]
    
    exog_to_train.append(exog[key].iloc[-len_series[i]:, ])


exog_to_train = pd.concat(exog_to_train).reset_index(drop=True)

In [15]:
# Exog for series l1
exog_l1 = pd.DataFrame({
              'exog1': np.arange(50),
              'exog2': np.arange(50, 100),
          })


# Exog for series l2
exog_l2 = pd.Series(np.arange(100, 150), name='exog1')


# Dictionary with exog for each series
exog = {
    'l1': None,
    'l2': exog_l2
}

In [16]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

# Forecaster attributes needed
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = StandardScaler()

print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None StandardScaler()


In [14]:
for k, v in exog.items():
    _ = check_exog_create_train_X_y(series=series, exog=v, key=k)
    print(_.head(2))

      exog1     exog2
0 -1.697749 -1.697749
1 -1.628453 -1.628453
      exog1
0 -1.697749
1 -1.628453


In [17]:
create_train_X_y(series=series, exog=exog)[0]

TypeError: object of type 'NoneType' has no len()

In [16]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [17]:
exog = pd.Series(np.arange(100, 150), name='exog1')

In [18]:
forecaster.create_train_X_y(series=series, exog=exog)[0]

Unnamed: 0,lag_1,lag_2,lag_3,exog1,l1,l2
0,0.226851,0.286139,0.696469,103,1.0,0.0
1,0.551315,0.226851,0.286139,104,1.0,0.0
2,0.719469,0.551315,0.226851,105,1.0,0.0
3,0.423106,0.719469,0.551315,106,1.0,0.0
4,0.980764,0.423106,0.719469,107,1.0,0.0
...,...,...,...,...,...,...
89,0.593177,0.762548,0.355915,145,0.0,1.0
90,0.691702,0.593177,0.762548,146,0.0,1.0
91,0.151127,0.691702,0.593177,147,0.0,1.0
92,0.398876,0.151127,0.691702,148,0.0,1.0


## Option 2: Parallelitation

In [6]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [19]:
# Forecaster attributes needed
max_lag = forecaster.max_lag
lags = forecaster.lags
exog_col_names = ['exog1', 'exog2']
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog

print(max_lag, lags, exog_col_names, transformer_series, transformer_exog)

3 [1 2 3] ['exog1', 'exog2'] None None


In [136]:
def _create_lags(
    y: np.ndarray, 
    series_name: str
) -> Tuple[np.ndarray, np.ndarray]:
    """    
    """
        
    n_splits = len(y) - max_lag
    if n_splits <= 0:
        raise ValueError(
            (f"The maximum lag ({max_lag}) must be less than the length "
                f"of the series '{series_name}', ({len(y)}).")
        )
    
    X_data = np.full(shape=(n_splits, len(lags)), fill_value=np.nan, dtype=float)

    for i, lag in enumerate(lags):
        X_data[:, i] = y[max_lag - lag: -lag]

    y_data = y[max_lag:]
        
    return X_data, y_data


def create_train_X_y(
    series: pd.DataFrame,
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """    
    """

    if not isinstance(series, pd.DataFrame):
        raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}.")

    series_col_names = list(series.columns)

    if transformer_series is None:
        transformer_series_ = {serie: None for serie in series_col_names}
    elif not isinstance(transformer_series, dict):
        transformer_series_ = {serie: clone(transformer_series) 
                               for serie in series_col_names}
    else:
        transformer_series_ = {serie: None for serie in series_col_names}
        # Only elements already present in transformer_series_ are updated
        transformer_series_.update(
            (k, v) for k, v in deepcopy(transformer_series).items() 
            if k in transformer_series_
        )
        series_not_in_transformer_series = set(series.columns) - set(transformer_series.keys())
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                    f" No transformation is applied to these series."),
                    IgnoredArgumentWarning
            )

    if exog is not None:
        if not isinstance(exog, dict):
            exog_dict = {serie: exog for serie in series_col_names}
        else:
            exog_dict = {serie: None for serie in series_col_names}
            # Only elements already present in exog_dict are updated
            exog_dict.update(
                (k, v) for k, v in exog.items() 
                if k in exog_dict
            )
            series_not_in_exog = set(series.columns) - set(exog.keys())
            if series_not_in_exog:
                # TODO: review warning
                warnings.warn(
                    (f"{series_not_in_exog} not present in `exog`. All values "
                     f"of the exogenous variables for these series will be NaN."),
                     MissingValuesExogWarning
                )

        for k, v in exog_dict.items():
            if v is not None:
                if len(v) != len(series):
                    # TODO: when series of different length, exog with nans at the begining
                    raise ValueError(
                        (f"`exog` must have same number of samples as `series`.\n"
                         f"    length `exog`: {len(v)} for series {k}\n"
                         f"    length `series`: {len(series)}")
                    )
                check_exog(exog=v, allow_nan=True)

                # TODO: review this ValueError
                _, _ = preprocess_exog(exog=v, return_values=False)
                if not (v.index[:len(series)] == series.index).all():
                    raise ValueError(
                        ("Different index for `series` and `exog`. They must be equal "
                         "to ensure the correct alignment of values.")
                    )
    
    X_levels = []
    exog_levels = []
    len_series = []
    X_train_col_names = [f"lag_{lag}" for lag in lags]

    for i, serie in enumerate(series.columns):

        y = series[serie]
        y_index_no_nan = y.dropna().index

        # if np.isnan(y_values).all():
        if len(y_index_no_nan) == 0:
            raise ValueError(f"All values of series '{serie}' are NaN.")
        
        # first_no_nan_idx = np.argmax(~np.isnan(y_values))
        # y_values = y_values[first_no_nan_idx:]

        # if np.isnan(y_values).any():
        #     raise ValueError(
        #         (f"'{serie}' Time series has missing values in between or "
        #             f"at the end of the time series. When working with series "
        #             f"of different lengths, all series must be complete after "
        #             f"the first non-null value.")
        #     )
        
        y = transform_series(
                series            = y,
                transformer       = transformer_series_[serie],
                fit               = True,
                inverse_transform = False
            )

        y_values = y.to_numpy()
        X_train_values, y_train_values = _create_lags(y=y_values, series_name=serie)

        delete_from_X_train = False
        if delete_from_X_train:
            rows_with_nan = np.isnan(X_train_values).any(axis=1)
        else:
            rows_with_nan = np.isnan(y_train_values)
        
        X_train_values = X_train_values[~rows_with_nan]
        y_train_values = y_train_values[~rows_with_nan]

        if i == 0:
            X_train = X_train_values
            y_train = y_train_values
        else:
            X_train = np.concatenate((X_train, X_train_values), axis=0)
            y_train = np.concatenate((y_train, y_train_values), axis=0)

        X_level = [serie]*len(X_train_values)
        X_levels.extend(X_level)
        # len_series.append(rows_with_nan)

        if exog is not None:
            if exog_dict[serie] is not None:
                exog_level = exog_dict[serie].iloc[max_lag:, ].iloc[~rows_with_nan]
                # TODO: poner check que index_exog contenga todos los valores de y_index_no_nan
                #if not (exog_level.index == y_index_no_nan).all():
                #    raise ValueError(
                #        ("Different index for `series` and `exog`. They must be equal "
                #         "to ensure the correct alignment of values.")
                #    )
                if isinstance(exog_level, pd.Series):
                    # Needed for pandas concat
                    exog_level = exog_level.to_frame()
            else:
                exog_level = pd.DataFrame(
                                 np.nan, 
                                 index=np.arange(len(X_train_values)), 
                                 columns=exog_col_names
                             )
            exog_levels.append(exog_level)

    X_levels = pd.Series(X_levels)
    X_levels = pd.get_dummies(X_levels, dtype=float)

    X_train = pd.DataFrame(
                  data    = X_train,
                  columns = X_train_col_names
              )

    if exog is not None:
        exog_train = pd.concat(exog_levels).reset_index(drop=True)
        exog_train = transform_dataframe(
                         df                = exog_train,
                         transformer       = transformer_exog,
                         fit               = True,
                         inverse_transform = False
                     )
        
        # check_exog_dtypes llama a check_exog
        # check_exog(exog=exog_train, allow_nan=False)
        check_exog_dtypes(exog_train)
        exog_dtypes = get_exog_dtypes(exog=exog_train)
    else:
        exog_train = None
    
    X_train = pd.concat([X_train, exog_train, X_levels], axis=1)
    X_train_col_names = X_train.columns.to_list()

    y_train = pd.Series(
                  data = y_train,
                  name = 'y'
              )

    _, y_index = preprocess_y(y=series, return_values=False)

    # y_index_numpy = y_index.to_numpy()
    # y_train_index = pd.Index(
    #                     np.concatenate(
    #                         [y_index_numpy[~np.isin(range(len(y_index_numpy)), rows_with_nan)]
    #                          for rows_with_nan in len_series]
    #                     )
    #                 )

    y_train_index = []
    for serie in series.columns:
        serie_index = series[serie].dropna().index[max_lag:, ]
        y_train_index.append(serie_index)

    y_train_index = pd.Index(np.concatenate(y_train_index))

    return X_train, y_train, y_index, y_train_index

In [75]:
series_2 = series.head(10).copy()
series_2.head(2)

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [104]:
series_3 = series_2.copy()
series_3.iloc[:3, 1] = np.nan
series_3

Unnamed: 0,l1,l2
0,0.696469,
1,0.286139,
2,0.226851,
3,0.551315,0.545068
4,0.719469,0.342764
5,0.423106,0.304121
6,0.980764,0.417022
7,0.68483,0.681301
8,0.480932,0.875457
9,0.392118,0.510422


In [105]:
# Exog for series l1
exog_l1 = pd.DataFrame({
              'exog1': np.arange(10),
              'exog2': np.arange(50, 60),
          })

# Exog for series l2
exog_l2 = pd.Series(np.arange(100, 110), name='exog1').to_frame()

# Dictionary with exog for each series
exog = {
    'l1': exog_l1,
    'l2': exog_l2
}

In [106]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

# Forecaster attributes needed
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog
# transformer_exog = StandardScaler()

print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None None


In [117]:
series_create = series_3
series_create

Unnamed: 0,l1,l2
0,0.696469,
1,0.286139,
2,0.226851,
3,0.551315,0.545068
4,0.719469,0.342764
5,0.423106,0.304121
6,0.980764,0.417022
7,0.68483,0.681301
8,0.480932,0.875457
9,0.392118,0.510422


In [137]:
create_train_X_y(series=series_create, exog=exog)[0]

[0.55131477 0.71946897 0.42310646 0.9807642  0.68482974 0.4809319
 0.39211752]
(7,)
[0.54506801 0.34276383 0.30412079 0.41702221 0.68130077 0.87545684
 0.51042234]
(7,)




Unnamed: 0,lag_1,lag_2,lag_3,exog1,exog2,l1,l2
0,0.226851,0.286139,0.696469,3,53.0,1.0,0.0
1,0.551315,0.226851,0.286139,4,54.0,1.0,0.0
2,0.719469,0.551315,0.226851,5,55.0,1.0,0.0
3,0.423106,0.719469,0.551315,6,56.0,1.0,0.0
4,0.980764,0.423106,0.719469,7,57.0,1.0,0.0
5,0.68483,0.980764,0.423106,8,58.0,1.0,0.0
6,0.480932,0.68483,0.980764,9,59.0,1.0,0.0
7,,,,103,,0.0,1.0
8,0.545068,,,104,,0.0,1.0
9,0.342764,0.545068,,105,,0.0,1.0


In [138]:
create_train_X_y(series=series_create, exog=exog)[1]

[0.55131477 0.71946897 0.42310646 0.9807642  0.68482974 0.4809319
 0.39211752]
(7,)
[0.54506801 0.34276383 0.30412079 0.41702221 0.68130077 0.87545684
 0.51042234]
(7,)




0     0.551315
1     0.719469
2     0.423106
3     0.980764
4     0.684830
5     0.480932
6     0.392118
7     0.545068
8     0.342764
9     0.304121
10    0.417022
11    0.681301
12    0.875457
13    0.510422
Name: y, dtype: float64

In [121]:
create_train_X_y(series=series_create, exog=exog)[2]



RangeIndex(start=0, stop=10, step=1)

In [119]:
create_train_X_y(series=series_create, exog=exog)[3]



Index([3, 4, 5, 6, 7, 8, 9, 6, 7, 8, 9], dtype='int64')

In [55]:
X_train_rows_with_nan = np.array([])

# exog['l1'].iloc[~X_train_rows_with_nan, ]
exog['l1'][~exog['l1'].index.isin(X_train_rows_with_nan)]

Unnamed: 0,exog1,exog2
0,0,50
1,1,51
2,2,52
3,3,53
4,4,54
5,5,55
6,6,56
7,7,57
8,8,58
9,9,59


In [101]:
series_3 = pd.DataFrame({
    'l1': np.arange(10),
    'l2': np.arange(50, 60)
})

# series_3.iloc[:5, 1] = np.nan

In [102]:
create_train_X_y(series=series_3, exog=exog)[0]

Unnamed: 0,lag_1,lag_2,lag_3,exog1,exog2,l1,l2
0,2.0,1.0,0.0,-1.5,-1.5,1.0,0.0
1,3.0,2.0,1.0,-1.0,-1.0,1.0,0.0
2,4.0,3.0,2.0,-0.5,-0.5,1.0,0.0
3,5.0,4.0,3.0,0.0,0.0,1.0,0.0
4,6.0,5.0,4.0,0.5,0.5,1.0,0.0
5,7.0,6.0,5.0,1.0,1.0,1.0,0.0
6,8.0,7.0,6.0,1.5,1.5,1.0,0.0
7,52.0,51.0,50.0,-1.5,-1.5,0.0,1.0
8,53.0,52.0,51.0,-1.0,-1.0,0.0,1.0
9,54.0,53.0,52.0,-0.5,-0.5,0.0,1.0


In [58]:
exog

{'l1':    exog1  exog2
 0      0     50
 1      1     51
 2      2     52
 3      3     53
 4      4     54
 5      5     55
 6      6     56
 7      7     57
 8      8     58
 9      9     59,
 'l2':    exog1
 0    100
 1    101
 2    102
 3    103
 4    104
 5    105
 6    106
 7    107
 8    108
 9    109}

In [93]:
serie = pd.Series(np.arange(10000))
display(serie.head(4))
serie.iloc[3] = np.nan
display(serie.head(4))

array = serie.to_numpy()

0    0
1    1
2    2
3    3
dtype: int32

0    0.0
1    1.0
2    2.0
3    NaN
dtype: float64

In [103]:
%%timeit -n 10000 -r 4

# Finding the positions of non-NaN values in the Series
non_nan_positions_mostly_nan = np.argwhere(~np.isnan(array)).flatten()

# Filtering the Series using iloc and the positions of non-NaN values
filtered_series_non_nan_mostly_nan = serie.iloc[non_nan_positions_mostly_nan]
filtered_series_non_nan_mostly_nan


128 µs ± 6.83 µs per loop (mean ± std. dev. of 4 runs, 10,000 loops each)


In [104]:
%%timeit -n 10000 -r 4

serie.dropna()

61.1 µs ± 1.71 µs per loop (mean ± std. dev. of 4 runs, 10,000 loops each)


In [102]:
%%timeit -n 10000 -r 4

serie[~serie.isna()]

129 µs ± 6.53 µs per loop (mean ± std. dev. of 4 runs, 10,000 loops each)


In [114]:
serie = pd.Series([np.nan]*10)
display(serie.head(4))
serie = serie.dropna().index
len(serie)

0   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

0

In [113]:
display(series.head(4))
serie = series['l1']
serie = serie.dropna().index
display(serie)
display(series.head(4))

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341
2,0.226851,0.60306
3,,0.545068


Index([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
      dtype='int64')

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341
2,0.226851,0.60306
3,,0.545068


In [73]:
l = [[1,2,3,4,5], [1,2,3,4,5]]

pd.Index(np.concatenate(l))

Index([1, 2, 3, 4, 5, 1, 2, 3, 4, 5], dtype='int32')

In [111]:
id(series)

1385573382160

In [140]:
array1_5x6 = np.array([[1, np.nan, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14, np.nan, 16, 17, 18], [19, 20, 21, np.nan, 23, 24], [25, 26, 27, 28, 29, 30]])

np.isnan(array1_5x6).any(axis=1)

array([ True, False,  True,  True, False])

In [141]:
array2_1d_5 = np.array([101, 102, 103, 104, 105])
np.isnan(array2_1d_5)

array([False, False, False, False, False])

In [70]:
# Creating a new example where the first array is 5x6 and the second array is 1D with length 5
array1_5x6 = np.array([[1, np.nan, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14, np.nan, 16, 17, 18], [19, 20, 21, np.nan, 23, 24], [25, 26, 27, 28, 29, 30]])
array2_1d_5 = np.array([101, 102, 103, 104, 105])

print(array1_5x6)
print(array2_1d_5)

# Find rows in the first array that contain NaN
rows_with_nan = np.unique(np.argwhere(np.isnan(array1_5x6))[:, 0])
# rows_with_nan = np.isnan(array1_5x6).any(axis=1)

print(rows_with_nan)

# Deleting the corresponding rows in the 1D array
# array2_1d_5_filtered = np.delete(array2_1d_5, rows_with_nan)
array2_1d_5_filtered = array2_1d_5[~rows_with_nan]
array2_1d_5_filtered


[[ 1. nan  3.  4.  5.  6.]
 [ 7.  8.  9. 10. 11. 12.]
 [13. 14. nan 16. 17. 18.]
 [19. 20. 21. nan 23. 24.]
 [25. 26. 27. 28. 29. 30.]]
[101 102 103 104 105]
[0 2 3]


array([105, 103, 102])

In [69]:
array = np.array([True, False, True, False, True, False, True, False, True, False])

series_2.iloc[array]

Unnamed: 0,l1,l2
0,0.696469,0.120629
2,0.226851,0.60306
4,0.719469,0.342764
6,0.980764,0.417022
8,0.480932,0.875457


In [119]:
np.argwhere(np.isnan(array1_5x6))

array([[0, 1],
       [2, 2],
       [3, 3]], dtype=int64)

In [118]:
np.argwhere(np.isnan(array1_5x6))[:, 0]

array([0, 2, 3], dtype=int64)

In [121]:
rows_with_nan

array([0, 2, 3], dtype=int64)

In [131]:
# Creating a 1D array for demonstration
array_1d_example = np.array([10, 20, 30, 40, 50])

# Filtering the 1D array to keep only positions that are not in `rows_with_nan`
filtered_array_1d = array_1d_example[~np.isin(range(len(array_1d_example)), rows_with_nan)]
filtered_array_1d


array([20, 50])

In [122]:
array = np.arange(100000)

In [130]:
%%timeit -n 10000 -r 10

array.shape[0]

93.9 ns ± 25.3 ns per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [129]:
%%timeit -n 10000 -r 10

len(array)

38.1 ns ± 0.803 ns per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [9]:
pd.DataFrame(np.full(shape=(len([1,2])), fill_value=np.nan, dtype=float))

Unnamed: 0,0
0,
1,
