# Profiling main skforecast classes and methods

In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
import platform
import psutil
import skforecast
import pandas as pd
import numpy as np
import scipy
import sklearn


import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.utils import *

%load_ext pyinstrument
%load_ext line_profiler

# Information system and libraries

In [3]:
# Versions
# ==============================================================================
print(f"Python version: {platform.python_version()}")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"skforecast version: {skforecast.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"psutil version: {psutil.__version__}")
print("")

# Computer information
# ==============================================================================
#Computer network name
print(f"Computer network name: {platform.node()}")
#Machine type
print(f"Machine type: {platform.machine()}")
#Processor type
print(f"Processor type: {platform.processor()}")
#Platform type
print(f"Platform type: {platform.platform()}")
#Operating system
print(f"Operating system: {platform.system()}")
#Operating system release
print(f"Operating system release: {platform.release()}")
#Operating system version
print(f"Operating system version: {platform.version()}")
#Physical cores
print(f"Number of physical cores: {psutil.cpu_count(logical=False)}")
#Logical cores
print(f"Number of logical cores: {psutil.cpu_count(logical=True)}")

Python version: 3.11.9
scikit-learn version: 1.3.0
skforecast version: 0.13.0
pandas version: 2.2.2
numpy version: 1.26.4
scipy version: 1.13.1
psutil version: 5.9.0

Computer network name: ip-10-2-1-218
Machine type: x86_64
Processor type: x86_64
Platform type: Linux-5.15.0-1063-aws-x86_64-with-glibc2.31
Operating system: Linux
Operating system release: 5.15.0-1063-aws
Operating system version: #69~20.04.1-Ubuntu SMP Fri May 10 19:20:12 UTC 2024
Number of physical cores: 4
Number of logical cores: 8


# Data

In [4]:
# Data
# ==============================================================================
len_series = 1_000
n_series = 1_000
n_exog = 300
rgn = np.random.default_rng(seed=123)
y = pd.Series(
    rgn.random(size=(len_series)),
    name="y",
    index=pd.date_range(start="2000-01-01", periods=len_series, freq="h"),
)
exog = pd.DataFrame(rgn.random(size=(len_series, n_exog)), index=y.index)
exog.columns = [f"exog_{i}" for i in range(exog.shape[1])]
series = pd.DataFrame(rgn.random(size=(len_series, n_series)), index=y.index)
series.columns = [f"series_{i+1}" for i in range(series.shape[1])]
exog_test = exog.copy()
exog_test.index = exog.index + pd.DateOffset(hours=len_series)
print(f"Shape of y: {y.shape}")
print(f"Shape of exog: {exog.shape}")
print(f"Shape of series: {series.shape}")
print(f"Shape of exog_test: {exog_test.shape}")

Shape of y: (1000,)
Shape of exog: (1000, 300)
Shape of series: (1000, 1000)
Shape of exog_test: (1000, 300)


# ForecasterAutoregMultiSeries

In [5]:
forecaster = ForecasterAutoregMultiSeries(
                 regressor = Ridge(),
                 lags      = 24,
                 encoding  = 'ordinal',
                 transformer_exog=None,
                 transformer_series=None
             )



In [6]:
%%pyinstrument

forecaster.fit(series=series, exog=exog)

In [7]:
def prof_function(forecaster):
    forecaster.fit(series=series, exog=exog)

%prun prof_function(forecaster)

 

         3569293 function calls (3509962 primitive calls) in 14.515 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    2.882    0.961    2.882    0.961 extmath.py:160(safe_sparse_dot)
        3    1.219    0.406    1.220    0.407 managers.py:1707(_interleave)
     1006    0.945    0.001    1.040    0.001 numeric.py:2378(array_equal)
     8035    0.777    0.000    0.777    0.000 {method 'reduce' of 'numpy.ufunc' objects}
     1009    0.743    0.001    0.743    0.001 {method 'copy' of 'numpy.ndarray' objects}
        1    0.729    0.729    0.730    0.730 concat.py:249(_concat_homogeneous_fastpath)
     6022    0.679    0.000    0.679    0.000 {built-in method numpy.array}
1002/1001    0.647    0.001    0.648    0.001 {built-in method _operator.eq}
        1    0.283    0.283    1.195    1.195 _base.py:189(_preprocess_data)
     2005    0.233    0.000    0.237    0.000 missing.py:261(_isna_array)
717364/710279    0.2

In [8]:
series_indexes = {series[col].name: series[col].index for col in series.columns}
exog_dict = exog.to_dict(orient='series')

def funt_to_profile(forecaster, series, exog):
    forecaster._create_train_X_y(series=series, exog=exog)

%lprun -f forecaster._create_train_X_y funt_to_profile(forecaster, series, exog)

Timer unit: 1e-09 s

Total time: 5.4734 s
File: /home/ubuntu/varios/skforecast/skforecast/ForecasterAutoregMultiSeries/ForecasterAutoregMultiSeries.py
Function: _create_train_X_y at line 595

Line #      Hits         Time  Per Hit   % Time  Line Contents
   595                                               def _create_train_X_y(
   596                                                   self,
   597                                                   series: Union[pd.DataFrame, dict],
   598                                                   exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None,
   599                                                   store_last_window: Union[bool, list]=True,
   600                                               ) -> Tuple[pd.DataFrame, pd.Series, dict, list, list, list, dict, dict]:
   601                                                   """
   602                                                   Create training matrices from multiple time series and

In [9]:
import pandas as pd
import numpy as np
import time


# Original code encapsulated in a function
def check_dtypes_original(exog_dict, exog_col_names):
    exog_dtype_dict = {col_name: set() 
                       for col_name in exog_col_names}
    for v in exog_dict.values():
        if v is not None:
            for col_name in v.columns:
                exog_dtype_dict[col_name].add(v[col_name].dtype.name)

    for col_name, dtypes in exog_dtype_dict.items():
        if len(dtypes) > 1:
            raise TypeError(
                (f"Column '{col_name}' has different dtypes in different exog "
                 f"DataFrames or Series.")
            )

# Optimized code encapsulated in a function
def check_dtypes_optimized_1(exog_dict):
    first_dtype_dict = {}
    for v in exog_dict.values():
        if v is not None:
            for col_name in v.columns:
                current_dtype = v[col_name].dtype.name
                if col_name not in first_dtype_dict:
                    first_dtype_dict[col_name] = current_dtype
                elif first_dtype_dict[col_name] != current_dtype:
                    raise TypeError(f"Column '{col_name}' has different dtypes in different exog DataFrames or Series.")
                

def check_dtypes_optimized_2(exog_dict):
    exog_dtype_dict = {}
    for df in exog_dict.values():
        if df is not None:
            for col_name, dtype in df.dtypes.items():
                if col_name not in exog_dtype_dict:
                    exog_dtype_dict[col_name] = set()
                exog_dtype_dict[col_name].add(dtype.name)
                if len(exog_dtype_dict[col_name]) > 1:
                    raise TypeError(f"Column '{col_name}' has different dtypes in different exog DataFrames or Series.")



def check_dtypes_optimized_3(exog_dict):
    exog_dtypes_buffer = []
    for df in exog_dict.values():
        if df is not None:
            exog_dtypes_buffer.append(df.dtypes)
    exog_dtypes_buffer = pd.concat(exog_dtypes_buffer, axis=1)
    if not (exog_dtypes_buffer.nunique(axis=1, dropna=False) == 1).all():
        raise TypeError("Some columns have different dtypes in different exog DataFrames or Series.")
    

def check_dtypes_optimized_4(exog_dict):
    exog_dtypes_buffer = []
    for df in exog_dict.values():
        if df is not None:
            exog_dtypes_buffer.append(df.dtypes)
    exog_dtypes_buffer = np.concatenate(exog_dtypes_buffer)
    if not np.all(exog_dtypes_buffer == exog_dtypes_buffer[0]):
        raise TypeError("Some columns have different dtypes in different exog DataFrames or Series.")


# No error
# ------------------------------------------------------------------------------
n_exog = 30
n_series = 1000
df = pd.DataFrame(np.random.rand(1000, n_exog), columns=[f'col_{i}' for i in range(n_exog)])
exog_dict = {f"df_{i}": df.copy() for i in range(n_series)}
exog_col_names = df.columns

start_time = time.time()
try:
    check_dtypes_original(exog_dict, exog_col_names)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Original code execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_1(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 1 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_2(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 2 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_3(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 3 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_4(exog_dict)
except TypeError as e:  
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 4 execution time: {end_time - start_time} seconds")
print("")


# With error
# ------------------------------------------------------------------------------
n_exog = 30
n_series = 1000
df = pd.DataFrame(np.random.rand(1000, n_exog), columns=[f'col_{i}' for i in range(n_exog)])
exog_dict = {f"df_{i}": df.copy() for i in range(n_series)}
exog_dict["df_100"]['col_13'] = exog_dict["df_100"]['col_13'].astype('str')
exog_col_names = df.columns

start_time = time.time()
try:
    check_dtypes_original(exog_dict, exog_col_names)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Original code execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_1(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 1 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_2(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 2 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_3(exog_dict)
except TypeError as e:
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 3 execution time: {end_time - start_time} seconds")

start_time = time.time()
try:
    check_dtypes_optimized_4(exog_dict)
except TypeError as e:  
    end_time = time.time()
    print(e)
finally:
    end_time = time.time()
print(f"Optimized code 4 execution time: {end_time - start_time} seconds")

Original code execution time: 1.572906732559204 seconds
Optimized code 1 execution time: 0.3028547763824463 seconds
Optimized code 2 execution time: 0.21156692504882812 seconds
Optimized code 3 execution time: 0.10552453994750977 seconds
Optimized code 4 execution time: 0.0680389404296875 seconds

Column 'col_13' has different dtypes in different exog DataFrames or Series.
Original code execution time: 1.5326159000396729 seconds
Column 'col_13' has different dtypes in different exog DataFrames or Series.
Optimized code 1 execution time: 0.030980348587036133 seconds
Column 'col_13' has different dtypes in different exog DataFrames or Series.
Optimized code 2 execution time: 0.02108454704284668 seconds
Some columns have different dtypes in different exog DataFrames or Series.
Optimized code 3 execution time: 0.10317015647888184 seconds
Some columns have different dtypes in different exog DataFrames or Series.
Optimized code 4 execution time: 0.0688016414642334 seconds
