In [391]:
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sktime.transformations.series.difference import Differencer
from sktime.forecasting.compose import make_reduction, TransformedTargetForecaster
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.model_selection import ExpandingWindowSplitter

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('../data/car_parts_final.csv', parse_dates = ['Date'])
df.set_index(['Series', 'Date'], inplace = True)

In [392]:
preds   = pd.read_csv('../results/linear_regression_differenced_20230826-182504/ensemble_preds.csv')
results = pd.read_csv('../results/linear_regression_differenced_20230826-182504/metric_results.csv')

In [393]:
df['Values'] = df.groupby(level = 0)['Values'].diff()

In [394]:
lags = range(1, min(10, int(np.ceil(51 / 10))))

In [395]:
for lag in lags:
    df[f'lag_{lag}'] = df.groupby(level = 0)['Values'].shift(lag)

In [396]:
mod1 = TransformedTargetForecaster([
    Differencer(lags = 1, na_handling = 'drop_na') *
    make_reduction(LinearRegression(), window_length = 1, strategy = 'recursive')
])

mod2 = make_reduction(LinearRegression(), window_length = 1, strategy = 'recursive')
mod3 = LinearRegression()

In [397]:
# sample data to use
fake             = pd.DataFrame(np.random.normal(0, 0.25, size = 25), columns = ['Vals'])
fake['prev']     = fake['Vals'].shift()
fake['diff']     = fake['Vals'].diff()
fake['prevdiff'] = fake['diff'].shift()
fake.index       = pd.date_range(start = '2000-01-01', periods = 25)

In [398]:
# linear regression fitted on previous values
print(LinearRegression().fit(fake[['prev']][1:], fake['Vals'][1:]).coef_)
print(LinearRegression().fit(fake[['prev']][1:], fake['Vals'][1:]).intercept_)

[-0.22407836]
-0.033275071520778896


In [399]:
# compare to make_reduction -- it works
mod2.fit(fake['Vals'])
mod2.get_fitted_params()

{'estimator': LinearRegression(),
 'transformers': None,
 'window_length': 1,
 'estimator__coef': array([-0.22407836]),
 'estimator__intercept': -0.033275071520778896,
 'estimator__n_features_in': 1,
 'estimator__rank': 1,
 'estimator__singular': array([0.9969091])}

In [400]:
# now compare to model pipeline w/ differencer and see results
mod1.fit(fake['Vals'])
mod1.get_fitted_params()

{'forecaster': TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                    RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                         window_length=1)]),
 'steps': [('TransformedTargetForecaster',
   TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                      RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                           window_length=1)]))],
 'transformers_post': [],
 'transformers_pre': [],
 'TransformedTargetForecaster': TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                    RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                         window_length=1)]),
 'TransformedTargetForecaster__forecaster': Recur

In [401]:
# coefficients for model fit w/ Linear Regression -- And it works
print(mod3.fit(fake.dropna()[['prevdiff']], fake.dropna()['diff']).coef_)
print(mod3.fit(fake.dropna()[['prevdiff']], fake.dropna()['diff']).intercept_)

[-0.66826683]
-0.034432605285480575


In [402]:
# now make a prediction -- is this the problem?
mod1.predict(fh = [1, 2, 3, 4, 5, 6, 7, 8])

2000-01-26   -0.266702
2000-01-27   -0.218325
2000-01-28   -0.285086
2000-01-29   -0.274904
2000-01-30   -0.316141
2000-01-31   -0.323017
2000-02-01   -0.352854
2000-02-02   -0.367347
Freq: D, Name: Vals, dtype: float64

In [403]:
# work with larger values -- begin to coincide
# now compare to model pipeline w/ differencer and see results
mod1.fit(fake['Vals'][:21])
mod1.get_fitted_params()

{'forecaster': TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                    RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                         window_length=1)]),
 'steps': [('TransformedTargetForecaster',
   TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                      RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                           window_length=1)]))],
 'transformers_post': [],
 'transformers_pre': [],
 'TransformedTargetForecaster': TransformedTargetForecaster(steps=[Differencer(na_handling='drop_na'),
                                    RecursiveTabularRegressionForecaster(estimator=LinearRegression(),
                                                                         window_length=1)]),
 'TransformedTargetForecaster__forecaster': Recur

In [404]:
# linear regression model
# coefficients for model fit w/ Linear Regression -- And it works
print(mod3.fit(fake.dropna()[['prevdiff']][:19], fake.dropna()['diff'][:19]).coef_)
print(mod3.fit(fake.dropna()[['prevdiff']][:19], fake.dropna()['diff'][:19]).intercept_)

[-0.72990323]
-0.02154210268340424


In [406]:
# prediction for next couple of values for mod1 -- DON'T MATCH!!
mod1.predict(fh = [1])

2000-01-22    0.012265
Freq: D, Name: Vals, dtype: float64

In [407]:
# actual value for 2000 - 01 - 21
mod3.predict(fake[['prevdiff']].iloc[19:22])

array([ 0.0207449 , -0.12659116, -0.02266725])

In [408]:
# problem is that out of sample forecasted values don't match, presumably because 
# it's using last value in the known time series?  not ideal for our usecases
# maybe better to spin up our own functions -- and GET READY TO FINISH!
# will look at evaluate function for now
# sample evaluate result set
splitter = ExpandingWindowSplitter(step_length=1, fh=[1], initial_window=20)
results  = evaluate(mod1, 
                    y = fake['Vals'], 
                    cv = splitter, 
                    strategy = 'refit', 
                    return_data = True, 
                    error_score = 'raise')

In [409]:
print(mod3.fit(fake.dropna()[['prevdiff']], fake.dropna()['diff']).coef_)
print(mod3.fit(fake.dropna()[['prevdiff']], fake.dropna()['diff']).intercept_)

[-0.66826683]
-0.034432605285480575


In [410]:
mod3.fit(fake.dropna()[['prevdiff']][:18], fake.dropna()[['diff']][:18]) 

In [411]:
mod3.predict(fake.dropna()[['prevdiff']].iloc[18].to_frame())

array([[-0.13513173]])

In [412]:
# sample evaluate result set
splitter = ExpandingWindowSplitter(step_length=1, fh=[1], initial_window=20)
results  = evaluate(mod1, 
                    y = fake['Vals'], 
                    cv = splitter, 
                    strategy = 'refit', 
                    return_data = True, 
                    error_score = 'raise')

In [413]:
X = sample[['PrevValue']].dropna()
y = sample['Values'][1:]

In [414]:
sample['Diff']     = sample['Values'].diff()
sample['PrevDiff'] = sample['Diff'].shift()

In [469]:
# for now:  write function to fit a model on a series, and optionally fit it to a hierarchical time series
def fit_model_on_data(model, y: pd.Series, X = None, start_idx: int = 25, model_args: dict = {}) -> pd.DataFrame:
    """Function to fit model on a singe time series of data & increment it w/ a walk forward validation step"""
    
    final_results = []
    
    m = y.shape[0]
        
    for s in range(start_idx, m):

        y_temp = y.iloc[:s]
            
        if X is not None:
            X_temp = X.iloc[:s]
            model.fit(X_temp, y_temp, **model_args)
            
        else:
            model.fit(y, **model_args)
            
        y_true = y.iloc[s]
        
        if X is not None:
            y_pred = model.predict(X.iloc[s].to_frame())[0]
        
        else:
            # TO DO:  Add in appropriate one step ahead forecast for next prediction
            pass
            
        final_results.append({
            'y_pred': y_pred,
            'Date': X.iloc[s].name
        })
        
    return pd.DataFrame(final_results)

In [461]:
X = sample.dropna()[['PrevDiff']]
y = sample.dropna()['Diff']

In [470]:
fit_model_on_data(LinearRegression(), X = sample.dropna()[['PrevDiff']], y =  sample.dropna()['Diff'])

Unnamed: 0,y_pred,Date
0,0.0,2000-04-01
1,0.0,2000-05-01
2,0.0,2000-06-01
3,0.0,2000-07-01
4,0.0,2000-08-01
5,-0.466667,2000-09-01
6,0.695652,2000-10-01
7,0.0,2000-11-01
8,0.0,2000-12-01
9,0.0,2001-01-01


In [490]:
y_fake = pd.DataFrame(np.random.normal(0, 1, size = (100)))
y_fake.index = pd.date_range(start = '2000-01-01', periods = 100)

from sktime.forecasting.base import ForecastingHorizon

fh = ForecastingHorizon([1, 2])

model = TransformedTargetForecaster([
    Differencer(lags = 1) *
    make_reduction(LinearRegression(), window_length = 1, strategy = 'recursive')
])

model.fit(y_fake, fh = fh)

In [492]:
?model.predict

In [493]:
model.predict(X = pd.Series([.23, -.18]))

AttributeError: 'NoneType' object has no attribute 'T'

### Model Fitting

In [453]:
def build_lookback_preds_for_series(model, 
                         y: pd.Series, 
                         X = None, 
                         start_idx: int = 25,
                         model_args: dict = {}): 
    """Build lookback predictions for Data"""
    
    max_window = min(10, int(np.ceil(len(y) / 10)))
    
    # check that index is a time stamp
    if not np.issubdtype(sample.index.dtype, np.datetime64):
        raise ValueError("Please make sure index column is a datetime")
        
    if X is not None and len(y) != len(X):
        raise ValueError("X & y are not the same shape")
    
    
    results           = pd.DataFrame()
    results['y_true'] = y
    results['Date']   = X.index
    
    for i in range(1, max_window):
        print(i)
        
        lag_cols = [col for col in X.columns if 
                    int(col.split('_')[1]) <= i and col != 'Values']
                        
        X_temp   = X[lag_cols]
        
        # to remove null values from training data
        idx = X_temp.notnull().all(axis = 1).values    

        
        preds = fit_model_on_data(model, 
                                     y = y.loc[idx], 
                                     X = X_temp.loc[idx], 
                                     start_idx  = start_idx,
                                     model_args = model_args)
        
        print(preds.head())
        results[f'y_pred_lag_{i}'] = preds['y_pred']
    
    return results

In [454]:
build_lookback_preds_for_series(LinearRegression(), X = df.loc['T1'].iloc[:, 1:], y = df.loc['T1']['Values'])

1
   y_pred       Date
0     0.0 2000-04-01
1     0.0 2000-05-01
2     0.0 2000-06-01
3     0.0 2000-07-01
4     0.0 2000-08-01
2


ValueError: X has 1 features, but LinearRegression is expecting 2 features as input.