In [1]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler,FunctionTransformer,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

In [2]:
# Download OpenML dataset 43927 as pandas objects: features in Xraw, target in yraw.
Xraw, yraw = fetch_openml(
    data_id=43927,
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Join the year/month/day columns into "year-month-day" strings and parse
# them into a single datetime column.
Xraw['DATE'] = pd.to_datetime(
    Xraw['year'].astype(str).str.cat(
        [Xraw['month'].astype(str), Xraw['day'].astype(str)],
        sep="-",
    )
)

In [4]:
# Features and target side by side in one frame.
allDat = pd.concat([Xraw, yraw], axis=1)

# Row filter: organic rows of the WestTexNewMexico region only.
mask = Xraw['region'].eq('WestTexNewMexico') & Xraw['type'].eq('organic')

# Independent copy so later edits do not touch allDat.
subset = allDat.loc[mask].copy()

In [24]:
def doy(X):
    """Return the day-of-year of ``X['DATE']`` as a one-column DataFrame.

    The output keeps the row index of ``X``; its single column is named
    after the source series (``'DATE'``).
    """
    day_numbers = X['DATE'].dt.day_of_year
    return pd.Series(day_numbers, index=X.index).to_frame()

def month(X):
    """Return the month number of ``X['DATE']`` as a one-column DataFrame.

    The output keeps the row index of ``X``; its single column is named
    after the source series (``'DATE'``).
    """
    month_numbers = X['DATE'].dt.month
    return pd.Series(month_numbers, index=X.index).to_frame()

def dow(X):
    """Return the day-of-week of ``X['DATE']`` as a one-column DataFrame.

    Values come from pandas' ``dt.day_of_week``. The output keeps the row
    index of ``X``; its single column is named after the source series
    (``'DATE'``).
    """
    weekday_numbers = X['DATE'].dt.day_of_week
    return pd.Series(weekday_numbers, index=X.index).to_frame()

def sin_seas(X, period=366):
    """Sine component of a cyclical encoding.

    Parameters
    ----------
    X : array-like or DataFrame of numbers
        Position within the cycle (for example a day-of-year value).
    period : float, default=366
        Length of one full cycle. The default keeps the divisor that was
        previously hard-coded; pass another value (e.g. through
        ``FunctionTransformer(kw_args={'period': ...})``) for other cycles.

    Returns
    -------
    Same container type as ``X`` holding ``sin(2*pi*X/period)``.
    """
    result = np.sin(2 * np.pi * X / period)
    return result

def cos_seas(X, period=366):
    """Cosine component of a cyclical encoding.

    Parameters
    ----------
    X : array-like or DataFrame of numbers
        Position within the cycle (for example a day-of-year value).
    period : float, default=366
        Length of one full cycle. The default keeps the divisor that was
        previously hard-coded; pass another value (e.g. through
        ``FunctionTransformer(kw_args={'period': ...})``) for other cycles.

    Returns
    -------
    Same container type as ``X`` holding ``cos(2*pi*X/period)``.
    """
    result = np.cos(2 * np.pi * X / period)
    return result

# Feature engineering applied to the DATE column; all other columns pass through.
#   'sinu' branch: day-of-year -> sin/cos pair
#   'oh' branch:   day-of-week and month numbers -> one-hot indicator columns
transformer = ColumnTransformer(
    transformers=[(
        'seas',
        FeatureUnion(transformer_list=[
            ('sinu', Pipeline(steps=[
                ('doy', FunctionTransformer(func=doy, feature_names_out='one-to-one')),
                ('union', FeatureUnion(transformer_list=[
                    ('sin', FunctionTransformer(func=sin_seas, feature_names_out='one-to-one')),
                    ('cos', FunctionTransformer(func=cos_seas, feature_names_out='one-to-one')),
                ])),
            ])),
            ('oh', Pipeline(steps=[
                ('nums', FeatureUnion(transformer_list=[
                    ('dow', FunctionTransformer(func=dow, feature_names_out='one-to-one')),
                    # The step is labelled 'dom' but wraps month(); the label is
                    # kept because it is part of the generated feature names.
                    ('dom', FunctionTransformer(func=month, feature_names_out='one-to-one')),
                ])),
                ('_', OneHotEncoder(handle_unknown='ignore')),
            ])),
        ]),
        ['DATE'],
    )],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

In [25]:
# Model inputs: the date and average price columns; target: total volume.
X = subset.loc[:, ['DATE', 'AveragePrice']].copy()
y = subset.loc[:, 'Total Volume'].copy()


In [26]:
# Fit the column transformer on X and display the resulting feature matrix.
transformer.fit_transform(X)

array([[-0.89061651,  0.45475514,  1.        , ...,  0.        ,
         0.        ,  1.89      ],
       [-0.0857305 ,  0.99631836,  1.        , ...,  0.        ,
         1.        ,  1.81      ],
       [-0.20455207,  0.97885569,  1.        , ...,  0.        ,
         1.        ,  1.92      ],
       ...,
       [ 0.35275209,  0.93571682,  1.        , ...,  0.        ,
         0.        ,  1.87      ],
       [ 0.23803328,  0.971257  ,  1.        , ...,  0.        ,
         0.        ,  1.93      ],
       [ 0.11988119,  0.99278825,  1.        , ...,  0.        ,
         0.        ,  1.62      ]])

In [27]:
# Show the output column names produced by the fitted transformer.
transformer.get_feature_names_out()

array(['seas__sinu__sin__DATE', 'seas__sinu__cos__DATE',
       'seas__oh__dow__DATE_6', 'seas__oh__dom__DATE_1',
       'seas__oh__dom__DATE_2', 'seas__oh__dom__DATE_3',
       'seas__oh__dom__DATE_4', 'seas__oh__dom__DATE_5',
       'seas__oh__dom__DATE_6', 'seas__oh__dom__DATE_7',
       'seas__oh__dom__DATE_8', 'seas__oh__dom__DATE_9',
       'seas__oh__dom__DATE_10', 'seas__oh__dom__DATE_11',
       'seas__oh__dom__DATE_12', 'remainder__AveragePrice'], dtype=object)

In [16]:
# Stand-alone check of the month / day-of-week one-hot encoding, run step by
# step instead of inside a Pipeline.
fu = FeatureUnion(
    transformer_list=[
        ('month', FunctionTransformer(func=month, feature_names_out='one-to-one')),
        ('dow', FunctionTransformer(func=dow, feature_names_out='one-to-one')),
    ]
)
oh = OneHotEncoder(handle_unknown='ignore', drop='first')

# Extract the date parts first, then encode them; the encoded matrix is the
# cell's displayed output.
s1 = fu.fit_transform(X[['DATE']])
oh.fit_transform(s1)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 148 stored elements and shape (166, 11)>

In [17]:
# Month and day-of-week numbers extracted side by side from DATE.
feature_union = FeatureUnion(
    transformer_list=[
        ('month', FunctionTransformer(func=month, feature_names_out='one-to-one')),
        ('dow', FunctionTransformer(func=dow, feature_names_out='one-to-one')),
    ]
)

# Chain the extraction with a one-hot encoder that drops the first category
# of each extracted column.
pipeline = Pipeline(
    steps=[
        ('date_features', feature_union),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

# Route the DATE column through the pipeline.
# NOTE(review): the name `transformer` is also assigned in an earlier cell;
# running this cell rebinds it to this DATE-only variant.
transformer = ColumnTransformer(
    transformers=[('date_pipeline', pipeline, ['DATE'])],
    remainder='passthrough',
)

# Only the DATE column is supplied here, so nothing is left to pass through.
transformed_data = transformer.fit_transform(X[['DATE']])