In [34]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [35]:
# Directory from which the CSV inputs (train.csv, holidays_events.csv) are read.
comp_dir = Path('store-sales-time-series-forecasting')

In [36]:
# Holiday/event table: descriptive columns as categories, `transferred` as
# bool, indexed by a daily PeriodIndex built from the parsed `date` column.
holidays_events = (
    pd.read_csv(
        comp_dir / "holidays_events.csv",
        parse_dates=['date'],
        dtype={
            **dict.fromkeys(['type', 'locale', 'locale_name', 'description'], 'category'),
            'transferred': 'bool',
        },
    )
    .set_index('date')
    .to_period('D')
)

# Sales table: only the four columns used here, converted to daily periods and
# indexed (and sorted) by store, product family and date.
store_sales = (
    pd.read_csv(
        comp_dir / 'train.csv',
        parse_dates=['date'],
        usecols=['store_nbr', 'family', 'date', 'sales'],
        dtype={
            **dict.fromkeys(['store_nbr', 'family'], 'category'),
            'sales': 'float32',
        },
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

# Mean of `sales` per date over all stores and families, restricted to 2017.
average_sales = store_sales.groupby('date').mean().squeeze().loc['2017']

In [40]:
# Load the data (raw frames: `date` is parsed but not yet converted or indexed)
holidays_events = pd.read_csv(
    comp_dir / "holidays_events.csv",
    parse_dates=['date'],
    dtype={
        **dict.fromkeys(['type', 'locale', 'locale_name', 'description'], 'category'),
        'transferred': 'bool',
    },
)

store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    parse_dates=['date'],
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        **dict.fromkeys(['store_nbr', 'family'], 'category'),
        'sales': 'float32',
    },
)

class DateIndexer(BaseEstimator, TransformerMixin):
    """Convert a datetime column to daily periods and move it to the front.

    Parameters
    ----------
    date_column : label
        Name of the column to convert. The column must support the
        ``.dt.to_period`` accessor.
    """

    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a converted copy of ``X``; the caller's frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``date_column``.

        Returns
        -------
        pandas.DataFrame
            New frame in which ``date_column`` holds daily periods and is the
            first column; the index is a fresh default index.
        """
        # Work on a copy: assigning the period column into the caller's frame
        # would change that frame's dtype behind the caller's back, and a later
        # transform of the same frame would no longer receive a datetime column.
        X = X.copy()
        X[self.date_column] = X[self.date_column].dt.to_period('D')
        # The set_index/reset_index round trip moves the date column to
        # position 0 and replaces the existing index with a default one.
        return X.set_index(self.date_column).reset_index()

class SalesPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless step that indexes the rows by store, family and date."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed on (store_nbr, family, date), sorted by that index."""
        index_levels = ['store_nbr', 'family', 'date']
        indexed = X.set_index(index_levels)
        return indexed.sort_index()

class AverageSalesProcessor(BaseEstimator, TransformerMixin):
    """Average the input per date and keep only the requested period.

    Parameters
    ----------
    period : str, default '2017'
        Label passed to ``.loc`` on the per-date averages to select the rows
        to keep. The default reproduces the previously hard-coded selection.
    """

    def __init__(self, period='2017'):
        self.period = period

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the per-date mean of ``X`` restricted to ``self.period``.

        ``X`` is grouped by ``'date'`` and averaged; ``squeeze()`` collapses a
        single-column result to a Series before the ``.loc`` selection.
        """
        per_date_mean = X.groupby('date').mean()
        return per_date_mean.squeeze().loc[self.period]

# Build the pipeline: convert dates, index the sales rows, then average per date.
pipeline = Pipeline(steps=[
    ('date_indexer', DateIndexer('date')),
    ('sales_preprocessor', SalesPreprocessor()),
    ('average_sales_processor', AverageSalesProcessor()),
])

# Apply the pipeline to the data; the holiday table is indexed by daily period
# directly, outside the pipeline.
holidays_events = holidays_events.set_index('date').to_period(freq='D')
average_sales = pipeline.fit_transform(store_sales)

print(average_sales.head(5))

date
2017-01-01      3.609085
2017-01-02    813.093689
2017-01-03    626.199890
2017-01-04    567.608459
2017-01-05    438.580963
Freq: D, Name: sales, dtype: float32
