In [37]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
comp_dir = Path('store-sales-time-series-forecasting')

In [23]:
holidays_events = pd.read_csv(
    comp_dir / "holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date']
)
holidays_events = holidays_events.set_index('date').to_period('D')

store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
average_sales = (
    store_sales
    .groupby('date').mean()
    .squeeze()
    .loc['2017']
)

In [24]:
stores = pd.read_csv(comp_dir / 'stores.csv', dtype={'store_nbr': 'category'})
oil = pd.read_csv(comp_dir / 'oil.csv', parse_dates=['date'])
oil['date'] = oil['date'].dt.to_period('D')
transactions = pd.read_csv(comp_dir / 'transactions.csv', parse_dates=['date'])
transactions['date'] = transactions['date'].dt.to_period(('D'))
transactions['store_nbr'] = transactions['store_nbr'].astype('category')

In [26]:
def merge_data(df):
    df = df.reset_index().merge(stores, on='store_nbr', how='left')
    df = df.merge(oil, on='date', how='left')
    df = df.merge(holidays_events, on='date', how='left')
    df = df.merge(transactions, on=['date', 'store_nbr'], how='left')
    df = df.set_index(['store_nbr', 'family', 'date']).sort_index()
    return df

train = merge_data(store_sales)

# Создание дополнительных признаков из даты
def create_date_features(df):
    df['year'] = df.index.get_level_values('date').year
    df['month'] = df.index.get_level_values('date').month
    df['day'] = df.index.get_level_values('date').day
    df['dayofweek'] = df.index.get_level_values('date').dayofweek
    return df

In [36]:
categorical_columns = train.select_dtypes(include=['object']).columns
numeric_columns = train.select_dtypes(include=['number']).columns

In [38]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

In [39]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb)
])

In [31]:
# Создание дополнительных признаков из даты
def create_date_features(df):
    df['year'] = df.index.get_level_values('date').year
    df['month'] = df.index.get_level_values('date').month
    df['day'] = df.index.get_level_values('date').day
    df['dayofweek'] = df.index.get_level_values('date').dayofweek
    return df

train = create_date_features(train)

# Создание лагов
def create_lag_features(df, lags, col):
    for lag in lags:
        df[f'{col}_lag_{lag}'] = df[col].groupby(level=['store_nbr', 'family']).shift(lag)

lags = [1, 7, 30]
create_lag_features(train, lags, 'sales')

# Удаление строк с пропущенными значениями (появившимися из-за лагов)
train.dropna(inplace=True)

# Определение признаков и целевой переменной
features = ['store_nbr', 'family', 'onpromotion', 'transactions', 'dcoilwtico', 'year', 'month', 'day', 'dayofweek'] + \
           [f'sales_lag_{lag}' for lag in lags]
X = train[features]
y = train['sales']

  df[f'{col}_lag_{lag}'] = df[col].groupby(level=['store_nbr', 'family']).shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].groupby(level=['store_nbr', 'family']).shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].groupby(level=['store_nbr', 'family']).shift(lag)


KeyError: "['store_nbr', 'family', 'onpromotion'] not in index"