In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import optuna.visualization as vis
import category_encoders as ce
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit

sns.set_style('whitegrid')

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
!pip install optuna-integration  # Need for optuna work

In [None]:
# Every table except stores.csv is read with its `date` column parsed to datetime.
date_kwargs = {'parse_dates': ['date']}

train_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', **date_kwargs)
test_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', **date_kwargs)
oil_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', **date_kwargs)
holidays_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', **date_kwargs)
transactions_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv', **date_kwargs)
# stores.csv is read without date parsing.
stores_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')

# And this cell is for a local run

In [None]:
# Preview the first rows of the raw training table.
print(train_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
train_df.info()

In [None]:
# Preview the first rows of the raw test table.
print(test_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
test_df.info()

In [None]:
# Preview the first rows of the oil table.
print(oil_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
oil_df.info()

In [None]:
# Preview the first rows of the holidays/events table.
print(holidays_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
holidays_df.info()

In [None]:
# Preview the first rows of the transactions table.
print(transactions_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
transactions_df.info()

In [None]:
# Preview the first rows of the stores table.
print(stores_df.head())
# info() writes its report to stdout and returns None, so wrapping it in print() adds a stray "None".
stores_df.info()

In [None]:
# Keep a single holiday row per date: if holidays_df lists more than one event for a
# date, a left merge on `date` would emit one copy of every sales row per event.
holidays_by_date = holidays_df.drop_duplicates(subset='date')

# Attach store metadata, daily transactions, oil prices and holiday info to each sales row.
# Both stores_df and holidays_df carry a `type` column, so pandas suffixes them as
# `type_x` (store) and `type_y` (holiday).
train = train_df.merge(stores_df, on='store_nbr', how='left')
train = train.merge(transactions_df, on=['date', 'store_nbr'], how='left')
train = train.merge(oil_df, on='date', how='left')
train = train.merge(holidays_by_date, on='date', how='left')
train.head()

In [None]:
# Fill the gaps left by the left merges in one chained step:
#   - oil_price: oil quote carried forward, then backward for any leading gap
#   - holiday_type: holiday `type` column, with 'No Holiday' where no event matched
#   - transactions: forward-filled in row order
# The raw holiday `type_y` column is dropped once holiday_type has been derived from it.
train = train.assign(
    oil_price=train['dcoilwtico'].ffill().bfill(),
    holiday_type=train['type_y'].fillna('No Holiday'),
    transactions=train['transactions'].ffill(),
).drop(columns=['type_y'])

In [None]:
# Show the first five rows of the merged frame after the fill step.
train.head(5)

In [None]:
# Compute the missing-value mask once and reuse it for both the counts and the plot.
null_mask = train.isnull()
print(null_mask.sum())

# Heatmap of the mask: each cell marks whether that row/column entry is missing.
fig, ax = plt.subplots()
sns.heatmap(null_mask, cbar=False, ax=ax)
ax.set_title('Пропуски в данных')
plt.show()

In [None]:
# Rebuild the training frame from the raw tables so this cell does not depend on the
# exploratory cells above.
# Keep a single holiday row per date: if holidays_df lists more than one event for a
# date, a left merge on `date` would emit one copy of every sales row per event and
# break the per-series lag features computed later.
holidays_by_date = holidays_df.drop_duplicates(subset='date')

train = train_df.merge(stores_df, on='store_nbr', how='left')
train = train.merge(transactions_df, on=['date', 'store_nbr'], how='left')
train = train.merge(oil_df, on='date', how='left')
train = train.merge(holidays_by_date, on='date', how='left')

# Fill the gaps introduced by the left merges.
train['oil_price'] = train['dcoilwtico'].ffill().bfill()
train['holiday_type'] = train['type_y'].fillna('No Holiday')
train['transactions'] = train['transactions'].ffill()
train.drop(columns=['type_y'], inplace=True)

# Store number and cluster are identifiers, not quantities: treat them as categoricals.
train['store_nbr'] = train['store_nbr'].astype(str)
train['cluster'] = train['cluster'].astype(str)

# Calendar features
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day_of_week'] = train['date'].dt.dayofweek
# isocalendar() yields a nullable UInt32 column; cast to a plain integer dtype.
train['week_of_year'] = train['date'].dt.isocalendar().week.astype(int)
train['is_weekend'] = train['day_of_week'].isin([5, 6]).astype(int)
# Rows without any event carry the 'No Holiday' placeholder, so both that placeholder
# and 'Work Day' must be excluded; comparing against 'Work Day' alone flags ordinary days.
train['is_holiday'] = (~train['holiday_type'].isin(['No Holiday', 'Work Day'])).astype(int)
train['is_month_end'] = train['date'].dt.is_month_end.astype(int)
train['is_month_start'] = train['date'].dt.is_month_start.astype(int)
# Payday flag: the 15th and the last day of each month.
train['is_payday'] = ((train['date'].dt.day == 15) | (train['date'].dt.is_month_end)).astype(int)
# Mean sales of each (store, family) series over all training rows.
train['store_family_mean'] = train.groupby(['store_nbr', 'family'])['sales'].transform('mean')


In [None]:
# Log-transform the target variable
train['log_sales'] = np.log1p(train['sales'])

# Keep a single holiday row per date: if holidays_df lists more than one event for a
# date, a left merge on `date` would emit one copy of every test row per event.
holidays_by_date = holidays_df.drop_duplicates(subset='date')

# Build the test frame with the same merges and fills as the training frame.
test = test_df.merge(stores_df, on='store_nbr', how='left')
test = test.merge(transactions_df, on=['date', 'store_nbr'], how='left')
test = test.merge(oil_df, on='date', how='left')
test = test.merge(holidays_by_date, on='date', how='left')

test['oil_price'] = test['dcoilwtico'].ffill().bfill()
test['holiday_type'] = test['type_y'].fillna('No Holiday')
test['transactions'] = test['transactions'].ffill()
test.drop(columns=['type_y'], inplace=True)

test['store_nbr'] = test['store_nbr'].astype(str)
test['cluster'] = test['cluster'].astype(str)

# Calendar features (mirrors the training frame)
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day_of_week'] = test['date'].dt.dayofweek
# isocalendar() yields a nullable UInt32 column; cast to a plain integer dtype.
test['week_of_year'] = test['date'].dt.isocalendar().week.astype(int)
test['is_weekend'] = test['day_of_week'].isin([5, 6]).astype(int)
# Rows without any event carry the 'No Holiday' placeholder, so both that placeholder
# and 'Work Day' must be excluded; comparing against 'Work Day' alone flags ordinary days.
test['is_holiday'] = (~test['holiday_type'].isin(['No Holiday', 'Work Day'])).astype(int)
test['is_month_end'] = test['date'].dt.is_month_end.astype(int)
test['is_month_start'] = test['date'].dt.is_month_start.astype(int)
test['is_payday'] = ((test['date'].dt.day == 15) | (test['date'].dt.is_month_end)).astype(int)

lags = [7, 14, 30, 60]
rolling_windows = [7, 14, 30]

# Every lag/rolling feature is computed within one (store, family) series.
# NOTE(review): shift() works on row order, so this assumes rows are in date order
# within each series — confirm for the input files.
series_keys = ['store_nbr', 'family']
sales_by_series = train.groupby(series_keys)['sales']

for lag in lags:
    train[f'lag_{lag}_sales'] = sales_by_series.shift(lag)

for window in rolling_windows:
    # shift(1) keeps the current row's own sales out of its window. Shift and rolling
    # must both run inside transform(): calling .rolling() on the result of a grouped
    # shift operates on the whole column and mixes neighbouring series together.
    train[f'rolling_mean_{window}'] = sales_by_series.transform(
        lambda s: s.shift(1).rolling(window).mean()
    )
    train[f'rolling_std_{window}'] = sales_by_series.transform(
        lambda s: s.shift(1).rolling(window).std()
    )

# Carry the last non-null training value of each feature into every test row of the
# same (store, family) series.
carried_features = [f'lag_{lag}_sales' for lag in lags]
for window in rolling_windows:
    carried_features += [f'rolling_mean_{window}', f'rolling_std_{window}']

last_known = train.groupby(series_keys)[carried_features].last()
test = test.join(last_known, on=series_keys)

In [None]:
# Replace every remaining missing value with 0 in both frames, in place.
for frame in (train, test):
    frame.fillna(0, inplace=True)

In [None]:
# Split train into the target (log-transformed sales) and the feature matrix.
target_columns = ['sales', 'log_sales']
y = train['log_sales']
X = train.drop(columns=target_columns)

# Column groups by dtype: numeric columns get imputed/scaled, object columns get target-encoded.
numerical_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [None]:
from optuna.integration import XGBoostPruningCallback

# Preprocessing for numeric columns: mean-impute missing values, then standardise.
mean_imputer = SimpleImputer(strategy='mean')
standard_scaler = StandardScaler()
numerical_transformer = Pipeline(steps=[('imputer', mean_imputer), ('scaler', standard_scaler)])

# Objective function for Optuna
def objective(trial):
    """Score one XGBoost hyperparameter configuration with time-series cross-validation.

    Reads the module-level ``X``, ``y``, ``categorical_features``,
    ``numerical_features`` and ``numerical_transformer``. For each of the 7
    ``TimeSeriesSplit`` folds it target-encodes the categorical columns,
    imputes/scales the numeric ones, trains a booster with early stopping on
    the fold's validation part and scores the predictions.

    ``y`` is already ``log1p(sales)``, so the RMSE computed here in log space
    is the RMSLE of the raw sales; applying ``mean_squared_log_error`` to ``y``
    would take the logarithm a second time.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold RMSLE values (lower is better).
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'device': 'cuda',
        'random_state': 42
    }
    # Sampled once per trial: every fold shares the same cap on boosting rounds.
    num_boost_round = trial.suggest_int('n_estimators', 100, 1000)

    # Plain lists are unambiguous column selectors for the encoder and the ColumnTransformer.
    cat_cols = list(categorical_features)
    num_cols = list(numerical_features)

    tscv = TimeSeriesSplit(n_splits=7)
    rmsle_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the target encoder on the training part only so the validation
        # targets do not leak into the encoded features.
        target_encoder = ce.TargetEncoder(cols=cat_cols)
        X_train_encoded = target_encoder.fit_transform(X_train, y_train)
        X_test_encoded = target_encoder.transform(X_test)

        # Numeric columns are imputed and scaled; the already-encoded categorical
        # columns pass through. Any other column is dropped.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, num_cols),
                ('cat', 'passthrough', cat_cols)
            ]
        )

        X_train_processed = preprocessor.fit_transform(X_train_encoded)
        X_test_processed = preprocessor.transform(X_test_encoded)

        dtrain = xgb.DMatrix(X_train_processed, label=y_train)
        dtest = xgb.DMatrix(X_test_processed, label=y_test)

        # Train the model with early stopping
        model = xgb.train(
            param,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=15,
            verbose_eval=False,
        )

        # Predict with the trees up to the best early-stopped iteration; log1p(sales)
        # is never negative, so negative predictions are clipped to zero.
        y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
        y_pred = np.maximum(0, y_pred)

        # RMSE in log1p space, i.e. the RMSLE of the raw sales
        rmsle = float(np.sqrt(np.mean((y_pred - y_test.to_numpy()) ** 2)))
        rmsle_scores.append(rmsle)

    return sum(rmsle_scores) / len(rmsle_scores)

# Run the Optuna search. The TPE sampler is seeded so that repeated runs sample the
# same sequence of hyperparameters.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Report the best trial found.
print("Best trial:")
trial = study.best_trial
print(f"  RMSLE: {trial.value}")
print("  Best hyperparameters: ", trial.params)