In [1]:
# Imports
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Authenticate once; the client is reused for the download below.
api = KaggleApi()
api.authenticate()

competition_name = 'store-sales-time-series-forecasting'
output_dir = Path('datasets') / competition_name
zip_path = f'{competition_name}.zip'

# Treat a missing OR empty directory as "not downloaded yet": an empty directory
# can be left behind by an interrupted run and must not block a retry.
if not output_dir.is_dir() or not any(output_dir.iterdir()):
    # Download from kaggle
    print('Downloading dataset...')
    api.competition_download_files(competition=competition_name)

    # Extract datasets
    # The target directory is created only after the archive was fetched, so a
    # failed download never leaves a directory that looks like a finished one.
    os.makedirs(output_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)

In [3]:
# Walk the dataset directory and print the path of every file found in it.
dataset_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk(output_dir)
    for file_name in file_names
)
for file_path in dataset_file_paths:
    print(file_path)

datasets/store-sales-time-series-forecasting/sample_submission.csv
datasets/store-sales-time-series-forecasting/stores.csv
datasets/store-sales-time-series-forecasting/train.csv
datasets/store-sales-time-series-forecasting/transactions.csv
datasets/store-sales-time-series-forecasting/test.csv
datasets/store-sales-time-series-forecasting/oil.csv
datasets/store-sales-time-series-forecasting/holidays_events.csv


In [4]:
# Load the Data
# Both tables are read with the same options: `date` is parsed to datetimes.
csv_options = {'parse_dates': ['date']}
train = pd.read_csv(output_dir / 'train.csv', **csv_options)
test = pd.read_csv(output_dir / 'test.csv', **csv_options)

In [5]:
# Feature Engineering Functions
def create_date_features(df: pd.DataFrame):
    """Add calendar columns derived from the datetime column ``date``.

    The frame is modified in place and also returned. Columns written, in
    order: ``day``, ``weekofyear`` (ISO week), ``month``, ``year``,
    ``dayofweek`` (Monday=0), ``is_weekend``, ``is_month_start`` and
    ``is_month_end`` (the last three as 0/1 integers).
    """
    dates = df['date'].dt
    weekday = dates.dayofweek

    # Insertion order of this mapping fixes the order of the new columns.
    calendar_columns = {
        'day': dates.day,
        'weekofyear': dates.isocalendar().week.astype('int'),
        'month': dates.month,
        'year': dates.year,
        'dayofweek': weekday,
        'is_weekend': (weekday >= 5).astype(int),  # 5 = Saturday, 6 = Sunday
        'is_month_start': dates.is_month_start.astype(int),
        'is_month_end': dates.is_month_end.astype(int),
    }
    for column, values in calendar_columns.items():
        df[column] = values
    return df

def create_lag_features(df: pd.DataFrame, lags=(1, 7, 14, 28)):
    """Add per-series lagged copies of ``sales``.

    For every ``lag`` a column ``sales_lag_{lag}`` is written that holds the
    ``sales`` value found ``lag`` rows earlier within the same
    ``(store_nbr, family)`` group. Rows without enough history get NaN.
    "Earlier" follows the row order of ``df``, so the frame should already be
    ordered by date within each group.

    Args:
        df: Frame with ``store_nbr``, ``family`` and ``sales`` columns.
            Modified in place.
        lags: Iterable of row offsets. The default is an immutable tuple so
            the default value cannot be altered between calls.

    Returns:
        The same ``df`` object, with the lag columns added.
    """
    # The grouping does not depend on the lag, so build it once.
    sales_by_series = df.groupby(['store_nbr', 'family'])['sales']
    for lag in lags:
        df[f'sales_lag_{lag}'] = sales_by_series.shift(lag)
    return df

def create_rolling_features(df: pd.DataFrame, windows=(7, 14, 28)):
    """Add per-series rolling means of the previous ``sales`` values.

    For every ``window`` a column ``sales_roll_mean_{window}`` is written. Its
    value is the mean of the ``window`` ``sales`` values that precede the row
    inside the same ``(store_nbr, family)`` group (the current row is excluded
    by the one-step shift). Rows with fewer than ``window`` earlier values in
    their group get NaN.

    Both the shift and the rolling window are evaluated inside each group, so
    the result does not depend on the groups being stored contiguously in
    ``df``. Rows should still be ordered by date within each group.

    Args:
        df: Frame with ``store_nbr``, ``family`` and ``sales`` columns.
            Modified in place.
        windows: Iterable of window lengths (in rows).

    Returns:
        The same ``df`` object, with the rolling-mean columns added.
    """
    sales_by_series = df.groupby(['store_nbr', 'family'])['sales']
    for window in windows:
        # `transform` keeps the original row positions, so the result can be
        # assigned straight back even when groups are interleaved.
        df[f'sales_roll_mean_{window}'] = sales_by_series.transform(
            lambda sales, w=window: sales.shift(1).rolling(window=w).mean()
        )
    return df

In [6]:
from sklearn.preprocessing import LabelEncoder

# Label Encoding
# The encoder learns the `family` categories from the training data only and
# the same mapping is then applied to both frames.
le = LabelEncoder()
le.fit(train['family'])
for frame in (train, test):
    frame['family'] = le.transform(frame['family'])

In [7]:
# Merge train and test for feature engineering
# Test rows get a NaN `sales` placeholder so both frames share the same columns;
# that NaN is also what later tells test rows apart from train rows.
test['sales'] = np.nan
stacked = pd.concat([train, test], axis=0)
# Order rows by series, then by date, so row offsets follow time within a series.
all_data = stacked.sort_values(by=['store_nbr', 'family', 'date'])

In [8]:
# Feature Engineering
# Each helper adds its columns to the frame and returns it, so the steps chain.
all_data = (
    all_data
    .pipe(create_date_features)
    .pipe(create_lag_features)
    .pipe(create_rolling_features)
)

In [9]:
# Split back
# Test rows are the ones whose `sales` is still the NaN placeholder.
is_test_row = all_data['sales'].isna()
train = all_data[~is_test_row]
test = all_data[is_test_row]

In [10]:
# Fill missing
# Every remaining NaN becomes the sentinel -1. This covers lag/rolling values
# without enough history and also the placeholder `sales` column of `test`.
train, test = (frame.fillna(-1) for frame in (train, test))

In [11]:
# Prepare Features and Target
# Model inputs grouped by origin; the concatenation order is the column order
# handed to the model.
id_features = ['store_nbr', 'family', 'onpromotion']
calendar_features = [
    'day', 'weekofyear', 'month', 'year', 'dayofweek',
    'is_weekend', 'is_month_start', 'is_month_end',
]
lag_features = ['sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28']
rolling_features = ['sales_roll_mean_7', 'sales_roll_mean_14', 'sales_roll_mean_28']

features = id_features + calendar_features + lag_features + rolling_features

target = 'sales'

In [12]:
# Train-Validation Split
# Time-based holdout: rows dated before the cutoff are used for fitting, rows
# on or after it for validation.
split_date = '2017-07-01'
before_split = train['date'] < split_date
from_split = train['date'] >= split_date

X_train, y_train = train.loc[before_split, features], train.loc[before_split, target]
X_valid, y_valid = train.loc[from_split, features], train.loc[from_split, target]

In [13]:
# Important Trick: Log Transform Target
# The model is fit on log1p(sales); predictions are mapped back with expm1.
y_train_log, y_valid_log = (np.log1p(sales) for sales in (y_train, y_valid))

In [14]:
# LightGBM Dataset
import lightgbm as lgb

def _to_lgb_dataset(feature_frame, log_target):
    """Wrap a feature frame and its log-target in a LightGBM Dataset with `family` marked categorical."""
    return lgb.Dataset(feature_frame, label=log_target, categorical_feature=['family'])


train_data = _to_lgb_dataset(X_train, y_train_log)
valid_data = _to_lgb_dataset(X_valid, y_valid_log)

In [15]:
# LightGBM Parameters
params = dict(
    # Task and evaluation metric
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    # Step size and tree shape
    learning_rate=0.02,
    num_leaves=256,
    max_depth=8,
    # Column and row subsampling
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # L1 / L2 regularisation
    lambda_l1=5,
    lambda_l2=1,
    # Logging verbosity
    verbose=-1,
)

In [16]:
import lightgbm as lgb
from tqdm import tqdm

# Fit the booster for a fixed number of rounds, reporting the metric on both
# the training and the validation set every 200 rounds.
# NOTE(review): the log recorded under this cell runs to round 1000, which does
# not match num_boost_round=600 — the output looks stale; re-run to confirm.
eval_logger = lgb.log_evaluation(period=200)
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=600,
    valid_sets=[train_data, valid_data],
    callbacks=[eval_logger],
)

[200]	training's rmse: 0.399852	valid_1's rmse: 0.384054
[400]	training's rmse: 0.379272	valid_1's rmse: 0.377312
[600]	training's rmse: 0.371247	valid_1's rmse: 0.375009
[800]	training's rmse: 0.366506	valid_1's rmse: 0.373571
[1000]	training's rmse: 0.362647	valid_1's rmse: 0.37258


In [17]:
from sklearn.metrics import mean_squared_log_error

# Validation RMSLE
# Predict in log space, then undo the log1p target transform with expm1.
best_round = model.best_iteration
y_pred_valid_log = model.predict(X_valid, num_iteration=best_round)
y_pred_valid = np.expm1(y_pred_valid_log)

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Predictions are clipped at zero before scoring, so negative predictions
    are scored as 0.
    """
    clipped_pred = np.maximum(0, y_pred)
    msle = mean_squared_log_error(y_true, clipped_pred)
    return np.sqrt(msle)

# Score the validation predictions on the original (non-log) sales scale.
valid_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {valid_rmsle:.5f}")

Validation RMSLE: 0.37258


In [18]:
# Predict on Test Set
X_test = test[features]
# The keyword is `num_iteration` (singular), the same one the validation
# prediction uses; the previous spelling `num_iterations` is not that argument.
y_test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
# Undo the log1p target transform, then clip at zero so no prediction is negative.
y_test_pred = np.maximum(0, np.expm1(y_test_pred_log))

In [21]:
# Submission
submission = pd.read_csv(output_dir / 'sample_submission.csv')

# `test` was re-sorted by (store_nbr, family, date) when train and test were
# merged for feature engineering, so `y_test_pred` follows that order and not
# the row order of the sample submission. Match predictions to rows by `id`
# instead of assigning them positionally.
# NOTE(review): assumes `test` and the sample submission both carry the
# competition's `id` column — confirm against the data files.
pred_by_id = pd.Series(y_test_pred, index=test['id'].to_numpy())
submission['sales'] = submission['id'].map(pred_by_id)

# Fail loudly rather than write a file with missing predictions.
if submission['sales'].isna().any():
    raise ValueError('Some submission ids have no matching prediction')

submission.to_csv('submission.csv', index=False)

print('Submission file created successfully brother (Optimized)!')

Submission file created successfully brother (Optimized)!
