In [89]:
!pip install pandas numpy scikit-learn xgboost



In [90]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Load data
train = pd.read_csv('train.csv')
stores = pd.read_csv('stores.csv')
oil = pd.read_csv('oil.csv')
holidays = pd.read_csv('holidays_events.csv')
transactions = pd.read_csv('transactions.csv')
test = pd.read_csv('test.csv')


In [92]:
# Convert date to datetime
train['date'] = pd.to_datetime(train_data['date'])
holidays['date'] = pd.to_datetime(holidays['date'])
oil['date'] = pd.to_datetime(oil['date'])
transactions['date'] = pd.to_datetime(transactions['date'])
test['date'] = pd.to_datetime(test['date'])



In [93]:
# Merge additional information into the train and test data
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')

train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')

train = train.merge(holidays, on='date', how='left')
test = test.merge(holidays, on='date', how='left')

In [94]:
# Merge transactions data into train and test
train = train.merge(transactions, on=['date', 'store_nbr'], how='left')
test = test.merge(transactions, on=['date', 'store_nbr'], how='left')

In [95]:
# Fill missing values without inplace=True to avoid chained assignment
train['onpromotion'] = train['onpromotion'].fillna(0)
test['onpromotion'] = test['onpromotion'].fillna(0)

train['dcoilwtico'] = train['dcoilwtico'].ffill()
test['dcoilwtico'] = test['dcoilwtico'].ffill()

train['transactions'] = train['transactions'].fillna(0)
test['transactions'] = test['transactions'].fillna(0)

In [96]:
# Encode categorical features
train['family'] = train['family'].astype('category').cat.codes
test['family'] = test['family'].astype('category').cat.codes

train['type_x'] = train['type_x'].astype('category').cat.codes
test['type_x'] = test['type_x'].astype('category').cat.codes

In [97]:
# Selecting features and target variable
features = ['store_nbr', 'family', 'onpromotion', 'dcoilwtico', 'transactions', 'type_x', 'cluster']
X_train = train[features]
y_train = train['sales']
X_test = test[features]

In [98]:
# Initialize the LightGBM model
lgb_train_data = lgb.Dataset(X_train, label=y_train)
params = {
    'objective': 'regression',
    'metric': 'rmsle',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'random_state': 42
}

In [85]:
# Train the model
lgb_model = lgb.train(params, lgb_train_data, num_boost_round=100)

In [103]:
# Make predictions on the test set
test['sales'] = lgb_model.predict(X_test)

In [104]:
# Prepare submission
submission = test[['id', 'sales']]
submission.to_csv('submission.csv', index=False)

In [105]:
# Predictions on validation data
y_val_pred = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)

# Clip negative predictions to a small positive value (e.g., 0.1) to avoid errors
y_val_pred = np.clip(y_val_pred, 0.1, None)

print(f'Validation RMSE: {mean_squared_log_error(y_val, y_val_pred, squared=False)}')

Validation RMSE: 1.9336897344848585


