In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the four competition files from the read-only Kaggle input folder.
# low_memory=False is passed for train.csv only, so pandas infers each column's
# dtype from the whole file instead of chunk by chunk.
train_df = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/train.csv',
    low_memory=False,
)
test_df = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/test.csv',
)
store_df = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/store.csv',
)
submission_df = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/sample_submission.csv',
)

In [None]:
# Attach the per-store attributes to every train/test row. A left join keeps
# every row of the left frame even when no matching store record exists.
merged_df = pd.merge(train_df, store_df, how='left', on='Store')
merged_test_df = pd.merge(test_df, store_df, how='left', on='Store')

In [None]:
# Sales over time for store 1 only, indexed by the Date column.
store1_rows = merged_df[merged_df['Store'] == 1]
store1_sales = store1_rows.set_index('Date')['Sales']
store1_sales.plot()

In [None]:
# Number of missing values in each column of the merged training frame.
merged_df.isna().sum()

In [None]:
# Distribution of the target variable, in 50 bins.
sales_values = merged_df['Sales']
sns.histplot(sales_values, bins=50)

In [None]:
# Pairwise correlations between sales and a few numeric columns,
# drawn as an annotated heatmap.
corr_features = ['Sales', 'Customers', 'Promo', 'SchoolHoliday']
corr = merged_df[corr_features].corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')

In [None]:
# Sales distribution for each day of the week.
sns.boxplot(data=merged_df, x='DayOfWeek', y='Sales')

In [None]:
# Sales distribution with and without a promo.
sns.boxplot(data=merged_df, x='Promo', y='Sales')

In [None]:
# Sales distribution on school holidays versus other days.
sns.boxplot(data=merged_df, x='SchoolHoliday', y='Sales')

In [None]:
def split_date(df):
    """Parse ``df['Date']`` and add calendar-part columns, modifying ``df`` in place.

    The ``Date`` column is converted with ``pd.to_datetime``; the columns
    ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (ISO week number) are then
    derived from it. Nothing is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    when = df['Date'].dt

    calendar_parts = (
        ('Year', when.year),
        ('Month', when.month),
        ('Day', when.day),
        ('WeekOfYear', when.isocalendar().week),
    )
    for column, values in calendar_parts:
        df[column] = values

In [None]:
# Add the date-part columns to both the training and the test frame (in place).
for frame in (merged_df, merged_test_df):
    split_date(frame)

In [None]:
# Keep only rows where the store was open; .copy() detaches the result from
# the original frame so later column assignments act on an independent object.
is_open = merged_df['Open'] == 1
merged_df = merged_df[is_open].copy()

In [None]:
# Spot-check ten random rows after the date split and open-store filter.
merged_df.sample(n=10)

In [None]:
def comp_months(df):
    """Add ``CompOpenSince`` to ``df`` in place: months since the competitor opened.

    The value is ``12 * (Year - CompetitionOpenSinceYear) +
    (Month - CompetitionOpenSinceMonth)``. Negative results are replaced
    by 0, and missing results are filled with 0. Nothing is returned.
    """
    def _floor_at_zero(value):
        return 0 if value < 0 else value

    year_gap = df['Year'] - df['CompetitionOpenSinceYear']
    month_gap = df['Month'] - df['CompetitionOpenSinceMonth']
    months_elapsed = 12 * year_gap + month_gap

    df['CompOpenSince'] = months_elapsed.map(_floor_at_zero).fillna(0)

In [None]:
# Add the competition-age feature to both frames (in place).
for frame in (merged_df, merged_test_df):
    comp_months(frame)

In [None]:
# Spot-check ten random rows now that CompOpenSince has been added.
merged_df.sample(n=10)

In [None]:
def check_promo_month(row):
    """Return 1 if the row's month starts a new Promo2 round for its store, else 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Promo2OpenSince'`` (as produced by ``promo_cols``),
        ``'PromoInterval'`` (comma-separated month abbreviations, or a
        non-string such as NaN when the store has none) and ``'Month'``
        (calendar month number, 1-12).

    Returns
    -------
    int
        1 when ``Promo2OpenSince`` is truthy and the row's month is listed in
        ``PromoInterval``; 0 otherwise.

    Raises
    ------
    KeyError
        If a required field is missing from ``row``. The earlier version
        caught every exception, which hid the fact that it referenced an
        undefined ``months`` variable and a non-existent ``'Promo2Open'``
        column, so it always returned 0.
    """
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

    # Promo2 not running for this row (0 months or not participating).
    if not row['Promo2OpenSince']:
        return 0

    # Stores without Promo2 have no interval; after a CSV load that is NaN.
    interval = row['PromoInterval']
    if not isinstance(interval, str):
        return 0

    month_abbr = month2str.get(row['Month'])
    if month_abbr is None:
        return 0

    # Compare on the first three letters so a four-letter spelling such as
    # 'Sept' still matches 'Sep'.
    # NOTE(review): the Rossmann store.csv is believed to use 'Sept' - confirm.
    months = {name.strip()[:3] for name in interval.split(',')}
    return 1 if month_abbr in months else 0

In [None]:
def promo_cols(df):
    """Add the Promo2 features ``Promo2OpenSince`` and ``IsPromo2Month`` in place.

    ``Promo2OpenSince`` is ``12 * (Year - Promo2SinceYear) +
    (WeekOfYear - Promo2SinceWeek) // 4``, with negative values replaced by 0,
    missing values filled with 0, and the result multiplied by ``Promo2``.
    ``IsPromo2Month`` is ``check_promo_month`` applied to every row, also
    multiplied by ``Promo2``. Nothing is returned.
    """
    def _floor_at_zero(value):
        return 0 if value < 0 else value

    year_gap = df['Year'] - df['Promo2SinceYear']
    week_gap = df['WeekOfYear'] - df['Promo2SinceWeek']
    months_running = 12 * year_gap + week_gap // 4

    # The element-wise map (rather than a vectorised clamp) is kept so the
    # dtype that reaches fillna stays the same as before.
    df['Promo2OpenSince'] = months_running.map(_floor_at_zero).fillna(0) * df['Promo2']

    # Row-wise check; needs Promo2OpenSince, which was set just above.
    promo_month_flag = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = promo_month_flag * df['Promo2']

In [None]:
# Add the Promo2 features to both frames (in place).
for frame in (merged_df, merged_test_df):
    promo_cols(frame)

In [None]:
# List every column now on the training frame (original, merged-in store
# fields, and the engineered features added above) before choosing inputs.
merged_df.columns

In [None]:
# Columns used as model inputs, and the column the model predicts.
input_cols = [
    'Store',
    'DayOfWeek',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'CompOpenSince',
    'Promo2OpenSince',
    'IsPromo2Month',
    'Year',
    'Month',
    'Day',
    'Promo2',
    'WeekOfYear',
]
target_col = 'Sales'

In [None]:
# Independent copies of the feature columns and the target, so later
# preprocessing does not write back into merged_df.
train_inputs = merged_df.loc[:, input_cols].copy()
targets = merged_df.loc[:, target_col].copy()

In [None]:
# Same feature columns for the test frame, again as an independent copy.
test_inputs = merged_test_df.loc[:, input_cols].copy()

In [None]:
# Columns that get min-max scaled ...
numeric_cols = [
    'Store',
    'Promo',
    'SchoolHoliday',
    'CompetitionDistance',
    'CompOpenSince',
    'Promo2',
    'Promo2OpenSince',
    'IsPromo2Month',
    'Day',
    'Month',
    'Year',
    'WeekOfYear',
]
# ... and columns that get one-hot encoded.
categorical_cols = [
    'DayOfWeek',
    'StateHoliday',
    'StoreType',
    'Assortment',
]

In [None]:
# Largest competitor distance seen in the training inputs; used below to build
# the fill value for missing distances.
competition_distance = train_inputs['CompetitionDistance']
max_distance = competition_distance.max()

In [None]:
# Replace missing competitor distances with twice the largest training
# distance, using the same fill value for train and test.
missing_distance_fill = max_distance * 2
for inputs_df in (train_inputs, test_inputs):
    inputs_df['CompetitionDistance'] = inputs_df['CompetitionDistance'].fillna(missing_distance_fill)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training rows only.
numeric_train = train_inputs[numeric_cols]
scaler = MinMaxScaler()
scaler.fit(numeric_train)

In [None]:
# Scale the numeric columns of both frames with the scaler fitted on train.
for inputs_df in (train_inputs, test_inputs):
    inputs_df[numeric_cols] = scaler.transform(inputs_df[numeric_cols])

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense one-hot encoder fitted on the training categories; handle_unknown='ignore'
# is passed so categories absent from training do not raise at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder = encoder.fit(train_inputs[categorical_cols])

In [None]:
# Names of the generated one-hot columns, then add those columns to both frames.
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
for inputs_df in (train_inputs, test_inputs):
    inputs_df[encoded_cols] = encoder.transform(inputs_df[categorical_cols])

In [None]:
# Final feature matrices: scaled numeric columns followed by one-hot columns.
feature_cols = numeric_cols + encoded_cols
X = train_inputs[feature_cols]
X_test = test_inputs[feature_cols]

In [None]:
from xgboost import XGBRegressor

In [None]:
# Small baseline model: 20 estimators with max_depth 4, fixed seed, all cores.
model = XGBRegressor(
    n_estimators=20,
    max_depth=4,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the baseline on the full training set.
model.fit(X, y=targets)

In [None]:
# Predict on the same rows the model was fitted on, to measure training error.
predictions = model.predict(X)

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Return the root-mean-squared error between two equally shaped sequences.

    Computed directly with NumPy instead of
    ``mean_squared_error(a, b, squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the earlier form
    raises ``TypeError`` on current releases.

    Parameters
    ----------
    a, b : array-like
        Numeric values to compare. They are paired by position, so a pandas
        index is ignored.

    Returns
    -------
    float
        ``sqrt(mean((a - b) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    a_values = np.asarray(a, dtype=float)
    b_values = np.asarray(b, dtype=float)

    if a_values.shape != b_values.shape:
        raise ValueError(
            'rmse inputs must have the same shape, got {} and {}'.format(
                a_values.shape, b_values.shape))
    if a_values.size == 0:
        raise ValueError('rmse inputs must not be empty')

    return float(np.sqrt(np.mean((a_values - b_values) ** 2)))

In [None]:
# Training RMSE of the baseline model.
rmse(a=predictions, b=targets)

In [None]:
# Pair each feature name with the fitted model's importance score,
# most important first.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the ten highest-ranked features.
top_features = importance_df.head(10)
plt.figure(figsize=(10, 6))
plt.title('Feature Importance')
sns.barplot(data=top_features, x='importance', y='feature')

In [None]:
from sklearn.model_selection import KFold

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an XGBRegressor and score it on the training and validation data.

    Extra keyword arguments are forwarded to ``XGBRegressor`` alongside
    ``random_state=42`` and ``n_jobs=-1``.

    Returns
    -------
    tuple
        ``(fitted_model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)

    # Training split is scored first, then validation, as before.
    splits = ((X_train, train_targets), (X_val, val_targets))
    train_rmse, val_rmse = [
        rmse(regressor.predict(features), actual)
        for features, actual in splits
    ]
    return regressor, train_rmse, val_rmse

In [None]:
# 5-fold splitter. shuffle is left at KFold's default (off), so each fold is a
# contiguous block of rows in X's existing order.
kfold = KFold(n_splits=5)

In [None]:
# Train one model per fold, keep each fitted model, and report both errors.
models = []
fold_params = {'max_depth': 4, 'n_estimators': 20}

for train_idxs, val_idxs in kfold.split(X):
    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    train_targets, val_targets = targets.iloc[train_idxs], targets.iloc[val_idxs]

    model, train_rmse, val_rmse = train_and_evaluate(
        X_train, train_targets, X_val, val_targets, **fold_params
    )
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
def predict_avg(models, inputs):
    """Average the predictions of several models, element by element.

    Each object in ``models`` must expose ``predict(inputs)``. The
    per-model outputs are averaged along axis 0, so the result has the
    shape of a single model's prediction.
    """
    per_model_predictions = []
    for fitted in models:
        per_model_predictions.append(fitted.predict(inputs))
    return np.mean(per_model_predictions, axis=0)

In [None]:
# Averaged predictions of the fold models on the full training features.
preds = predict_avg(models=models, inputs=X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. random_state is fixed so every
# test_params(...) call below, and every re-run of the notebook, is scored on
# the same split; without it the comparisons are not reproducible.
X_train, X_val, train_targets, val_targets = train_test_split(
    X, targets, test_size=0.1, random_state=42
)

def test_params(**params):
    """Fit an XGBRegressor with ``params`` and print train/validation RMSE.

    Uses the notebook-level ``X_train``, ``train_targets``, ``X_val`` and
    ``val_targets``. ``random_state=42`` and ``n_jobs=-1`` are always set.
    Nothing is returned.
    """
    candidate = XGBRegressor(random_state=42, n_jobs=-1, **params)
    candidate.fit(X_train, train_targets)

    # Training split is scored first, then validation, as before.
    splits = ((X_train, train_targets), (X_val, val_targets))
    train_rmse, val_rmse = [
        rmse(candidate.predict(features), actual)
        for features, actual in splits
    ]
    print('Train RMSE: {}, Val RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
# n_estimators sweep, step 1: 10 estimators, everything else at the defaults.
test_params(n_estimators=10)

In [None]:
# n_estimators sweep: 30 estimators.
test_params(n_estimators=30)

In [None]:
# n_estimators sweep: 100 estimators.
test_params(n_estimators=100)

In [None]:
# n_estimators sweep: 200 estimators.
test_params(n_estimators=200)

In [None]:
# n_estimators sweep: 400 estimators.
test_params(n_estimators=400)

In [None]:
# n_estimators sweep: 600 estimators.
test_params(n_estimators=600)

In [None]:
# Tree-depth check at the shallow end: max_depth 2 with 10 estimators.
test_params(max_depth=2, n_estimators=10)

In [None]:
# Tree-depth check at the deep end: max_depth 10 with 100 estimators.
test_params(max_depth=10, n_estimators = 100)

In [None]:
# Final model: 800 estimators with max_depth 10, fixed seed, all cores.
model = XGBRegressor(
    n_estimators=800,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the final model on every training row (no hold-out).
model.fit(X, y=targets)

In [None]:
# Predict sales for the preprocessed test rows with the final model.
test_preds = model.predict(X_test)

In [None]:
# Put the predictions into the submission frame's Sales column; values are
# assigned by row position.
submission_df['Sales'] = test_preds

In [None]:
# Predict on the training rows with the final model, to report training error.
train_preds = model.predict(X)

In [None]:
# Training RMSE of the final model.
rmse(a=train_preds, b=targets)

In [None]:
# Inspect the submission frame after the predictions were filled in.
submission_df

In [None]:
# Zero the prediction for rows where the store is closed (Open == 0).
# A missing Open value would turn the product into NaN, so missing flags are
# treated as "open" (1) and the model's prediction is kept.
# NOTE(review): the Rossmann test.csv is believed to contain a few missing
# Open values - confirm with test_df['Open'].isna().sum().
open_flag = test_df['Open'].fillna(1)
submission_df['Sales'] = submission_df['Sales'] * open_flag

assert submission_df['Sales'].notna().all(), 'submission contains NaN Sales values'

In [None]:
# Write the submission without the row index. index is documented as a bool;
# the earlier index=None only worked because None is falsy.
submission_df.to_csv('submission.csv', index=False)