In [None]:
import libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from datetime import datetime

# Load Dataset

In [14]:
train = pd.read_csv("train.csv" , parse_dates=['date'])

In [16]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [18]:
test = pd.read_csv('test.csv', parse_dates=['date'])

In [20]:
test

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [22]:
stores = pd.read_csv('stores.csv')

In [24]:
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [26]:
oil = pd.read_csv('oil.csv', parse_dates=['date'])

In [28]:
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [30]:
holidays = pd.read_csv('holidays_events.csv', parse_dates=['date'])


In [32]:
holidays

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [34]:
# Merge additional data into train dataset

In [36]:
train = train.merge(stores, on='store_nbr', how='left')

In [38]:
train = train.merge(oil, on='date', how='left')

In [60]:
train = train.merge(holidays[['date', 'type']], on='date', how='left')

In [42]:
# Handle missing oil prices with interpolation

In [44]:
train['dcoilwtico'].interpolate(inplace=True)

In [46]:
# Create time-based features
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['day_of_week'] = train['date'].dt.dayofweek
train['is_weekend'] = (train['day_of_week'] >= 5).astype(int)


In [48]:
# Lag Features
train['sales_lag_7'] = train['sales'].shift(7)
train['sales_lag_30'] = train['sales'].shift(30)


In [50]:
# Moving Average Features
train['sales_ma_7'] = train['sales'].rolling(7).mean()
train['sales_ma_30'] = train['sales'].rolling(30).mean()


In [62]:
# Holiday-based Features
train['is_holiday'] = train['type'].apply(lambda x: 1 if x in ['Holiday', 'Transfer'] else 0)
train['is_event'] = train['type'].apply(lambda x: 1 if x in ['Event', 'Additional'] else 0)
train['is_disaster'] = train['type'].apply(lambda x: 1 if x == 'Disaster' else 0)

In [64]:
# Fill NA values
train.fillna(0, inplace=True)

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Train- Test split

In [67]:
features = ['store_nbr', 'year', 'month', 'day', 'day_of_week', 'is_weekend', 'dcoilwtico', 
            'sales_lag_7', 'sales_lag_30', 'sales_ma_7', 'sales_ma_30', 'is_holiday']

In [69]:
X = train[features]
y = train['sales']

In [73]:
# Training data: data for  the last 80% of the days
train_size = int(len(train) * 0.8)

X_train = train.iloc[:train_size][features]
y_train = train.iloc[:train_size]['sales']

X_valid = train.iloc[train_size:][features]
y_valid = train.iloc[train_size:]['sales']

# Train Models

Baseline Model (Naïve)

In [77]:
y_pred_naive = y_valid.shift(1).fillna(method='bfill')  # Last known value as prediction
mse_naive = mean_squared_error(y_valid, y_pred_naive)
print(f'Naïve Model RMSE: {np.sqrt(mse_naive)}')

Naïve Model RMSE: 1916.9689166202973


  y_pred_naive = y_valid.shift(1).fillna(method='bfill')  # Last known value as prediction


Arima model

Random Forest Regressor

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_valid)
mse_rf = mean_squared_error(y_valid, y_pred_rf)
print(f'Random Forest RMSE: {np.sqrt(mse_rf)}')

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor

In [None]:
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_valid)
mse_xgb = mean_squared_error(y_valid, y_pred_xgb)
print(f'XGBoost RMSE: {np.sqrt(mse_xgb)}')

In [None]:
X_train_lstm = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1], 1))
X_valid_lstm = np.array(X_valid).reshape((X_valid.shape[0], X_valid.shape[1], 1))

model_lstm = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], 1)),
    Dense(1)
])

model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_train_lstm, y_train, epochs=10, batch_size=16, verbose=1)

y_pred_lstm = model_lstm.predict(X_valid_lstm)
mse_lstm = mean_squared_error(y_valid, y_pred_lstm)
print(f'LSTM Model RMSE: {np.sqrt(mse_lstm)}')

ARIMA Model

In [None]:
model_arima = ARIMA(y_train, order=(5,1,0))  
model_arima_fit = model_arima.fit(low momery=True)
y_pred_arima = model_arima_fit.forecast(steps=len(y_valid))
mse_arima = mean_squared_error(y_valid, y_pred_arima)
print(f'ARIMA Model RMSE: {np.sqrt(mse_arima)}')

# Model Evaluation

In [None]:
# Model Comparison
models = ['Naïve', 'ARIMA', 'RandomForest', 'XGBoost', 'LSTM']
rmse_values = [np.sqrt(mse_naive), np.sqrt(mse_arima), np.sqrt(mse_rf), np.sqrt(mse_xgb), np.sqrt(mse_lstm)]

plt.figure(figsize=(10, 5))
sns.barplot(x=models, y=rmse_values)
plt.xlabel('Models')
plt.ylabel('RMSE')
plt.title('Model Performance Comparison')
plt.show()


# Make Predictions for Test Data

In [None]:
# Apply same preprocessing on test data
test = test.merge(stores, on='store_nbr', how='left')
test = test.merge(oil, on='date', how='left')
test = test.merge(holidays, on='date', how='left')

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek
test['is_weekend'] = (test['day_of_week'] >= 5).astype(int)

test['is_holiday'] = test['type'].apply(lambda x: 1 if x in ['Holiday', 'Transfer'] else 0)
test.fillna(0, inplace=True)

# Use best model (XGBoost) for final predictions
test_preds = xgb_model.predict(test[features])

# Save results
submission = pd.DataFrame({'id': test['id'], 'sales': test_preds})
submission.to_csv('sales_forecast_submission.csv', index=False)