<a href="https://colab.research.google.com/github/Jyoti200/Sales_forescasting/blob/main/Sale_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Store Sales- Time Series Forecasting

 We will predict sales for the thousands of product families sold at Favorita stores located in Ecuador. The training data includes dates, store and product information, whether that item was being promoted, as well as the sales numbers.

**train.csv**
1. The training data, comprising time series of features store_nbr, family, and onpromotion as well as the target sales.
2. **store_nbr** identifies the store at which the products are sold. **family** identifies the type of product sold.
3. **sales** gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).
**onpromotion** gives the total number of items in a product family that were being promoted at a store at a given date.

**test.csv**
The test data, having the same features as the training data. You will predict the target sales for the dates in this file.
The dates in the test data are for the 15 days after the last date in the training data.

**stores.csv**

Store metadata, including city, state, type, and cluster.
cluster is a grouping of similar stores.

**oil.csv**
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)

**holidays_events.csv**

Holidays and Events, with metadata

To predict the sales of a store we need:


*   Previous sales records
*   Holidays, since sales tend to increase during holidays
*   Oil prices, since Ecuador is an oil-dependent economy
*   Dates, for time series analysis
*   Geographical location





In [9]:
# Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Install the Kaggle command-line client used further down to download the competition data
!pip install kaggle



In [None]:
# Colab-only step: ask the user to upload their Kaggle API token from the local machine
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file


In [None]:
# Create ~/.kaggle (no error if it already exists) and move the uploaded token into it
!mkdir -p ~/.kaggle
!mv 'kaggle.json' ~/.kaggle/kaggle.json

In [None]:
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI
!kaggle competitions download -c store-sales-time-series-forecasting


In [None]:
# Extract the archive; the CSV files read below are expected to come from it
!unzip store-sales-time-series-forecasting.zip


In [None]:
store=pd.read_csv('stores.csv')
store.head()

In [None]:
transaction=pd.read_csv('transactions.csv')
transaction.head()

In [None]:
holiday=pd.read_csv('holidays_events.csv')
holiday.head()

In [None]:
oil=pd.read_csv('oil.csv')
oil.head()


In [None]:
train=pd.read_csv('train.csv')
train.head()

In [None]:
test=pd.read_csv('test.csv')
test.head()

In [None]:
def stats(df):
    """Print a quick diagnostic overview of a DataFrame.

    Writes, in order: the ``df.info()`` report, the shape, the
    ``df.describe()`` summary, the per-column null counts and the number of
    duplicated rows, with a line of asterisks between sections.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    separator = "************"

    # DataFrame.info() prints its report itself and returns None, so it must
    # be called on its own; passing it to print() appended a stray "None".
    print("Information of dataset:")
    df.info()
    print(separator)
    print("Shape of dataset:", df.shape)
    print(separator)
    print("Statstical Summary:", df.describe())
    print(separator)
    print("Null Values:", df.isnull().sum())
    print(separator)
    print("Duplicate Values", df.duplicated().sum())

In [None]:
# Overview of the training table
stats(train)

In [None]:
# Overview of the store metadata table
stats(store)

In [None]:
# Overview of the transactions table
stats(transaction)

In [None]:
# Overview of the holidays/events table
stats(holiday)

In [None]:
# Overview of the oil price table
stats(oil)

In [None]:
# dcoilwtico has missing values

In [None]:
# Fraction of null values in the oil price column of the oil dataframe
oil_price_missing = oil['dcoilwtico'].isna()
oil_price_missing.sum() / len(oil_price_missing)

In [None]:
# Missing value imputation: back-fill first (each gap takes the next
# available price), then forward-fill anything still missing at the end.
# Series.bfill()/ffill() are used because fillna(method=...) is deprecated
# in pandas 2.x; the filled values are the same.
oil['dcoilwtico'] = oil['dcoilwtico'].bfill().ffill()

In [None]:
# Re-check the fraction of null oil prices after imputation
oil_price_missing = oil['dcoilwtico'].isna()
oil_price_missing.sum() / len(oil_price_missing)

In [None]:
# Remove the 'id' column from the training table
train = train.drop(columns='id')

In [None]:
# Parse the 'date' column of every table that has one into a datetime dtype
for frame in (train, test, transaction, oil, holiday):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Checking memory usage of this feature!
train['store_nbr'].info()

In [None]:
# The feature is taking alot of memory so we will convert it to int8
train['store_nbr']=train['store_nbr'].astype('int8')
test['store_nbr']=test['store_nbr'].astype('int8')

In [None]:
train['store_nbr'].info()

In [None]:
train['sales']=train['sales'].astype('float32')

In [None]:
train['onpromotion'].info()

In [None]:
train['onpromotion']=train['onpromotion'].astype('int8')
test['onpromotion']=test['onpromotion'].astype('int8')

In [None]:
# Keep the store number dtype in the transactions table in line with train/test
transaction = transaction.astype({'store_nbr': 'int8'})

In [None]:
# Line chart of the transaction counts against the date
fig = px.line(
    transaction,
    x='date',
    y='transactions',
    title='Transactions Over Time',
)
axis_titles = {'xaxis_title': 'Date', 'yaxis_title': 'Transactions'}
fig.update_layout(**axis_titles)

fig.show()

In [None]:
# We can see transactions peak around the end of December and in mid-May, which means sales are higher during holiday periods, i.e. Christmas and the Battle of Pichincha day.

In [None]:
holidays_based_transactions = pd.merge(transaction, holiday, on='date',how='right')
holidays_based_transactions.shape

In [None]:
holidays_based_transactions.head()

In [None]:
fig = px.line(holidays_based_transactions, x='date', y='transactions', title='Transactions Over Time')
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Transactions'
)

fig.show()

**Monthly Average Transactions**

In [None]:
# Mean transactions per month-end bucket, drawn as one line per calendar year
a = (
    transaction.set_index("date")
    .resample("M")["transactions"]
    .mean()
    .reset_index()
)
a["year"] = a["date"].dt.year
px.line(a, x='date', y='transactions', color='year', title="Monthly Average Transactions")

In [None]:
data=pd.merge(train,store,on='store_nbr',how='left')
data.head()

In [None]:
data=pd.merge(data,transaction,on=['date','store_nbr'],how='left')
data.head()

In [None]:
data=pd.merge(data,oil,on='date',how='left')
data.head()

In [None]:
# The holiday table may list several events on the same date. Joining it on
# 'date' as-is would emit one copy of every sales row per event and inflate
# every total computed afterwards, so keep a single event row per date.
# validate='many_to_one' makes pandas raise if the right-hand dates are not unique.
holiday_per_date = holiday.drop_duplicates(subset='date', keep='first')
data = pd.merge(data, holiday_per_date, on='date', how='left', validate='many_to_one')
data.head()

In [None]:
final_data=data.copy()

In [None]:
final_data.head()

# **Exploratory Data Analysis**

In [None]:
# Bar per state, height aggregated by seaborn's default estimator
sns.barplot(data=final_data, x='state', y='sales')
plt.xticks(rotation=90)  # state names overlap when drawn horizontally
plt.title('Sales by state in Ecuador')
plt.show()

**Highest** sales are recorded in *Pichincha* state of Ecuador and lowest sales are of *Pastaza* state.

In [None]:
# 'type_y' is the suffixed `type` column contributed by the holiday table in the merge
plt.figure(figsize=(10, 6))
sns.barplot(data=final_data, x='type_y', y='sales')
plt.show()

**Highest sales** are observed on *Additional* type of Holidays and second highest is on the days when holidays were *transferred*.

In [None]:
# Bar per product family, height aggregated by seaborn's default estimator
sns.barplot(data=final_data, x='family', y='sales')
plt.xticks(rotation=90)  # family names overlap when drawn horizontally
plt.show()

**Top** **3** sales are Grocery 1, Beverages and Produce.

In [None]:
# Mean of the daily oil price (dcoilwtico) per month-end bucket, one line per year
o = oil.set_index("date").resample("M").dcoilwtico.mean().reset_index()
o["year"] = o.date.dt.year
# The plotted series is the oil price, not a production volume, so the title says "price"
px.line(o, x='date', y='dcoilwtico', color='year', title="Monthly Average Oil Price")

**Oil** prices have been continuously **decreasing** since June 30th, 2014.

It is due to protests conducted by environmentalists, and in 2024 voters voted against oil drilling in the Amazon forest.

In [None]:
# Which store has the highest sales? Sum sales per store, largest first
store_totals = final_data.groupby('store_nbr')['sales'].sum()
sal = store_totals.sort_values(ascending=False)
plt.figure(figsize=(12, 8))
# `order` keeps the bars in the sorted (descending) store order
sns.barplot(x=sal.index, y=sal.values, order=sal.index, palette='viridis')
plt.title('Total Sales by Store')
plt.show()

Stores 44, 45, and 47 record the highest total sales.

In [None]:
# Total sales per (store, family) pair, then keep the pairs that never sold anything
zero = (
    final_data.groupby(["store_nbr", "family"])["sales"]
    .sum()
    .reset_index()
    .sort_values(["family", "store_nbr"])
)
zero = zero[zero["sales"] == 0]
zero

Since these stores record no sales at all in some product families, it is recommended to transform the sales data.

In [None]:
final_data['log_sales'] = np.log1p(final_data['sales'])

In [None]:
final_data['day_of_week'] = final_data['date'].dt.dayofweek

In [None]:
test['day_of_week'] = pd.to_datetime(test['date']).dt.dayofweek

In [None]:
features = ['store_nbr', 'day_of_week', 'cluster', 'dcoilwtico','onpromotion']

In [None]:
X_train = pd.get_dummies(final_data[features])

In [None]:
y_train=final_data['log_sales']

In [None]:
X_train = X_train.dropna()
y_train = y_train[X_train.index]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the prepared training rows for validation.
# The earlier `X_test = pd.get_dummies(test[features])` line was removed:
# `test` was never merged with the store and oil tables, so it has no
# 'cluster' / 'dcoilwtico' columns and the selection raises KeyError; its
# result was also overwritten by this split before being used.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_predict))

In [None]:
print(rmse)

In [None]:
# LSTM

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
# Use the date as the row index for the time-series part of the notebook
final_data = final_data.set_index('date')

In [None]:
# To predict sales
# final_data holds one row per (date, store, family), so the raw 'sales'
# column is not a time series: consecutive rows mostly share the same date,
# and a 7-row window would not be "the past 7 days". Collapse it to one total
# per calendar day first. Duplicate (date, store, family) rows, which the
# date-only holiday join can introduce, are dropped so they are not counted
# twice. groupby sorts by date, giving a chronologically ordered series.
daily_sales = (
    final_data.reset_index()
    .drop_duplicates(subset=['date', 'store_nbr', 'family'])
    .groupby('date')['sales']
    .sum()
)
sales = daily_sales.values.reshape(-1, 1)

In [None]:
# Fit a min-max scaler on the sales values, then map them into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1)).fit(sales)
scaled_sales = scaler.transform(sales)

In [None]:
def create_sequences(data, seq_length):
    """Slice a series into overlapping input windows and next-step targets.

    For every start position ``i`` with ``i + seq_length < len(data)``, the
    window ``data[i:i + seq_length]`` becomes an input and
    ``data[i + seq_length]`` becomes the value to predict.

    Args:
        data: Ordered, sliceable sequence of observations.
        seq_length: Number of consecutive observations in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the windows and targets.
    """
    n_windows = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Window size: every sample is built from the 7 preceding observations
seq_length = 7
# Turn the scaled series into (window, next value) pairs
X, y = create_sequences(data=scaled_sales, seq_length=seq_length)

In [None]:
# Ordered (unshuffled) split: the first 80% of the sequences train, the rest test
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size

X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

In [None]:
# Give the inputs the [samples, time steps, features] layout passed to the LSTM layer
X_train = X_train.reshape((len(X_train), seq_length, 1))
X_test = X_test.reshape((len(X_test), seq_length, 1))

# One LSTM layer followed by a single-unit dense output
layers = [
    LSTM(units=50, activation='relu', input_shape=(seq_length, 1)),
    Dense(units=1),
]
model = Sequential(layers)

# Adam optimizer, mean squared error loss
model.compile(optimizer='adam', loss='mean_squared_error')

# Fit for 5 epochs in mini-batches of 32
model.fit(X_train, y_train, epochs=5, batch_size=32)


In [None]:
# Loss (the compiled MSE) on the training and the held-out sequences, without progress output
train_loss = model.evaluate(x=X_train, y=y_train, verbose=0)
test_loss = model.evaluate(x=X_test, y=y_test, verbose=0)

print(f'Train Loss: {train_loss:.4f}')
print(f'Test Loss: {test_loss:.4f}')


In [None]:
# Predict on the held-out sequences and undo the min-max scaling in one step,
# so predicted_sales is expressed in the original sales units
predicted_sales = scaler.inverse_transform(model.predict(X_test))


In [None]:
# Compare the last len(y_test) unscaled sales values with the model's predictions
actual_sales = sales[-len(y_test):]

plt.figure(figsize=(14, 7))
plt.plot(actual_sales, label='Actual Sales')
plt.plot(predicted_sales, label='Predicted Sales')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.title('Sales Forecasting with LSTM')
plt.legend()
plt.show()
