Flow of Analysis :
1. Import the required libraries
2. Read and understand the data
3. Exploratory Data Analysis
4. Data Preparation
5. Time Series Decomposition
6. Build and Evaluate Time Series Forecast



In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math
from scipy.stats import variation

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning messages raised by the plotting and modelling calls below.
warnings.filterwarnings('ignore')
# Use seaborn's "darkgrid" style for all subsequent plots.
sns.set_style("darkgrid")

In [None]:
import sklearn
from sklearn.metrics import mean_squared_error

Read and understand the data

In [None]:
# Read and understand the data
# Load the raw CSV into `data`, the frame every later cell works on.
# NOTE(review): the path is an absolute Colab location (/content/...); the notebook only runs where that file exists.
data = pd.read_csv('/content/GlobalSuperstoreData.csv')
# Preview the first five rows.
data.head()

In [None]:
# Check the shape of the data: (number of rows, number of columns)
data.shape

In [None]:
# Check the overall info: column names, non-null counts and dtypes
data.info()

In [None]:
# Check the numerical data stats (count, mean, std, min, quartiles, max for each numeric column)
data.describe()

Exploratory Data Analysis

In [None]:
# Missing values: number of null entries in each column
data.isnull().sum()

In [None]:
# Outliers: extend the summary statistics with the upper-tail percentiles
# (90th, 95th, 99th) so extreme values stand out against the quartiles.
data.describe(percentiles=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# Box plot of Profit to eyeball outliers.
# The column is passed as `x=` explicitly: a bare positional argument is interpreted
# as `x` by seaborn 0.11 but as `data` by seaborn >= 0.12, which changes the plot's
# orientation between versions. The keyword form draws the same horizontal box on both.
plt.figure(figsize=(12,6))
sns.boxplot(x=data['Profit'])
plt.show()

In [None]:
# Box plot of Sales to eyeball outliers.
# The column is passed as `x=` explicitly: a bare positional argument is interpreted
# as `x` by seaborn 0.11 but as `data` by seaborn >= 0.12, which changes the plot's
# orientation between versions. The keyword form draws the same horizontal box on both.
plt.figure(figsize=(12,6))
sns.boxplot(x=data['Sales'])
plt.show()

Univariate Analysis

In [None]:
# Re-list the column names and dtypes as a reference for the univariate plots that follow.
data.info()

In [None]:
# Segment column: relative frequency of each Segment value across all rows.
# value_counts(normalize=True) yields fractions in [0, 1], plotted as one bar per segment.
plt.figure(figsize=(8, 4))
data["Segment"].value_counts(normalize=True).plot(kind="bar")
plt.title("Bar chart analysing the 3 product categories\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Segment", fontsize=12, fontweight=5, color='Black')
plt.ylabel("Percentage", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Inference (from the Segment chart in the previous cell): Consumer category forms
# the most in demand product category worldwide for Global Mart.
#
# Market column: relative frequency of each Market value across all rows.
plt.figure(figsize=(8, 4))
data["Market"].value_counts(normalize=True).plot(kind="bar")
plt.title("Bar chart analysing 7 geographical market segments\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Market", fontsize=12, fontweight=5, color='Black')
plt.ylabel("Percentage", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Bivariate Analysis
# Profit vs Sales: one point per row, Sales on the x-axis and Profit on the y-axis.
# x and y are passed by keyword because seaborn >= 0.12 accepts only `data`
# positionally; the two-positional-argument form raises TypeError there.
plt.figure(figsize= [8,4])
sns.scatterplot(x=data['Sales'], y=data['Profit'])
plt.title("Scatter plot analysing Profit v/s Sales\n", fontdict={'fontsize': 20, 'fontweight' : 5, 'color' : 'Green'})
plt.xlabel("Sales", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Black'})
plt.ylabel("Profit", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Black'} )
plt.show()

In [None]:
# Sales w.r.t. Product Category: mean Sales per Segment value, one bar per segment.
plt.figure(figsize=(8, 4))
data.groupby("Segment")["Sales"].mean().plot(kind="bar")
plt.title("Plot analysing Sales w.r.t. Product category\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Product Category", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Profit w.r.t. Product Category: mean Profit per Segment value, one bar per segment.
plt.figure(figsize=(8, 4))
data.groupby("Segment")["Profit"].mean().plot(kind="bar")
plt.title("Plot analysing Profit w.r.t. Product category\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Product Category", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Sales w.r.t. Market Segment: mean Sales per Market value, one bar per market.
plt.figure(figsize=(8, 4))
data.groupby("Market")["Sales"].mean().plot(kind="bar")
plt.title("Plot analysing Sales w.r.t. Market Segment\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Market Segment", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Multivariate Analysis
# Grouped bar chart: Sales on the y-axis for every Market, with one bar per Segment (hue).
plt.figure(figsize=(15, 8))
sns.barplot(x='Market', y='Sales', hue='Segment', data=data)
plt.title('Bar chart analysing Sales for different Markets-Segments\n', fontsize=16, fontweight=5, color='Green')
plt.legend(loc='best')
plt.show()

Data Preparation

In [None]:
# Re-check the columns and dtypes before reshaping the frame for modelling.
data.info()

In [None]:
# Build the combined Market_Segment key as "<Market>-<Segment>" (e.g. "APAC-Consumer"),
# joining the two text columns element-wise with a hyphen.
data['Market_Segment'] = data['Market'].str.cat(data['Segment'], sep='-')

In [None]:
data.head() # Check that the new Market_Segment column was created correctly

In [None]:
# Market_Segment column: relative frequency of each Market-Segment combination across all rows.
plt.figure(figsize=(15, 8))
data["Market_Segment"].value_counts(normalize=True).plot(kind="bar")
plt.title("Bar chart analysing the 21 Market Segments\n", fontsize=20, fontweight=5, color='Green')
plt.xlabel("Market_Segment", fontsize=12, fontweight=5, color='Black')
plt.ylabel("Percentage", fontsize=12, fontweight=5, color='Black')
plt.show()

In [None]:
# Market and Segment are now combined in Market_Segment, so the two source columns are removed.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for the already-removed columns.
data = data.drop(columns=['Segment', 'Market'], errors='ignore')

In [None]:
data.head() # After the drop: Order Date, Sales and Profit should remain alongside Market_Segment — verify against the output

In [None]:
# Inspect the raw Order Date values before converting the column to a datetime-based type.
data['Order Date'].head()

In [None]:
# Convert Order Date to a monthly Period so that orders can be aggregated per month.
# 'M' is the documented month alias for periods; the lowercase 'm' relied on pandas
# normalising alias case, which is not guaranteed across versions.
# NOTE(review): the raw date format is not visible here — if the strings are day-first
# (dd-mm-yyyy), pass dayfirst=True or an explicit format= to pd.to_datetime; confirm
# against the Order Date preview.
data['Order Date'] = pd.to_datetime(data['Order Date']).dt.to_period('M')

In [None]:
# Confirm that Order Date now displays as year-month periods.
data.head()

In [None]:
# Monthly time series per market segment: every remaining column is summed
# within each (Market_Segment, Order Date) pair.
data.groupby(by=['Market_Segment', 'Order Date']).agg('sum')

In [None]:
# Pivot to a wide table: one row per Order Date (month), one column per Market_Segment,
# each cell holding the total Profit of that segment in that month.
data_cov = data.pivot_table(
    values="Profit",
    index="Order Date",
    columns="Market_Segment",
    aggfunc="sum",
)
data_cov

In [None]:
# Train-Test Split: the first 42 rows (months) of the pivot table form the train set
# and all remaining rows form the test set. The split is positional, so it relies on
# data_cov being ordered by Order Date.
train_len = 42
train = data_cov.iloc[:train_len]
test = data_cov.iloc[train_len:]

In [None]:
# (rows, columns) of the train set; 42 rows are expected from train_len.
train.shape

In [None]:
# (rows, columns) of the test set, i.e. whatever months remain after the first 42.
test.shape

In [None]:
# Calculate mean for the train set: one mean monthly Profit per market segment (column).
# DataFrame.mean(axis=0) is used instead of np.mean(train): NumPy forwards axis=None,
# which pandas >= 2.0 interprets as "reduce over both axes" and returns a single scalar,
# silently destroying the per-segment comparison.
train_mean = train.mean(axis=0)
train_mean

In [None]:
# Calculate standard deviation for the train set: one value per market segment (column).
# axis=0 is explicit because np.std(train) forwards axis=None, whose per-column behaviour
# is deprecated in pandas and slated to become a scalar reduction.
# ddof=0 keeps the population standard deviation that np.std produced
# (DataFrame.std defaults to the sample version, ddof=1).
train_std = train.std(axis=0, ddof=0)
train_std

The coefficient of variation (CoV) is the ratio of the standard deviation to the mean. We need to find the market segment whose profit has the lowest CoV: a smaller standard deviation relative to the mean indicates less variation in profit, i.e. more consistent profit figures for that market segment over the given period. We therefore calculate the CoV for each of the 21 market segments over the 42 months of train data to decide which market segment is the most consistently profitable.

In [None]:
# Coefficient of variation for the train set: standard deviation divided by mean.
train_CoV = train_std.div(train_mean)
train_CoV

In [None]:
# Summary table with the mean, standard deviation and coefficient of variation
# side by side in the columns Mean, Std_Dev and Coeff_of_Var.
CoV = pd.DataFrame(dict(Mean=train_mean, Std_Dev=train_std, Coeff_of_Var=train_CoV))
CoV

In [None]:
# Rank the market segments by coefficient of variation, smallest (most consistent) first.
CoV.sort_values(by='Coeff_of_Var', ascending=True)

In [None]:
# Keep only the rows of the main data frame that belong to the APAC-Consumer market segment.
data_apac_con = data.loc[data['Market_Segment'].eq('APAC-Consumer')]
data_apac_con

In [None]:
# Total Sales per Order Date (month) for the APAC-Consumer rows; the result is a
# Series indexed by Order Date.
data1 = data_apac_con.groupby('Order Date')['Sales'].sum()
data1

Time Series Decomposition

In [None]:
# Promote the monthly Sales Series to a one-column DataFrame and convert its index
# from monthly Periods to Timestamps.
# The isinstance guard makes the cell safe to re-run: after the first execution the
# index is already a DatetimeIndex, which has no to_timestamp() method.
data1 = pd.DataFrame(data1)
if isinstance(data1.index, pd.PeriodIndex):
    data1.index = data1.index.to_timestamp()
# Display the index dtype to confirm the conversion.
data1.index.dtype

In [None]:
# Line plot of the monthly APAC-Consumer Sales series over time.
plt.figure(figsize=(18, 4))
plt.plot(data1, label='Sales')
plt.legend(loc='best')
plt.title('Sales for APAC-Consumer Segment\n', fontsize=16, fontweight=5, color='Green')
# Vertical, bold tick labels so the dates on the x-axis do not overlap.
plt.xticks(rotation=90, fontweight="bold")
plt.show()

In [None]:
# Additive seasonal decomposition: split Sales into trend + seasonal + residual components.
from pylab import rcParams
import statsmodels.api as sm

# Default figure size for this and all later figures.
rcParams['figure.figsize'] = (12, 8)
decomposition = sm.tsa.seasonal_decompose(data1['Sales'], model='additive')  # additive seasonal index
fig = decomposition.plot()
plt.show()

In [None]:
# Multiplicative seasonal decomposition: Sales modelled as trend * seasonal * residual.
# Rebinds `decomposition`, replacing the additive result from the previous cell.
decomposition = sm.tsa.seasonal_decompose(data1['Sales'], model='multiplicative')  # multiplicative seasonal index
fig = decomposition.plot()
plt.show()

We build several time series forecasting models and compare their RMSE (Root Mean Squared Error) and MAPE (Mean Absolute Percentage Error) values. Lower RMSE and MAPE values indicate a better-performing model. Accuracy is calculated as (100 - MAPE), so the lower the MAPE, the higher the accuracy.

In [None]:
# Positional split of the Sales series: the first 42 rows become the train set and the
# remaining rows the test set. This rebinds `train` and `test`, which previously held
# the Profit pivot-table split.
train_len = 42
train = data1.iloc[:train_len]
test = data1.iloc[train_len:]

In [None]:
# First five rows of the Sales train set.
train.head()

In [None]:
# Display the full Sales test set (the rows after the first train_len).
test

In [None]:
# Augmented Dickey-Fuller test on the raw Sales series.
# adfuller returns a tuple: [0] test statistic, [1] p-value, [4] dict of critical values.
from statsmodels.tsa.stattools import adfuller

adf_test = adfuller(data1['Sales'])

print(
    'ADF Statistic: %f' % adf_test[0],
    'Critical Values @ 0.05: %.2f' % adf_test[4]['5%'],
    'p-value: %f' % adf_test[1],
    sep='\n',
)

In [None]:
# Reading of the ADF result from the previous cell: p-value is less than 0.05.
# This means that the series is stationary.
#
# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test on the same series.
# Note that KPSS reverses the hypotheses: its null hypothesis is stationarity.
# kpss returns a tuple: [0] test statistic, [1] p-value, [3] dict of critical values.
from statsmodels.tsa.stattools import kpss

kpss_test = kpss(data1['Sales'])

print(
    'KPSS Statistic: %f' % kpss_test[0],
    'Critical Values @ 0.05: %.2f' % kpss_test[3]['5%'],
    'p-value: %f' % kpss_test[1],
    sep='\n',
)

In [None]:
# Box Cox transformation to make variance constant.
# lmbda=0 selects the natural-log case of the transform; the result is re-wrapped
# in a Series so it keeps the original date index.
from scipy.stats import boxcox

data_boxcox = pd.Series(data=boxcox(data1['Sales'], lmbda=0), index=data1.index)

plt.figure(figsize=(12, 4))
plt.plot(data_boxcox, label='After Box Cox tranformation')
plt.legend(loc='best')
plt.title('After Box Cox transform')
plt.show()

In [None]:
# Differencing to remove trend: each value minus the previous one.
# The first element has no predecessor and is therefore NaN.
data_boxcox_diff = pd.Series(data_boxcox.diff(periods=1), index=data1.index)

plt.figure(figsize=(12, 4))
plt.plot(data_boxcox_diff, label='After Box Cox tranformation and differencing')
plt.legend(loc='best')
plt.title('After Box Cox transform and differencing')
plt.show()

In [None]:
# Differencing leaves a NaN in the first position (no previous value to subtract); remove it.
data_boxcox_diff = data_boxcox_diff.dropna()

In [None]:
# Last five values of the differenced Box-Cox series.
data_boxcox_diff.tail()

In [None]:
# Repeat the Augmented Dickey-Fuller test on the Box-Cox transformed, differenced series.
# Rebinds `adf_test`; indices are [0] statistic, [1] p-value, [4] critical values.
adf_test = adfuller(data_boxcox_diff)

print(
    'ADF Statistic: %f' % adf_test[0],
    'Critical Values @ 0.05: %.2f' % adf_test[4]['5%'],
    'p-value: %f' % adf_test[1],
    sep='\n',
)

In [None]:
# Split the Box-Cox series and its first difference at the same point as the raw Sales split.
train_data_boxcox = data_boxcox[:train_len]
test_data_boxcox = data_boxcox[train_len:]
# The differenced series is one observation shorter (its leading NaN was dropped),
# so its split position is shifted by one.
train_data_boxcox_diff = data_boxcox_diff[:train_len-1]
test_data_boxcox_diff = data_boxcox_diff[train_len-1:]
# SARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit on the undifferenced Box-Cox training series: order=(1, 1, 1) and
# seasonal_order=(1, 1, 1, 6) already include one regular and one seasonal difference.
# NOTE(review): the seasonal period is 6 although Order Date was aggregated monthly —
# confirm that 6 (rather than 12) is the intended seasonality.
model = SARIMAX(train_data_boxcox, order=(1, 1, 1), seasonal_order=(1, 1, 1, 6)) 
model_fit = model.fit()
# Print the fitted parameter estimates.
print(model_fit.params)