# Case Study on ANOVA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [2]:
# Loading the CSV file into a pandas dataframe.
data = pd.read_csv('Sales_data_zone_wise.csv')

In [3]:
data

Unnamed: 0,Month,Zone - A,Zone - B,Zone - C,Zone - D
0,Month - 1,1483525,1748451,1523308,2267260
1,Month - 2,1238428,1707421,2212113,1994341
2,Month - 3,1860771,2091194,1282374,1241600
3,Month - 4,1871571,1759617,2290580,2252681
4,Month - 5,1244922,1606010,1818334,1326062
5,Month - 6,1534390,1573128,1751825,2292044
6,Month - 7,1820196,1992031,1786826,1688055
7,Month - 8,1625696,1665534,2161754,2363315
8,Month - 9,1652644,1873402,1755290,1422059
9,Month - 10,1852450,1913059,1754314,1608387


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Month     29 non-null     object
 1   Zone - A  29 non-null     int64 
 2   Zone - B  29 non-null     int64 
 3   Zone - C  29 non-null     int64 
 4   Zone - D  29 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 1.3+ KB


In [5]:
data.columns

Index(['Month', 'Zone - A', 'Zone - B', 'Zone - C', 'Zone - D'], dtype='object')

In [6]:
# There are 4 independent variables(Zone - A, Zone - B, Zone - C, Zone - D) and 1 dependent variable(Month)

# ● The mean sales generated by each zone

In [29]:
data.mean().T.round()

Zone - A       1540493.0
Zone - B       1755560.0
Zone - C       1772871.0
Zone - D       1842927.0
Total Sales    6911851.0
dtype: float64

Zone D has the highest mean value 1842927.
Zone A has the lowest mean value 1540493

In [28]:
details = data.describe().T.round()
details

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Zone - A,29.0,1540493.0,261940.0,1128185.0,1305972.0,1534390.0,1820196.0,2004480.0
Zone - B,29.0,1755560.0,168390.0,1527574.0,1606010.0,1740365.0,1875658.0,2091194.0
Zone - C,29.0,1772871.0,333194.0,1237722.0,1523308.0,1767047.0,2098463.0,2290580.0
Zone - D,29.0,1842927.0,375016.0,1234311.0,1520406.0,1854412.0,2180416.0,2364132.0
Total Sales,29.0,6911851.0,590892.0,5925424.0,6506659.0,7032783.0,7155515.0,8174449.0


# ● Total sales generated by all the zones for each month.


In [9]:
data['Total Sales'] = data[['Zone - A', 'Zone - B', 'Zone - C', 'Zone - D']].sum(axis=1)

In [12]:
Total_sales = data[['Month','Total Sales']]

In [13]:
Total_sales

Unnamed: 0,Month,Total Sales
0,Month - 1,7022544
1,Month - 2,7152303
2,Month - 3,6475939
3,Month - 4,8174449
4,Month - 5,5995328
5,Month - 6,7151387
6,Month - 7,7287108
7,Month - 8,7816299
8,Month - 9,6703395
9,Month - 10,7128210


In [14]:
Total_sales.min()

Month          Month - 1
Total Sales      5925424
dtype: object

Sales low at Month 1

In [15]:
Total_sales.max()

Month          Month - 9
Total Sales      8174449
dtype: object

Sales high at Month 9

# ● Check whether all the zones generate the same amount of sales.

We are having 4 independent variables and to compare the mean of them-consider one-way ANOVA. 
Hypothesis are:

# H0-All the zones generate the same amount of sales

# Ha -All the zones generate different amount of sales

In [29]:
f, p = stats.f_oneway(data['Zone - A'],data['Zone - B'],data['Zone - C'],data['Zone - D'])

print("F-value:",f)
print("p-value:",p)

F-value: 5.672056106843581
p-value: 0.0011827601694503335


In [30]:
if p<0.05:
    print('Reject H0:All the zones generate the same amount of sales')
else:
    print('Accept H0:All the zones generate the same amount of sales')

Reject H0:All the zones generate the same amount of sales


 The value of p is less than 0.05 then we can reject null hypothesis.So all the zones generate different amount of sales