In [33]:
import pandas as pd
import numpy as np

In [34]:
df = pd.read_csv("Walmart_Sales.csv")

In [35]:
df.shape

(6435, 8)

In [36]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


# Key Insights from This EDA:

Weekly_Sales is right-skewed (some weeks/stores have very high sales).

Only ~7% of weeks are holidays, but sales during holidays may spike.

Some stores consistently generate higher total sales than others.

Economic and weather factors (Temperature, Fuel_Price, CPI, Unemployment) can be correlated with sales.

In [38]:
df.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313


In [39]:
df.isnull().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

No missing data observed.

In [40]:
df["Weekly_Sales"].sum()

np.float64(6737218987.11)

In [41]:
df.value_counts()

Store  Date        Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  CPI         Unemployment
1      01-04-2011  1495064.75    0             59.17        3.524       214.837166  7.682           1
30     30-09-2011  387001.13     0             78.91        3.355       216.362033  7.852           1
31     02-07-2010  1311704.92    0             82.29        2.669       210.880373  8.099           1
       02-04-2010  1357600.68    0             64.12        2.719       210.479887  8.200           1
       02-03-2012  1427881.22    0             59.30        3.630       220.486689  7.057           1
                                                                                                   ..
15     30-12-2011  603460.79     1             31.44        3.566       136.643258  7.866           1
       30-09-2011  521297.31     0             64.87        3.858       136.419500  7.806           1
       30-07-2010  619224.06     0             72.04        2.932       132.598387  8.0

In [42]:
holiday_sales = df[df["Holiday_Flag"] == 1]['Weekly_Sales'].values
holiday_sales

array([1641957.44, 1507460.69, 1955624.11, 1367320.01, 1649614.93,
       1540471.24, 2033320.66, 1497462.72, 1802477.43, 1661767.33,
       2137809.5 , 1839128.83, 2658725.29, 1750434.55, 2168041.61,
       1748000.65, 2614202.3 , 1874226.52, 2103322.68, 1898777.07,
        420728.96,  352260.97,  565567.84,  382677.76,  430526.21,
        377347.49,  556925.19,  410553.88,  473292.47,  408229.73,
       2188307.39, 1865820.81, 2789469.45, 1794868.74, 2187847.29,
       2093139.01, 3004702.33, 2007105.86, 2374660.64, 2125104.72,
        311825.7 ,  306533.08,  488362.61,  298180.18,  311590.54,
        321110.22,  507900.07,  349624.88,  349239.88,  350648.91,
       1606283.86, 1424225.44, 2267452.4 , 1464050.02, 1486920.17,
       1483574.38, 2249811.55, 1598080.52, 1620603.92, 1608077.01,
        524104.92,  535769.32,  835189.26,  729572.08,  559903.13,
        613135.23,  949075.87,  815915.52,  563460.77,  597876.55,
        994801.4 ,  831425.2 , 1261693.16,  773586.49,  996147

In [43]:
non_holiday_sales = df[df['Holiday_Flag'] == 0]['Weekly_Sales'].values
non_holiday_sales

array([1643690.9 , 1611968.17, 1409727.59, ...,  734464.36,  718125.53,
        760281.43], shape=(5985,))

In [44]:
np.mean(holiday_sales)

np.float64(1122887.8923555557)

In [47]:
 np.mean(non_holiday_sales)*100

np.float64(104125638.02088556)

In [48]:
store_sales = df.groupby('Store')['Weekly_Sales'].sum()

In [49]:
store_sales

Store
1     2.224028e+08
2     2.753824e+08
3     5.758674e+07
4     2.995440e+08
5     4.547569e+07
6     2.237561e+08
7     8.159828e+07
8     1.299512e+08
9     7.778922e+07
10    2.716177e+08
11    1.939628e+08
12    1.442872e+08
13    2.865177e+08
14    2.889999e+08
15    8.913368e+07
16    7.425243e+07
17    1.277821e+08
18    1.551147e+08
19    2.066349e+08
20    3.013978e+08
21    1.081179e+08
22    1.470756e+08
23    1.987506e+08
24    1.940160e+08
25    1.010612e+08
26    1.434164e+08
27    2.538559e+08
28    1.892637e+08
29    7.714155e+07
30    6.271689e+07
31    1.996139e+08
32    1.668192e+08
33    3.716022e+07
34    1.382498e+08
35    1.315207e+08
36    5.341221e+07
37    7.420274e+07
38    5.515963e+07
39    2.074455e+08
40    1.378703e+08
41    1.813419e+08
42    7.956575e+07
43    9.056544e+07
44    4.329309e+07
45    1.123953e+08
Name: Weekly_Sales, dtype: float64

In [50]:
store_sales.sort_values(ascending=False).head()

Store
20    3.013978e+08
4     2.995440e+08
14    2.889999e+08
13    2.865177e+08
2     2.753824e+08
Name: Weekly_Sales, dtype: float64

In [51]:
store_sales.sort_values().head()

Store
33    37160221.96
44    43293087.84
5     45475688.90
36    53412214.97
38    55159626.42
Name: Weekly_Sales, dtype: float64

In [52]:
numeric_cols = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
correlation_matrix = df[numeric_cols].corr()

In [53]:
numeric_cols

['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

In [54]:
correlation_matrix

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment
Weekly_Sales,1.0,-0.06381,0.009464,-0.072634,-0.106176
Temperature,-0.06381,1.0,0.144982,0.176888,0.101158
Fuel_Price,0.009464,0.144982,1.0,-0.170642,-0.034684
CPI,-0.072634,0.176888,-0.170642,1.0,-0.30202
Unemployment,-0.106176,0.101158,-0.034684,-0.30202,1.0


In [56]:
pd.crosstab(
    [df.Temperature,df.Weekly_Sales],
    [df.CPI,df.Fuel_Price],
    margins = True
)*100

Unnamed: 0_level_0,CPI,126.064,126.064,126.064,126.064,126.076645,126.076645,126.076645,126.085452,126.085452,126.085452,...,226.966232,226.968844,226.973545,226.987364,227.018417,227.036936,227.169392,227.214288,227.232807,All
Unnamed: 0_level_1,Fuel_Price,2.698,2.837,3.049,3.159,2.671,2.85,3.041,2.64,2.802,3.123,...,3.617,3.594,3.601,3.506,3.594,3.506,3.601,3.594,3.506,Unnamed: 22_level_1
Temperature,Weekly_Sales,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
-2.06,558027.77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
5.54,817485.14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
6.23,1083071.14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
7.46,593875.46,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
9.51,775910.43,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.22,1205884.98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
99.66,237095.82,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
100.07,297753.49,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
100.14,280937.84,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
