### Import relevant libraries

In [8]:
import numpy as np
import pandas as pd

### Load dataset

In [9]:
# Read the raw Walmart weekly-sales CSV (path is relative to the notebook's working directory).
walmart_data = pd.read_csv(filepath_or_buffer="data/Walmart.csv")

### Inspect first 2 rows and data types of the dataset

In [10]:
# Show the first two records, transposed so that each column reads as a row.
walmart_data.iloc[:2].transpose()

Unnamed: 0,0,1
Store,1,1
Date,5/2/2010,12/2/2010
Weekly_Sales,1643690.9,1641957.44
Holiday_Flag,0,1
Temperature,42.31,38.51
Fuel_Price,2.572,2.548
CPI,211.096358,211.24217
Unemployment,8.106,8.106


In [11]:
# Per-column dtypes as inferred by read_csv (no dtype/parse_dates arguments were given).
# Note that Date is reported as object, i.e. it has not been parsed into datetimes.
walmart_data.dtypes

Store             int64
Date             object
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object

In [12]:
# (number of rows, number of columns) of the loaded frame.
walmart_data.shape

(6435, 8)

### Check the average weekly sales for each unique temperature value

In [15]:
# Mean weekly sales per distinct temperature reading.
# Temperature is continuous, so expect roughly one group per distinct value.
walmart_data["Weekly_Sales"].groupby(walmart_data["Temperature"]).mean()

Temperature
-2.06      5.580278e+05
 5.54      8.174851e+05
 6.23      1.083071e+06
 7.46      5.938755e+05
 9.51      7.759104e+05
               ...     
 99.20     2.391984e+05
 99.22     8.155418e+05
 99.66     2.370958e+05
 100.07    2.977535e+05
 100.14    2.809378e+05
Name: Weekly_Sales, Length: 3528, dtype: float64

### Check for missing values

In [16]:
# Count missing entries in every column.
walmart_data.isna().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [18]:
# Order every store-week from the highest to the lowest weekly sales figure.
sorted_data = walmart_data.sort_values(
    by="Weekly_Sales",
    ascending=False,
)
sorted_data

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
1905,14,24-12-2010,3818686.45,0,30.59,3.141,182.544590,8.724
2763,20,24-12-2010,3766687.43,0,25.17,3.141,204.637673,7.484
1333,10,24-12-2010,3749057.69,0,57.06,3.236,126.983581,9.003
527,4,23-12-2011,3676388.98,0,35.92,3.103,129.984548,5.143
1762,13,24-12-2010,3595903.20,0,34.90,2.846,126.983581,7.795
...,...,...,...,...,...,...,...,...
4671,33,2/12/2011,220060.35,0,59.12,3.701,129.845967,8.010
4623,33,31-12-2010,219804.85,1,52.91,3.148,127.087677,9.265
4675,33,30-12-2011,215359.21,1,51.60,3.428,130.071032,8.010
4614,33,29-10-2010,213538.32,0,71.34,3.130,126.436419,9.265


In [19]:
# Keep only the store id and its sales figure from the ranked frame.
sorted_data.loc[:, ["Store", "Weekly_Sales"]]

Unnamed: 0,Store,Weekly_Sales
1905,14,3818686.45
2763,20,3766687.43
1333,10,3749057.69
527,4,3676388.98
1762,13,3595903.20
...,...,...
4671,33,220060.35
4623,33,219804.85
4675,33,215359.21
4614,33,213538.32


### Categorise the weekly sales into very high, high, moderate, low & very low categories

In [29]:
# Bucket Weekly_Sales into five ordered categories and store them in a new 'bins' column.
#
# Assign the filled column back instead of calling fillna(..., inplace=True) on a
# column selection: the chained in-place form warns on pandas 2.x and does not
# modify the frame at all once Copy-on-Write is enabled.
walmart_data['Weekly_Sales'] = walmart_data['Weekly_Sales'].fillna(0)

# Bin edges. The open-ended top edge (np.inf) replaces max(Weekly_Sales): using the
# observed maximum makes the edges non-monotonic (and pd.cut raise) whenever the
# maximum is at or below 1,000,000, and ties the edges to the current data.
bins = [0, 250000, 500000, 750000, 1000000, np.inf]
labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

# Intervals are right-closed, e.g. (250000, 500000]. include_lowest=True also closes
# the first interval on the left so that a sales value of exactly 0 (which is what
# missing values were just filled with) lands in 'Very Low' rather than becoming NaN.
walmart_data['bins'] = pd.cut(
    x=walmart_data['Weekly_Sales'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [31]:
# Spot-check the first five rows of the sales figures next to their assigned category.
walmart_data.loc[:, ["Weekly_Sales", "bins"]].head(5)

Unnamed: 0,Weekly_Sales,bins
0,1643690.9,Very High
1,1641957.44,Very High
2,1611968.17,Very High
3,1409727.59,Very High
4,1554806.68,Very High
