Import pandas, numpy and matplotlib libraries

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Import Dataset

In [60]:
dataset = pd.read_csv('Walmart.csv')

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


Check  for missing data

In [61]:
has_missing = dataset.isnull().any()
print(has_missing)

Store           False
Date            False
Weekly_Sales    False
Holiday_Flag    False
Temperature     False
Fuel_Price      False
CPI             False
Unemployment    False
dtype: bool


Format Date field into weekday, month and year

In [62]:
dataset['Date'] = pd.to_datetime(dataset['Date'],format='%d-%m-%Y')

dataset['weekday'] = dataset['Date'].dt.weekday
dataset['month'] = dataset['Date'].dt.month
dataset['Year'] = dataset['Date'].dt.year

dataset.drop(columns=['Date'], inplace=True)

Split dataset into matrix of features X and vector of dependent variables y.

In [63]:
X = pd.concat([dataset.iloc[:, 0:1], dataset.iloc[:, 3:]], axis=1).values
y = dataset.iloc[:, 1].values

In [64]:
print(X)

[[1.000e+00 4.231e+01 2.572e+00 ... 4.000e+00 2.000e+00 2.010e+03]
 [1.000e+00 3.851e+01 2.548e+00 ... 4.000e+00 2.000e+00 2.010e+03]
 [1.000e+00 3.993e+01 2.514e+00 ... 4.000e+00 2.000e+00 2.010e+03]
 ...
 [4.500e+01 5.447e+01 4.000e+00 ... 4.000e+00 1.000e+01 2.012e+03]
 [4.500e+01 5.647e+01 3.969e+00 ... 4.000e+00 1.000e+01 2.012e+03]
 [4.500e+01 5.885e+01 3.882e+00 ... 4.000e+00 1.000e+01 2.012e+03]]


In [65]:
print(y)

[1643690.9  1641957.44 1611968.17 ...  734464.36  718125.53  760281.43]


Split dataset into training and test set

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Apply feature scaling (Standardization) on training and test set

In [67]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc_y = StandardScaler()

In [68]:
X_train_scaled = sc.fit_transform(X_train)
print(X_train)

[[2.600e+01 2.262e+01 2.747e+00 ... 4.000e+00 2.000e+00 2.010e+03]
 [4.400e+01 6.840e+01 3.776e+00 ... 4.000e+00 6.000e+00 2.012e+03]
 [3.600e+01 7.224e+01 2.698e+00 ... 4.000e+00 1.000e+01 2.010e+03]
 ...
 [2.300e+01 1.558e+01 3.232e+00 ... 4.000e+00 1.000e+00 2.011e+03]
 [4.100e+01 4.286e+01 3.636e+00 ... 4.000e+00 4.000e+00 2.011e+03]
 [1.000e+00 7.795e+01 3.501e+00 ... 4.000e+00 6.000e+00 2.012e+03]]


In [69]:
y_train_scaled = sc_y.fit_transform(y_train.reshape(-1,1))
print(y_train)

[ 999348.55  340238.38  424956.3  ... 1110706.06 1304481.75 1624477.58]


In [70]:
X_test_scaled = sc.fit_transform(X_test)
print(X_test)

[[9.000e+00 7.085e+01 3.810e+00 ... 4.000e+00 4.000e+00 2.011e+03]
 [4.500e+01 2.478e+01 3.205e+00 ... 4.000e+00 1.000e+00 2.011e+03]
 [1.800e+01 7.092e+01 3.629e+00 ... 4.000e+00 6.000e+00 2.012e+03]
 ...
 [3.600e+01 5.718e+01 3.129e+00 ... 4.000e+00 1.000e+00 2.012e+03]
 [4.300e+01 5.998e+01 2.728e+00 ... 4.000e+00 1.100e+01 2.010e+03]
 [3.800e+01 6.885e+01 3.014e+00 ... 4.000e+00 1.000e+01 2.010e+03]]


In [71]:
y_test_scaled = sc_y.fit_transform(y_test.reshape(-1,1))
y_test

array([ 532226.2 ,  654018.95, 1080357.89, ...,  329467.82,  595421.23,
        339042.18])

Export scaled training and test set variables 

In [72]:
%store X_train X_train_scaled X_test X_test_scaled y_train y_test

Stored 'X_train' (ndarray)
Stored 'X_train_scaled' (ndarray)
Stored 'X_test' (ndarray)
Stored 'X_test_scaled' (ndarray)


Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)
