In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned datasets
sales_data = pd.read_csv('../data/sales data-set.csv')
stores_data = pd.read_csv('../data/modified_stores_data.csv')
features_data = pd.read_csv('../data/Features data set.csv')

In [3]:
sales_data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05/02/2010,24924.5,False
1,1,1,12/02/2010,46039.49,True
2,1,1,19/02/2010,41595.55,False
3,1,1,26/02/2010,19403.54,False
4,1,1,05/03/2010,21827.9,False


In [4]:
stores_data.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [5]:
features_data.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.5,2.625,,,,,,211.350143,8.106,False


In [6]:
# Convert the 'Date' column to a datetime data type
sales_data['Date'] = pd.to_datetime(sales_data['Date'], format='%d/%m/%Y')

# Extract date-related features
sales_data['Year'] = sales_data['Date'].dt.year
sales_data['Month'] = sales_data['Date'].dt.month
sales_data['Day'] = sales_data['Date'].dt.day
sales_data['Weekday'] = sales_data['Date'].dt.weekday  # Monday = 0, Sunday = 6
sales_data['Quarter'] = sales_data['Date'].dt.quarter

# Display the updated dataset with new features
print("Updated Sales Data:")
print(sales_data.head())

Updated Sales Data:
   Store  Dept       Date  Weekly_Sales  IsHoliday  Year  Month  Day  Weekday  \
0      1     1 2010-02-05      24924.50      False  2010      2    5        4   
1      1     1 2010-02-12      46039.49       True  2010      2   12        4   
2      1     1 2010-02-19      41595.55      False  2010      2   19        4   
3      1     1 2010-02-26      19403.54      False  2010      2   26        4   
4      1     1 2010-03-05      21827.90      False  2010      3    5        4   

   Quarter  
0        1  
1        1  
2        1  
3        1  
4        1  


In [7]:
# Create binary features for each markdown event
for i in range(1, 6):
    features_data[f'MarkDown{i}_Event'] = features_data[f'MarkDown{i}'].apply(lambda x: 1 if x > 0 else 0)

# Display the updated features dataset with binary features
print("Updated Features Data:")
print(features_data.head())

Updated Features Data:
   Store        Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  \
0      1  05/02/2010        42.31       2.572        NaN        NaN   
1      1  12/02/2010        38.51       2.548        NaN        NaN   
2      1  19/02/2010        39.93       2.514        NaN        NaN   
3      1  26/02/2010        46.63       2.561        NaN        NaN   
4      1  05/03/2010        46.50       2.625        NaN        NaN   

   MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment  IsHoliday  \
0        NaN        NaN        NaN  211.096358         8.106      False   
1        NaN        NaN        NaN  211.242170         8.106       True   
2        NaN        NaN        NaN  211.289143         8.106      False   
3        NaN        NaN        NaN  211.319643         8.106      False   
4        NaN        NaN        NaN  211.350143         8.106      False   

   MarkDown1_Event  MarkDown2_Event  MarkDown3_Event  MarkDown4_Event  \
0                0        

In [14]:
# Concatenate the DataFrames along the rows
merged_data = pd.concat([sales_data, features_data], axis=1)

# Define your features (X) and target variable (y) based on the merged data
X = merged_data[['Store', 'Dept', 'Year', 'Month', 'Day', 'Weekday', 'Quarter', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday', 'MarkDown1_Event', 'MarkDown2_Event', 'MarkDown3_Event', 'MarkDown4_Event', 'MarkDown5_Event']]
y = merged_data['Weekly_Sales']

# Split the merged data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape - X:", X_train.shape, "y:", y_train.shape)
print("Testing set shape - X:", X_test.shape, "y:", y_test.shape)

Training set shape - X: (337256, 19) y: (337256,)
Testing set shape - X: (84314, 19) y: (84314,)
