#### Feature Engineering  

Datasets:
- _customers_clean.csv_
- _inventory_clean.csv_
- _products_clean.csv_
- _salesforce_clean.csv_
- _suppliers_clean.csv_
- _transactions_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-07-06

# Feature engineering – Grocery store Dataset

## __1. Libraries__.

In [1]:
from pathlib import Path
import sys

# Resolve the project root: it is taken to be the parent of the current working
# directory (the directory the notebook is run from)
project_root = Path.cwd().parent

# Register the project root on sys.path only once, so that `src` can be imported
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

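# Star-import every public name exposed by src; the helpers used below (load_dataset_from_csv,
# format_notebook, replace_missing_values, cast_datatypes) are expected to come from here.
# TODO: switch to explicit imports so each helper's origin is visible.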
from src import *

from functools import partial
from IPython.display import display, HTML
import numpy as np
import os
import pandas as pd

## __2. Path to Data file__.

In [2]:
# Directory that holds the cleaned CSV files
data_file_path = project_root / "data" / "processed" / "clean"

# Pre-bind the arguments shared by every load (folder and header handling),
# so each call below only states the file name and, where needed, its date columns
load_clean_csv = partial(load_dataset_from_csv, data_file_path, header='infer')

df_customers_clean = load_clean_csv("customers_clean.csv", parse_dates=['join_date'])
df_inventory_clean = load_clean_csv("inventory_clean.csv", parse_dates=['date'])
df_products_clean = load_clean_csv("products_clean.csv")
df_salesforce_clean = load_clean_csv("salesforce_clean.csv")
df_suppliers_clean = load_clean_csv("suppliers_clean.csv")
df_transactions_clean = load_clean_csv("transactions_clean.csv", parse_dates=['date'])

In [3]:
# Format notebook output
# NOTE(review): format_notebook is not defined in this notebook; presumably it is one of
# the names brought in by `from src import *` — confirm which display settings it applies.
format_notebook()

## __Functions__.

In [4]:
# Function for calculating ...

## __3. Casting to data types__.

### 3.1 Casting to string, numeric, category and datetime data types.

In [5]:
# Normalise column dtypes table by table with the project helpers
# (replace_missing_values, cast_datatypes). Within each table the steps run in the
# order: missing-value markers (intended result: pd.NA) -> string -> numeric -> category.

# Products: text columns, unit cost as numeric (numeric_type='Float64'), categorical descriptors
df_products_clean = (
    df_products_clean
    .pipe(cast_datatypes, 'string', c_include=['product_name', 'brand'])
    .pipe(cast_datatypes, 'numeric', numeric_type='Float64', c_include=['unit_cost'])
    .pipe(cast_datatypes, 'category', c_include=['category', 'status'])
)

# Suppliers: text columns only
df_suppliers_clean = df_suppliers_clean.pipe(
    cast_datatypes, 'string', c_include=['supplier_name', 'contact_info']
)

# Customers: missing segments flagged first, then name, spend and segment are cast
df_customers_clean = (
    df_customers_clean
    .pipe(replace_missing_values, include=['segment'])
    .pipe(cast_datatypes, 'string', c_include=['customer_name'])
    .pipe(cast_datatypes, 'numeric', numeric_type="Float64", c_include=['total_spent'])
    .pipe(cast_datatypes, 'category', c_include=['segment'])
)

# Salesforce: employee name as text, region as category
df_salesforce_clean = (
    df_salesforce_clean
    .pipe(cast_datatypes, 'string', c_include=['employee_name'])
    .pipe(cast_datatypes, 'category', c_include=['region'])
)

# Inventory: missing warehouse locations flagged first, then cast to category
df_inventory_clean = (
    df_inventory_clean
    .pipe(replace_missing_values, include=['warehouse_location'])
    .pipe(cast_datatypes, 'category', c_include=['warehouse_location'])
)

# Date columns: convert to timezone-aware UTC datetimes; unparseable values become NaT
for frame, date_column in (
    (df_inventory_clean, 'date'),
    (df_customers_clean, 'join_date'),
    (df_transactions_clean, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce', utc=True)

## 4. Feature Engineering.

### 4.1 03_eda_initial_analysis.

4.1.1 Outlier analysis for Inventory['beginning_stock'].

In [6]:
# Isolate the inventory rows whose 'beginning_stock' exceeds the outlier threshold.
# Whether these rows are genuine errors is checked in the following cell by comparing
# beginning_stock against (ending_stock - received + sold).
# NOTE(review): 152.500 is presumably the upper outlier bound obtained in
# 03_eda_initial_analysis — TODO confirm and move to a named constant.
exceeds_stock_threshold = df_inventory_clean['beginning_stock'] > 152.500
df_inventory_stock = df_inventory_clean.loc[exceeds_stock_threshold, :]
df_inventory_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,364
259,260,2025-07-05 07:00:00+00:00,2066,160,19,7,east,172
428,429,2025-05-26 07:00:00+00:00,5948,168,4,7,west,165
498,499,2025-06-25 07:00:00+00:00,3688,184,15,8,east,191
532,533,2025-06-28 07:00:00+00:00,608,470,6,6,north,470
...,...,...,...,...,...,...,...,...
20043,16666,2025-06-25 07:00:00+00:00,2493,480,12,0,south,492
20046,1666,2025-06-19 07:00:00+00:00,6538,280,0,10,south,270
20049,2215,2025-07-02 07:00:00+00:00,219,603,9,2,south,610
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,750


In [7]:
# Opening stock implied by the other columns: ending_stock - received + sold
implied_beginning_stock = (
    df_inventory_stock['ending_stock']
    - df_inventory_stock['received']
    + df_inventory_stock['sold']
)

# Work on an independent copy before adding the flag column, then mark the rows
# whose recorded beginning_stock disagrees with the implied value
df_inventory_stock = df_inventory_stock.copy()
df_inventory_stock['stock_error'] = df_inventory_stock['beginning_stock'] != implied_beginning_stock
df_inventory_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock,stock_error
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,364,False
259,260,2025-07-05 07:00:00+00:00,2066,160,19,7,east,172,False
428,429,2025-05-26 07:00:00+00:00,5948,168,4,7,west,165,False
498,499,2025-06-25 07:00:00+00:00,3688,184,15,8,east,191,False
532,533,2025-06-28 07:00:00+00:00,608,470,6,6,north,470,False
...,...,...,...,...,...,...,...,...,...
20043,16666,2025-06-25 07:00:00+00:00,2493,480,12,0,south,492,False
20046,1666,2025-06-19 07:00:00+00:00,6538,280,0,10,south,270,False
20049,2215,2025-07-02 07:00:00+00:00,219,603,9,2,south,610,False
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,750,False


In [8]:
# Check whether 'beginning_stock' outliers compared among days are errors:
# pull the complete inventory history of every product that has at least one outlier row.
# Only 'product_id' is read from the current outlier frame, so no helper column needs to
# be dropped beforehand; this also keeps the cell safe to re-run, because it no longer
# depends on a 'stock_error' column being present.
outlier_product_ids = df_inventory_stock['product_id']
df_inventory_stock = df_inventory_clean.loc[df_inventory_clean['product_id'].isin(outlier_product_ids), :]
df_inventory_stock


Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,364
127,128,2025-05-21 07:00:00+00:00,2066,23,2,10,west,15
140,141,2025-06-17 07:00:00+00:00,3369,63,4,7,north,60
193,194,2025-07-02 07:00:00+00:00,2525,25,7,8,south,24
235,236,2025-06-02 07:00:00+00:00,3063,9,12,11,south,10
...,...,...,...,...,...,...,...,...
20050,14237,2025-06-14 07:00:00+00:00,9437,72,17,7,east,82
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,750
20052,9865,2025-05-16 07:00:00+00:00,4272,330,18,0,east,348
20053,16289,2025-06-23 07:00:00+00:00,6659,23,13,13,east,23


In [9]:
# Order the rows so that, for each (product, warehouse, date) combination, the smallest
# 'beginning_stock' comes first. Larger values recorded for the same day are considered
# errors and are removed by the de-duplication step that follows.
same_day_key = ['product_id', 'warehouse_location', 'date']
df_inventory_stock = df_inventory_stock.sort_values(
    by=same_day_key + ['beginning_stock'],
    ascending=True,
)
df_inventory_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
20040,3546,2025-05-20 07:00:00+00:00,28,56,17,4,north,69
3545,3546,2025-05-20 07:00:00+00:00,28,448,17,4,north,461
1299,1300,2025-05-27 07:00:00+00:00,28,82,12,3,north,91
5342,5343,2025-06-03 07:00:00+00:00,219,71,11,13,east,69
948,949,2025-06-30 07:00:00+00:00,219,27,2,2,east,27
...,...,...,...,...,...,...,...,...
4919,4920,2025-07-02 07:00:00+00:00,9841,245,8,9,north,244
3249,3250,2025-05-15 07:00:00+00:00,9927,372,10,1,north,381
17248,17249,2025-05-12 07:00:00+00:00,9937,85,5,2,east,88
8011,8012,2025-06-12 07:00:00+00:00,9937,60,0,9,south,51


In [10]:
# Keep only the first row of each (product, warehouse, date) combination; with the
# preceding ascending sort this is the row with the smallest 'beginning_stock'
is_repeated_record = df_inventory_stock.duplicated(
    subset=['product_id', 'warehouse_location', 'date'],
    keep='first',
)
df_inventory_stock = df_inventory_stock[~is_repeated_record]
df_inventory_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
20040,3546,2025-05-20 07:00:00+00:00,28,56,17,4,north,69
1299,1300,2025-05-27 07:00:00+00:00,28,82,12,3,north,91
5342,5343,2025-06-03 07:00:00+00:00,219,71,11,13,east,69
948,949,2025-06-30 07:00:00+00:00,219,27,2,2,east,27
17605,17606,2025-07-03 07:00:00+00:00,219,91,19,8,east,102
...,...,...,...,...,...,...,...,...
4919,4920,2025-07-02 07:00:00+00:00,9841,245,8,9,north,244
3249,3250,2025-05-15 07:00:00+00:00,9927,372,10,1,north,381
17248,17249,2025-05-12 07:00:00+00:00,9937,85,5,2,east,88
8011,8012,2025-06-12 07:00:00+00:00,9937,60,0,9,south,51


In [None]:
# TODO(review): template cell, not runnable as written — `df_xxx` is not defined in any
# visible cell of this notebook and "xxx_feature.csv" is a placeholder file name.
# Replace both with the engineered DataFrame and its target file before running.
# Project root: parent of the current working directory (same expression as in the first cell)
project_root = Path.cwd().parent
# Destination CSV under data/processed/feature/
processed_path = project_root / "data" / "processed" / "feature" / "xxx_feature.csv"

# Write the frame to CSV without the index column
df_xxx.to_csv(processed_path, index=False)