#### Feature Engineering  

Dataset: 
- _customers_clean.csv_
- _inventory_clean.csv_
- _products_clean.csv_
- _salesforce_clean.csv_
- _suppliers_clean.csv_
- _transactions_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-07-06

# Feature engineering – Grocery store Dataset

## __1. Libraries__.

In [11]:
from pathlib import Path
import sys

# Define the project root dynamically: take the notebook's current working directory and move one level up
project_root = Path.cwd().parent

# Add the project root to sys.path (only if it is not already there) so that the
# `from src import *` below can resolve — presumably `src` sits directly under the project root
if str(project_root) not in sys.path:

    sys.path.append(str(project_root))

# Wildcard import: pulls every public name exported by `src` into the notebook namespace.
# NOTE(review): explicit imports would make the origin of each helper visible — consider replacing the wildcard.
from src import *

from functools import partial
from IPython.display import display, HTML
import numpy as np
import os
import pandas as pd

## __2. Path to Data file__.

In [12]:
# Directory that holds the cleaned CSV files
data_file_path = project_root / "data" / "processed" / "clean"

# Pre-bind the arguments every load shares (source directory and header handling)
load_clean_csv = partial(load_dataset_from_csv, data_file_path, header='infer')

# Load each dataset; files with a date column ask the loader to parse it
df_customers_clean = load_clean_csv("customers_clean.csv", parse_dates=['join_date'])
df_inventory_clean = load_clean_csv("inventory_clean.csv", parse_dates=['date'])
df_products_clean = load_clean_csv("products_clean.csv")
df_salesforce_clean = load_clean_csv("salesforce_clean.csv")
df_suppliers_clean = load_clean_csv("suppliers_clean.csv")
df_transactions_clean = load_clean_csv("transactions_clean.csv", parse_dates=['date'])

In [13]:
# Apply the project's notebook output formatting. `format_notebook` is not defined in this notebook,
# so it presumably comes from the `src` wildcard import — check there for the exact display settings it changes.
format_notebook()

## __Functions__.

In [14]:
# TODO: define the notebook-level helper functions for feature calculations here (placeholder cell)

## __3. Casting to data types__.

### 3.1 Casting to string, numeric, category and datetime data types.

In [15]:
# Mark missing values with pd.NA, then cast columns to their intended dtypes
# using the helpers from features.py (statement order is kept as-is)

# Missing values -> pd.NA in the listed columns
df_inventory_clean = df_inventory_clean.pipe(replace_missing_values, include=['warehouse_location'])
df_customers_clean = df_customers_clean.pipe(replace_missing_values, include=['segment'])

# Text columns: object -> string
df_products_clean = df_products_clean.pipe(cast_datatypes, 'string', c_include=['product_name', 'brand'])
df_suppliers_clean = df_suppliers_clean.pipe(cast_datatypes, 'string', c_include=['supplier_name', 'contact_info'])
df_customers_clean = df_customers_clean.pipe(cast_datatypes, 'string', c_include=['customer_name'])
df_salesforce_clean = df_salesforce_clean.pipe(cast_datatypes, 'string', c_include=['employee_name'])

# Numeric columns: object -> Float64
df_products_clean = df_products_clean.pipe(cast_datatypes, 'numeric', numeric_type='Float64', c_include=['unit_cost'])
df_customers_clean = df_customers_clean.pipe(cast_datatypes, 'numeric', numeric_type="Float64", c_include=['total_spent'])

# Categorical columns: object -> category
df_products_clean = df_products_clean.pipe(cast_datatypes, 'category', c_include=['category', 'status'])
df_inventory_clean = df_inventory_clean.pipe(cast_datatypes, 'category', c_include=['warehouse_location'])
df_customers_clean = df_customers_clean.pipe(cast_datatypes, 'category', c_include=['segment'])
df_salesforce_clean = df_salesforce_clean.pipe(cast_datatypes, 'category', c_include=['region'])

# Date columns -> UTC datetimes; values that cannot be parsed are coerced to NaT
for frame, date_column in (
    (df_inventory_clean, 'date'),
    (df_customers_clean, 'join_date'),
    (df_transactions_clean, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce', utc=True)

## 4. Feature Engineering.

### 4.1 03_eda_initial_analysis.

4.1.1 Outliers analysis for Inventory['beginning_stock'].

In [None]:
# Split the beginning_stock outliers by warehouse_location to see whether high stocking is a location trend.
# This cell: 'north' rows with beginning_stock above 152.5
# (presumably the upper outlier fence found during the EDA — TODO confirm and name the constant)
at_north = df_inventory_clean['warehouse_location'].eq('north')
stock_above_fence = df_inventory_clean['beginning_stock'].gt(152.500)
mask = at_north & stock_above_fence
df_inventory_north_stock = df_inventory_clean.loc[mask]
df_inventory_north_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
532,533,2025-06-28 07:00:00+00:00,608,470,6,6,north,94
1820,1821,2025-06-13 07:00:00+00:00,3542,259,6,5,north,38
2257,2258,2025-05-16 07:00:00+00:00,5422,776,13,11,north,99
2487,2488,2025-06-05 07:00:00+00:00,8836,198,15,13,north,24
3191,3192,2025-06-22 07:00:00+00:00,3933,581,14,3,north,94
...,...,...,...,...,...,...,...,...
20010,5498,2025-06-24 07:00:00+00:00,8544,434,3,13,north,52
20014,7175,2025-06-16 07:00:00+00:00,6251,456,11,7,north,61
20017,18292,2025-06-12 07:00:00+00:00,1573,375,13,11,north,77
20022,4179,2025-05-27 07:00:00+00:00,3684,210,8,5,north,38


In [19]:
# 'south' warehouse rows with beginning_stock above 152.5
# (presumably the upper outlier fence found during the EDA — TODO confirm)
at_south = df_inventory_clean['warehouse_location'].eq('south')
stock_above_fence = df_inventory_clean['beginning_stock'].gt(152.500)
mask = at_south & stock_above_fence
df_inventory_south_stock = df_inventory_clean.loc[mask]
df_inventory_south_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,68
572,573,2025-05-24 07:00:00+00:00,4396,297,3,3,south,33
1111,1112,2025-06-30 07:00:00+00:00,5909,567,18,8,south,73
1139,1140,2025-05-08 07:00:00+00:00,7706,630,14,8,south,96
1153,1154,2025-05-15 07:00:00+00:00,3954,161,18,10,south,31
...,...,...,...,...,...,...,...,...
20028,3030,2025-06-04 07:00:00+00:00,2026,609,18,6,south,99
20039,12442,2025-06-16 07:00:00+00:00,3315,603,10,0,south,77
20043,16666,2025-06-25 07:00:00+00:00,2493,480,12,0,south,92
20046,1666,2025-06-19 07:00:00+00:00,6538,280,0,10,south,46


In [20]:
# 'east' warehouse rows with beginning_stock above 152.5
# (presumably the upper outlier fence found during the EDA — TODO confirm)
at_east = df_inventory_clean['warehouse_location'].eq('east')
stock_above_fence = df_inventory_clean['beginning_stock'].gt(152.500)
mask = at_east & stock_above_fence
df_inventory_east_stock = df_inventory_clean.loc[mask]
df_inventory_east_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
259,260,2025-07-05 07:00:00+00:00,2066,160,19,7,east,44
498,499,2025-06-25 07:00:00+00:00,3688,184,15,8,east,30
552,553,2025-05-08 07:00:00+00:00,7533,495,0,10,east,45
1686,1687,2025-06-20 07:00:00+00:00,1475,308,9,4,east,49
2200,2201,2025-06-14 07:00:00+00:00,2008,736,4,1,east,95
...,...,...,...,...,...,...,...,...
20009,16536,2025-06-07 07:00:00+00:00,3485,665,14,4,east,105
20035,7601,2025-05-28 07:00:00+00:00,1472,279,18,7,east,42
20038,4540,2025-07-03 07:00:00+00:00,3369,498,7,12,east,78
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,78


In [21]:
# 'west' warehouse rows with beginning_stock above 152.5
# (presumably the upper outlier fence found during the EDA — TODO confirm)
at_west = df_inventory_clean['warehouse_location'].eq('west')
stock_above_fence = df_inventory_clean['beginning_stock'].gt(152.500)
mask = at_west & stock_above_fence
df_inventory_west_stock = df_inventory_clean.loc[mask]
df_inventory_west_stock

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
428,429,2025-05-26 07:00:00+00:00,5948,168,4,7,west,21
1075,1076,2025-06-19 07:00:00+00:00,8425,305,6,1,west,66
1361,1362,2025-05-23 07:00:00+00:00,9774,210,14,13,west,36
1477,1478,2025-05-22 07:00:00+00:00,1270,330,14,8,west,61
1917,1918,2025-05-19 07:00:00+00:00,5609,405,18,3,west,60
...,...,...,...,...,...,...,...,...
20000,15218,2025-05-22 07:00:00+00:00,2010,702,10,5,west,83
20019,19781,2025-06-16 07:00:00+00:00,2736,552,15,4,west,103
20021,9893,2025-06-20 07:00:00+00:00,4937,259,17,11,west,43
20031,19015,2025-05-27 07:00:00+00:00,3677,360,8,3,west,65


In [None]:
# Check whether the 'beginning_stock' outliers are errors: if beginning_stock is high but there are no sales or receipts, it could be an error.


In [None]:
# Export the engineered dataset as a CSV file under data/processed/feature.
# NOTE(review): `df_xxx` and "xxx_feature.csv" look like template placeholders — `df_xxx` is not defined
# anywhere in the visible notebook, so this cell would raise NameError as written. Substitute the real
# DataFrame and output file name before running.
project_root = Path.cwd().parent  # same expression used for project_root in the setup cell
processed_path = project_root / "data" / "processed" / "feature" / "xxx_feature.csv"

# index=False keeps the DataFrame index out of the written file
df_xxx.to_csv(processed_path, index=False)