#### Exploratory Data Analysis (EDA) - Initial Data Visualization: Distribution and Relations

Datasets:
- _customers_clean.csv_
- _inventory_clean.csv_
- _products_clean.csv_
- _salesforce_clean.csv_
- _suppliers_clean.csv_
- _transactions_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-07-06

# Exploratory Data Analysis – Grocery Store Dataset

## __1. Libraries__.

In [None]:
from pathlib import Path
import sys

# Resolve the project root as the parent of the current working directory
# (assumes the notebook is run from a first-level subfolder of the project — TODO confirm)
project_root = Path.cwd().parent

# Put the project root on sys.path (only once) so that the `src` package can be imported
if str(project_root) not in sys.path:

    sys.path.append(str(project_root))

# Wildcard import of the project helpers; presumably this is where the helpers
# called below (load_dataset_from_csv, format_notebook, cast_datatypes, ...) come from.
# NOTE(review): an explicit list of names would be more controlled than `import *`.
from src import *


from IPython.display import display, HTML
import os
import pandas as pd
import numpy as np

## __2. Path to Data file__.

In [None]:
# Locate the folder holding the cleaned CSV files and load each table
data_file_path = project_root.joinpath("data", "processed", "clean")

df_customers_clean = load_dataset_from_csv(
    data_file_path,
    "customers_clean.csv",
    header='infer',
    parse_dates=['join_date'],
)
df_inventory_clean = load_dataset_from_csv(
    data_file_path,
    "inventory_clean.csv",
    header='infer',
    parse_dates=['date'],
)
df_products_clean = load_dataset_from_csv(
    data_file_path,
    "products_clean.csv",
    header='infer',
)
df_salesforce_clean = load_dataset_from_csv(
    data_file_path,
    "salesforce_clean.csv",
    header='infer',
)
df_suppliers_clean = load_dataset_from_csv(
    data_file_path,
    "suppliers_clean.csv",
    header='infer',
)
df_transactions_clean = load_dataset_from_csv(
    data_file_path,
    "transactions_clean.csv",
    header='infer',
    parse_dates=['date'],
)

# data_file_path = project_root / "data" / "processed" / "feature"

# df_xxx_feature = load_dataset_from_csv(data_file_path, "xxx_feature.csv", sep=',', header='infer')

In [None]:
# Apply the project's notebook output formatting.
# NOTE(review): format_notebook is a project helper called with no arguments;
# the exact display options it sets are not visible from this notebook.
format_notebook()

## __3. Exploratory Data Analysis__.

### 3.0 Casting Data types.

In [None]:
# Normalise missing values to pd.NA and cast columns to their intended dtypes
# using the project helpers replace_missing_values and cast_datatypes.

# --- missing values -> pd.NA ---
df_inventory_clean = replace_missing_values(
    df_inventory_clean, include=['warehouse_location']
)
df_customers_clean = replace_missing_values(
    df_customers_clean, include=['segment']
)

# --- object -> string ---
df_products_clean = cast_datatypes(
    df_products_clean, 'string', c_include=['product_name', 'brand']
)
df_suppliers_clean = cast_datatypes(
    df_suppliers_clean, 'string', c_include=['supplier_name', 'contact_info']
)
df_customers_clean = cast_datatypes(
    df_customers_clean, 'string', c_include=['customer_name']
)
df_salesforce_clean = cast_datatypes(
    df_salesforce_clean, 'string', c_include=['employee_name']
)

# --- object -> numeric (Float64) ---
df_products_clean = cast_datatypes(
    df_products_clean, 'numeric', numeric_type='Float64', c_include=['unit_cost']
)
df_customers_clean = cast_datatypes(
    df_customers_clean, 'numeric', numeric_type='Float64', c_include=['total_spent']
)

# --- object -> category ---
df_products_clean = cast_datatypes(
    df_products_clean, 'category', c_include=['category', 'status']
)
df_inventory_clean = cast_datatypes(
    df_inventory_clean, 'category', c_include=['warehouse_location']
)
df_customers_clean = cast_datatypes(
    df_customers_clean, 'category', c_include=['segment']
)
df_salesforce_clean = cast_datatypes(
    df_salesforce_clean, 'category', c_include=['region']
)

# Date columns -> timezone-aware (UTC) datetimes; unparseable values are coerced
for _frame, _date_col in (
    (df_inventory_clean, 'date'),
    (df_customers_clean, 'join_date'),
    (df_transactions_clean, 'date'),
):
    _frame[_date_col] = pd.to_datetime(_frame[_date_col], errors='coerce', utc=True)

### 3.2 Data Visualization: Distributions and Relationships.

#### 3.2.1 Covariance and Correlation Analysis.

##### 3.2.1.1 Covariance Matrix.

In [None]:
# Customers: covariance matrix of the total_spent and frequency columns
df_customers_clean.loc[:, ['total_spent', 'frequency']].cov()

Unnamed: 0,total_spent,frequency
total_spent,2087992.0,487.793544
frequency,487.7935,203.97176


In [None]:
# Inventory: covariance matrix of the stock-movement columns
df_inventory_clean.loc[:, ['beginning_stock', 'received', 'sold', 'ending_stock']].cov()

Unnamed: 0,beginning_stock,received,sold,ending_stock
beginning_stock,2358.926113,1.376704,3.115496,2357.18732
received,1.376704,33.558322,0.614169,34.320857
sold,3.115496,0.614169,18.442084,-14.71242
ending_stock,2357.18732,34.320857,-14.71242,2406.220597


In [None]:
# Products: covariance matrix of the unit_cost and list_price columns
df_products_clean.loc[:, ['unit_cost', 'list_price']].cov()

Unnamed: 0,unit_cost,list_price
unit_cost,763.037286,921.608849
list_price,921.608849,1347.970422


In [None]:
# Salesforce: covariance matrix of the total_sales and effectiveness columns
df_salesforce_clean.loc[:, ['total_sales', 'effectiveness']].cov()

Unnamed: 0,total_sales,effectiveness
total_sales,853072300.0,-120.914535
effectiveness,-120.9145,0.015602


In [None]:
# Suppliers: covariance matrix of the lead_time_days and rating columns
df_suppliers_clean.loc[:, ['lead_time_days', 'rating']].cov()

Unnamed: 0,lead_time_days,rating
lead_time_days,13.925234,0.032441
rating,0.032441,0.330691


In [None]:
# Transactions: covariance matrix of the units_sold, list_price and sales_amount columns
df_transactions_clean.loc[:, ['units_sold', 'list_price', 'sales_amount']].cov()

Unnamed: 0,units_sold,list_price,sales_amount
units_sold,9.00615,-0.434634,143.227974
list_price,-0.434634,296.704669,1400.455745
sales_amount,143.227974,1400.455745,11131.645559


##### 3.2.1.2 Correlation Matrix.

| Correlation Value     | Interpretation                |
| --------------------- | ----------------------------- |
| `+0.7` to `+1.0`      | Strong positive correlation   |
| `+0.3` to `<+0.7`     | Moderate positive correlation |
| `>0` to `<+0.3`       | Weak positive correlation     |
| `0`                   | No correlation                |
| `>-0.3` to `<0`       | Weak negative correlation     |
| `>-0.7` to `-0.3`     | Moderate negative correlation |
| `-1.0` to `-0.7`      | Strong negative correlation   |


In [None]:
# Correlation for customers: total_spent vs frequency
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_customers_clean, columns=['total_spent', 'frequency'])

In [None]:
# Scatter matrix for the customers columns total_spent and frequency
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_customers_clean, columns=['total_spent', 'frequency'])

In [None]:
# Correlation for inventory: beginning_stock, received, sold and ending_stock
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_inventory_clean, columns=['beginning_stock', 'received', 'sold', 'ending_stock'])

In [None]:
# Scatter matrix for the inventory columns beginning_stock, received, sold and ending_stock
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_inventory_clean, columns=['beginning_stock', 'received', 'sold', 'ending_stock'])

In [None]:
# Correlation for products: unit_cost vs list_price
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_products_clean, columns=['unit_cost', 'list_price'])

In [None]:
# Scatter matrix for the products columns unit_cost and list_price
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_products_clean, columns=['unit_cost', 'list_price'])

In [None]:
# Correlation for salesforce: total_sales vs effectiveness
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_salesforce_clean, columns=['total_sales', 'effectiveness'])

In [None]:
# Scatter matrix for the salesforce columns total_sales and effectiveness
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_salesforce_clean, columns=['total_sales', 'effectiveness'])

In [None]:
# Correlation for suppliers: lead_time_days vs rating
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_suppliers_clean, columns=['lead_time_days', 'rating'])

In [None]:
# Scatter matrix for the suppliers columns lead_time_days and rating
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_suppliers_clean, columns=['lead_time_days', 'rating'])

In [None]:
# Correlation for transactions: units_sold, list_price and sales_amount
# (evaluate_correlation is a project helper; its output format is not visible here)
evaluate_correlation(df_transactions_clean, columns=['units_sold', 'list_price', 'sales_amount'])

In [None]:
# Scatter matrix for the transactions columns units_sold, list_price and sales_amount
# (plot_scatter_matrixpx is a project helper; presumably Plotly Express-based — confirm)
plot_scatter_matrixpx(df_transactions_clean, columns=['units_sold', 'list_price', 'sales_amount'])