#### Exploratory Data Analysis (EDA) - Initial Descriptive Statistics

Datasets:
- _customers_clean.csv_
- _inventory_clean.csv_
- _products_clean.csv_
- _salesforce_clean.csv_
- _suppliers_clean.csv_
- _transactions_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-07-06

# Exploratory Data Analysis – Grocery Store Dataset

## __1. Libraries__.

In [None]:
from pathlib import Path
import sys

# Resolve the project root dynamically: take the current working directory
# (expected to be the folder the notebook lives in) and move one level up.
project_root = Path.cwd().parent

# Put the project root on sys.path (only if it is not there yet) so that the
# `src` package can be imported from this notebook.
if str(project_root) not in sys.path:

    sys.path.append(str(project_root))

# Wildcard import: every public name exported by `src` is pulled into the notebook
# namespace. Helpers used below without a local definition (e.g. load_dataset_from_csv,
# cast_datatypes, outlier_limit_bounds) presumably come from here.
# NOTE(review): explicit `from src import name1, name2, ...` would make the origin of
# each helper visible and avoid namespace pollution — consider switching.
from src import *


from IPython.display import display, HTML
import os
import pandas as pd
import numpy as np

## __2. Path to Data file__.

In [None]:
# Folder that holds the cleaned CSV exports
data_file_path = project_root / "data" / "processed" / "clean"


def _load_clean(csv_name, **read_options):
    """Load one cleaned CSV from data_file_path, forwarding any extra reader options."""
    return load_dataset_from_csv(data_file_path, csv_name, header='infer', **read_options)


# parse_dates is passed only for the tables that carry a date column
df_customers_clean = _load_clean("customers_clean.csv", parse_dates=['join_date'])
df_inventory_clean = _load_clean("inventory_clean.csv", parse_dates=['date'])
df_products_clean = _load_clean("products_clean.csv")
df_salesforce_clean = _load_clean("salesforce_clean.csv")
df_suppliers_clean = _load_clean("suppliers_clean.csv")
df_transactions_clean = _load_clean("transactions_clean.csv", parse_dates=['date'])

# Disabled template for loading feature-engineered datasets:
# data_file_path = project_root / "data" / "processed" / "feature"

# df_xxx_feature = load_dataset_from_csv(data_file_path, "xxx_feature.csv", sep=',', header='infer')

In [None]:
# Apply the project's notebook output formatting.
# NOTE(review): format_notebook is not defined in this notebook (presumably supplied by
# the `src` wildcard import) — confirm which display options it changes.
format_notebook()

## __3. Exploratory Data Analysis__.

### 3.0 Casting Data types.

In [None]:
# Normalise dtypes one dataset at a time with the casting helpers (from features.py):
# missing-value markers are replaced with pd.NA first, then columns are cast.
# The per-dataset order of operations is: missing values -> string -> numeric -> category -> datetime.

# customers
df_customers_clean = (
    df_customers_clean
    .pipe(replace_missing_values, include=['segment'])
    .pipe(cast_datatypes, 'string', c_include=['customer_name'])
    .pipe(cast_datatypes, 'numeric', numeric_type="Float64", c_include=['total_spent'])
    .pipe(cast_datatypes, 'category', c_include=['segment'])
)
df_customers_clean['join_date'] = pd.to_datetime(
    df_customers_clean['join_date'], errors='coerce', utc=True
)

# inventory
df_inventory_clean = (
    df_inventory_clean
    .pipe(replace_missing_values, include=['warehouse_location'])
    .pipe(cast_datatypes, 'category', c_include=['warehouse_location'])
)
df_inventory_clean['date'] = pd.to_datetime(
    df_inventory_clean['date'], errors='coerce', utc=True
)

# products
df_products_clean = (
    df_products_clean
    .pipe(cast_datatypes, 'string', c_include=['product_name', 'brand'])
    .pipe(cast_datatypes, 'numeric', numeric_type='Float64', c_include=['unit_cost'])
    .pipe(cast_datatypes, 'category', c_include=['category', 'status'])
)

# salesforce
df_salesforce_clean = (
    df_salesforce_clean
    .pipe(cast_datatypes, 'string', c_include=['employee_name'])
    .pipe(cast_datatypes, 'category', c_include=['region'])
)

# suppliers
df_suppliers_clean = df_suppliers_clean.pipe(
    cast_datatypes, 'string', c_include=['supplier_name', 'contact_info']
)

# transactions: only the date column needs converting (timezone-aware UTC, bad values -> NaT)
df_transactions_clean['date'] = pd.to_datetime(
    df_transactions_clean['date'], errors='coerce', utc=True
)

In [None]:
# Check the customers schema after casting: dtype and non-null count per column
df_customers_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5009 entries, 0 to 5008
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   customer_id    5009 non-null   int64              
 1   customer_name  5009 non-null   string             
 2   join_date      5009 non-null   datetime64[ns, UTC]
 3   total_spent    5009 non-null   Float64            
 4   frequency      5009 non-null   int64              
 5   segment        4858 non-null   category           
dtypes: Float64(1), category(1), datetime64[ns, UTC](1), int64(2), string(1)
memory usage: 205.7 KB


In [None]:
# Check the inventory schema after casting: dtype and non-null count per column
df_inventory_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20056 entries, 0 to 20055
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   inventory_id        20056 non-null  int64              
 1   date                20056 non-null  datetime64[ns, UTC]
 2   product_id          20056 non-null  int64              
 3   beginning_stock     20056 non-null  int64              
 4   received            20056 non-null  int64              
 5   sold                20056 non-null  int64              
 6   warehouse_location  19759 non-null  category           
 7   ending_stock        20056 non-null  int64              
dtypes: category(1), datetime64[ns, UTC](1), int64(6)
memory usage: 1.1 MB


In [None]:
# Check the products schema after casting: dtype and non-null count per column
df_products_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   product_id    10055 non-null  int64   
 1   product_name  10055 non-null  string  
 2   category      10055 non-null  category
 3   supplier_id   10055 non-null  int64   
 4   unit_cost     10055 non-null  Float64 
 5   status        10055 non-null  category
 6   brand         10055 non-null  string  
 7   list_price    10055 non-null  float64 
dtypes: Float64(1), category(2), float64(1), int64(2), string(2)
memory usage: 501.3 KB


In [None]:
# Check the salesforce schema after casting: dtype and non-null count per column
df_salesforce_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2006 entries, 0 to 2005
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   employee_id    2006 non-null   int64   
 1   employee_name  2006 non-null   string  
 2   region         2006 non-null   category
 3   total_sales    2006 non-null   float64 
 4   effectiveness  2006 non-null   float64 
dtypes: category(1), float64(2), int64(1), string(1)
memory usage: 65.0 KB


In [None]:
# Check the suppliers schema after casting: dtype and non-null count per column
df_suppliers_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   supplier_id     2000 non-null   int64  
 1   supplier_name   2000 non-null   string 
 2   lead_time_days  2000 non-null   int64  
 3   contact_info    2000 non-null   string 
 4   rating          2000 non-null   float64
dtypes: float64(1), int64(2), string(2)
memory usage: 78.3 KB


In [None]:
# Check the transactions schema after casting: dtype and non-null count per column
df_transactions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20078 entries, 0 to 20077
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   transaction_id  20078 non-null  int64              
 1   date            20078 non-null  datetime64[ns, UTC]
 2   product_id      20078 non-null  int64              
 3   units_sold      20078 non-null  int64              
 4   customer_id     20078 non-null  int64              
 5   employee_id     20078 non-null  int64              
 6   sales_amount    20078 non-null  float64            
 7   list_price      20078 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(2), int64(5)
memory usage: 1.2 MB


### 3.1  Descriptive Statistics.

#### 3.1.1 Descriptive statistics for Original datasets.

In [None]:
# Descriptive statistics for every column of df_customers_clean. include='all' also
# summarises the non-numeric columns (customer_name, segment) via count/unique/top/freq.
df_customers_clean.describe(include='all')

Unnamed: 0,customer_id,customer_name,join_date,total_spent,frequency,segment
count,5009.0,5009,5009,5009.0,5009.0,4858
unique,,4839,,,,3
top,,stephanie_smith,,,,occasional
freq,,5,,,,1670
mean,2501.34977,,2024-07-05 16:23:17.165102848+00:00,2467.201088,24.565782,
min,1.0,,2023-07-07 07:00:00+00:00,0.0,0.0,
25%,1252.0,,2024-01-04 08:00:00+00:00,1183.14,12.0,
50%,2502.0,,2024-07-07 07:00:00+00:00,2452.65,24.0,
75%,3751.0,,2025-01-04 08:00:00+00:00,3720.61,37.0,
max,5000.0,,2025-07-06 07:00:00+00:00,4998.81,49.0,


In [None]:
# Descriptive statistics for every column of df_inventory_clean. include='all' also
# summarises the categorical warehouse_location column via count/unique/top/freq.
df_inventory_clean.describe(include='all')

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
count,20056.0,20056,20056.0,20056.0,20056.0,20056.0,19759,20056.0
unique,,,,,,,4,
top,,,,,,,east,
freq,,,,,,,5029,
mean,10000.669376,2025-06-06 18:37:44.539290112+00:00,4999.319356,53.128041,9.452184,6.905265,,55.67496
min,1.0,2025-05-08 07:00:00+00:00,2.0,0.0,0.0,0.0,,0.0
25%,4995.75,2025-05-23 07:00:00+00:00,2506.0,25.0,4.0,3.0,,27.0
50%,10001.5,2025-06-07 07:00:00+00:00,4992.5,50.0,9.0,7.0,,53.0
75%,15002.25,2025-06-21 07:00:00+00:00,7467.25,76.0,15.0,11.0,,78.0
max,20000.0,2025-07-06 07:00:00+00:00,10000.0,891.0,19.0,14.0,,902.0


In [None]:
# Descriptive statistics for every column of df_products_clean. include='all' also
# summarises the string/category columns (product_name, category, status, brand).
df_products_clean.describe(include='all')

Unnamed: 0,product_id,product_name,category,supplier_id,unit_cost,status,brand,list_price
count,10055.0,10055,10055,10055.0,10055.0,10055,10055,10055.0
unique,,4735,6,,,3,8241,
top,,guess_frozen_foods,snacks,,,discontinued,unknown,
freq,,8,1742,,,3371,502,
mean,5001.179015,,,995.408851,12.160354,,,18.811385
std,2886.218493,,,576.6751,27.62313,,,36.714717
min,1.0,,,1.0,0.5,,,0.62
25%,2503.5,,,497.0,5.465,,,7.95
50%,5002.0,,,993.0,10.415,,,15.48
75%,7501.5,,,1485.0,15.17,,,22.97


In [None]:
# Descriptive statistics for every column of df_salesforce_clean. include='all' also
# summarises the non-numeric columns (employee_name, region) via count/unique/top/freq.
df_salesforce_clean.describe(include='all')

Unnamed: 0,employee_id,employee_name,region,total_sales,effectiveness
count,2006.0,2006,2006,2006.0,2006.0
unique,,1972,4,,
top,,michelle_brown,south,,
freq,,3,537,,
mean,1000.535892,,,55732.060887,0.767125
std,577.859662,,,29207.401709,0.124908
min,1.0,,,10106.1,0.014067
25%,501.25,,,32457.225,0.69
50%,999.5,,,54423.78,0.77
75%,1500.75,,,77539.1275,0.86


In [None]:
# Descriptive statistics for every column of df_suppliers_clean. include='all' also
# summarises the string columns (supplier_name, contact_info) via count/unique/top/freq.
df_suppliers_clean.describe(include='all')

Unnamed: 0,supplier_id,supplier_name,lead_time_days,contact_info,rating
count,2000.0,2000,2000.0,2000,2000.0
unique,,1899,,2000,
top,,smith_plc,,365-261-6825,
freq,,6,,1,
mean,1000.5,,8.027,,3.989455
std,577.494589,,3.731653,,0.575057
min,1.0,,2.0,,3.0
25%,500.75,,5.0,,3.48
50%,1000.5,,8.0,,4.0
75%,1500.25,,11.0,,4.47


In [None]:
# Descriptive statistics for every column of df_transactions_clean. All columns here are
# numeric or datetime, so include='all' is kept mainly for consistency with the other tables.
df_transactions_clean.describe(include='all')

Unnamed: 0,transaction_id,date,product_id,units_sold,customer_id,employee_id,sales_amount,list_price
count,20078.0,20078,20078.0,20078.0,20078.0,20078.0,20078.0,20078.0
mean,9993.35128,2024-07-05 15:40:02.271142656+00:00,4997.537454,5.133629,2501.994721,1005.072965,84.11275,16.469317
min,1.0,2023-07-07 07:00:00+00:00,1.0,1.0,1.0,1.0,0.35,0.249144
25%,4993.25,2024-01-06 08:00:00+00:00,2499.25,3.0,1250.25,510.25,25.045,7.69
50%,9988.5,2024-07-04 07:00:00+00:00,5022.0,5.0,2503.5,1006.0,59.76,15.27
75%,14991.75,2025-01-04 08:00:00+00:00,7500.75,7.0,3737.0,1505.0,116.6725,22.55
max,20000.0,2025-07-06 07:00:00+00:00,9999.0,28.0,5000.0,2000.0,2670.3,347.7
std,5773.112759,,2892.379555,3.001025,1446.470239,578.153203,105.506614,17.225117


#### 3.1.2 Descriptive statistics for customers_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for total_spent (count, mean, std, min, quartiles, max)
df_customers_clean['total_spent'].describe()

count         5009.0
mean     2467.201088
std      1444.988578
min              0.0
25%          1183.14
50%          2452.65
75%          3720.61
max          4998.81
Name: total_spent, dtype: Float64

In [None]:
# Summary statistics for frequency (count, mean, std, min, quartiles, max)
df_customers_clean['frequency'].describe()

count    5009.000000
mean       24.565782
std        14.281868
min         0.000000
25%        12.000000
50%        24.000000
75%        37.000000
max        49.000000
Name: frequency, dtype: float64

In [None]:
# Evaluate the coefficient of variation of total_spent to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_customers_clean, 'total_spent')

In [None]:
# Coefficient-of-variation check for frequency (mean vs. median selection)
evaluate_central_trend(df_customers_clean, 'frequency')

In [None]:
# Evaluate boundary thresholds for total_spent and detect potential outliers on both tails.
# NOTE(review): clamp_zero=True presumably floors the lower limit at zero — confirm in
# outlier_limit_bounds (project helper, not defined in this notebook).
outlier_limit_bounds(df_customers_clean, 'total_spent', bound='both', clamp_zero=True)

In [None]:
# Boundary thresholds / potential outliers for frequency, both tails
outlier_limit_bounds(df_customers_clean, 'frequency', bound='both', clamp_zero=True)

#### 3.1.3 Descriptive statistics for inventory_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for beginning_stock (count, mean, std, min, quartiles, max)
df_inventory_clean['beginning_stock'].describe()

count    20056.000000
mean        53.128041
std         48.568777
min          0.000000
25%         25.000000
50%         50.000000
75%         76.000000
max        891.000000
Name: beginning_stock, dtype: float64

In [None]:
# Summary statistics for received (count, mean, std, min, quartiles, max)
df_inventory_clean['received'].describe()

count    20056.000000
mean         9.452184
std          5.792954
min          0.000000
25%          4.000000
50%          9.000000
75%         15.000000
max         19.000000
Name: received, dtype: float64

In [None]:
# Summary statistics for sold (count, mean, std, min, quartiles, max)
df_inventory_clean['sold'].describe()

count    20056.000000
mean         6.905265
std          4.294425
min          0.000000
25%          3.000000
50%          7.000000
75%         11.000000
max         14.000000
Name: sold, dtype: float64

In [None]:
# Summary statistics for ending_stock (count, mean, std, min, quartiles, max)
df_inventory_clean['ending_stock'].describe()

count    20056.000000
mean        55.674960
std         49.053242
min          0.000000
25%         27.000000
50%         53.000000
75%         78.000000
max        902.000000
Name: ending_stock, dtype: float64

In [None]:
# Evaluate the coefficient of variation of beginning_stock to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_inventory_clean, 'beginning_stock')

In [None]:
# Coefficient-of-variation check for received (mean vs. median selection)
evaluate_central_trend(df_inventory_clean, 'received')

In [None]:
# Coefficient-of-variation check for sold (mean vs. median selection)
evaluate_central_trend(df_inventory_clean, 'sold')

In [None]:
# Coefficient-of-variation check for ending_stock (mean vs. median selection)
evaluate_central_trend(df_inventory_clean, 'ending_stock')

In [None]:
# Evaluate boundary thresholds for beginning_stock and detect potential outliers on both tails;
# the rows flagged by the helper are displayed below the cell.
# NOTE(review): clamp_zero=True presumably floors the lower limit at zero — confirm in
# outlier_limit_bounds (project helper, not defined in this notebook).
outlier_limit_bounds(df_inventory_clean, 'beginning_stock', bound='both', clamp_zero=True)

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,364
259,260,2025-07-05 07:00:00+00:00,2066,160,19,7,east,172
428,429,2025-05-26 07:00:00+00:00,5948,168,4,7,west,165
498,499,2025-06-25 07:00:00+00:00,3688,184,15,8,east,191
532,533,2025-06-28 07:00:00+00:00,608,470,6,6,north,470
...,...,...,...,...,...,...,...,...
20043,16666,2025-06-25 07:00:00+00:00,2493,480,12,0,south,492
20046,1666,2025-06-19 07:00:00+00:00,6538,280,0,10,south,270
20049,2215,2025-07-02 07:00:00+00:00,219,603,9,2,south,610
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,750


In [None]:
# Boundary thresholds / potential outliers for received, both tails
outlier_limit_bounds(df_inventory_clean, 'received', bound='both', clamp_zero=True)

In [None]:
# Boundary thresholds / potential outliers for sold, both tails
outlier_limit_bounds(df_inventory_clean, 'sold', bound='both', clamp_zero=True)

In [None]:
# Boundary thresholds / potential outliers for ending_stock, both tails
outlier_limit_bounds(df_inventory_clean, 'ending_stock', bound='both', clamp_zero=True)

Unnamed: 0,inventory_id,date,product_id,beginning_stock,received,sold,warehouse_location,ending_stock
65,66,2025-05-26 07:00:00+00:00,6690,370,2,8,south,364
259,260,2025-07-05 07:00:00+00:00,2066,160,19,7,east,172
428,429,2025-05-26 07:00:00+00:00,5948,168,4,7,west,165
498,499,2025-06-25 07:00:00+00:00,3688,184,15,8,east,191
532,533,2025-06-28 07:00:00+00:00,608,470,6,6,north,470
...,...,...,...,...,...,...,...,...
20043,16666,2025-06-25 07:00:00+00:00,2493,480,12,0,south,492
20046,1666,2025-06-19 07:00:00+00:00,6538,280,0,10,south,270
20049,2215,2025-07-02 07:00:00+00:00,219,603,9,2,south,610
20051,16616,2025-05-17 07:00:00+00:00,6764,756,6,12,east,750


#### 3.1.4 Descriptive statistics for products_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for unit_cost (count, mean, std, min, quartiles, max)
df_products_clean['unit_cost'].describe()

count        10055.0
mean       12.160354
std         27.62313
min              0.5
25%            5.465
50%           10.415
75%            15.17
max      1927.144909
Name: unit_cost, dtype: Float64

In [None]:
# Summary statistics for list_price (count, mean, std, min, quartiles, max)
df_products_clean['list_price'].describe()

count    10055.000000
mean        18.811385
std         36.714717
min          0.620000
25%          7.950000
50%         15.480000
75%         22.970000
max       2312.570000
Name: list_price, dtype: float64

In [None]:
# Evaluate the coefficient of variation of unit_cost to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_products_clean, 'unit_cost')

In [None]:
# Coefficient-of-variation check for list_price (mean vs. median selection)
evaluate_central_trend(df_products_clean, 'list_price')

In [None]:
# Evaluate boundary thresholds for unit_cost and detect potential outliers on both tails;
# the rows flagged by the helper are displayed below the cell.
# NOTE(review): clamp_zero=True presumably floors the lower limit at zero — confirm in
# outlier_limit_bounds (project helper, not defined in this notebook).
outlier_limit_bounds(df_products_clean, 'unit_cost', bound='both', clamp_zero=True)

Unnamed: 0,product_id,product_name,category,supplier_id,unit_cost,status,brand,list_price
4,5,place_frozen_foods,bakery,964,73.3,backordered,leblanc_james_and_thompson,87.96
135,136,sell_dairy,dairy,1145,174.768661,backordered,thomas_wilson,209.72
138,139,free_frozen_foods,snacks,1941,145.1,backordered,woodard_plc,174.12
290,291,develop_bakery,bakery,1610,53.5,active,cox_rich_and_perez,64.20
351,352,away_frozen_foods,frozen_foods,766,57.206633,active,henderson_group,68.65
...,...,...,...,...,...,...,...,...
10038,2398,type_beverages,beverages,333,36.573256,active,lucero_coleman_and_martinez,43.89
10041,6656,job_dairy,bakery,1290,107.374606,backordered,meyer_simmons,128.85
10045,8994,month_bakery,produce,917,99.368985,active,vazquez_miller_and_shannon,119.24
10049,7000,science_bakery,dairy,1993,105.965485,backordered,kelly_patterson_and_garza,127.16


In [None]:
# Boundary thresholds / potential outliers for list_price, both tails
outlier_limit_bounds(df_products_clean, 'list_price', bound='both', clamp_zero=True)

Unnamed: 0,product_id,product_name,category,supplier_id,unit_cost,status,brand,list_price
4,5,place_frozen_foods,bakery,964,73.3,backordered,leblanc_james_and_thompson,87.96
135,136,sell_dairy,dairy,1145,174.768661,backordered,thomas_wilson,209.72
138,139,free_frozen_foods,snacks,1941,145.1,backordered,woodard_plc,174.12
290,291,develop_bakery,bakery,1610,53.5,active,cox_rich_and_perez,64.20
327,328,century_frozen_foods,dairy,780,6.35,discontinued,wood_perkins_and_rodriguez,97.80
...,...,...,...,...,...,...,...,...
10033,6961,himself_beverages,produce,182,80.418871,discontinued,cruz_inc,96.50
10041,6656,job_dairy,bakery,1290,107.374606,backordered,meyer_simmons,128.85
10045,8994,month_bakery,produce,917,99.368985,active,vazquez_miller_and_shannon,119.24
10049,7000,science_bakery,dairy,1993,105.965485,backordered,kelly_patterson_and_garza,127.16


#### 3.1.5 Descriptive statistics for salesforce_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for total_sales (count, mean, std, min, quartiles, max)
df_salesforce_clean['total_sales'].describe()

count      2006.000000
mean      55732.060887
std       29207.401709
min       10106.100000
25%       32457.225000
50%       54423.780000
75%       77539.127500
max      304984.180000
Name: total_sales, dtype: float64

In [None]:
# Summary statistics for effectiveness (count, mean, std, min, quartiles, max)
df_salesforce_clean['effectiveness'].describe()

count    2006.000000
mean        0.767125
std         0.124908
min         0.014067
25%         0.690000
50%         0.770000
75%         0.860000
max         0.950000
Name: effectiveness, dtype: float64

In [None]:
# Evaluate the coefficient of variation of total_sales to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_salesforce_clean, 'total_sales')

In [None]:
# Coefficient-of-variation check for effectiveness (mean vs. median selection)
evaluate_central_trend(df_salesforce_clean, 'effectiveness')

In [None]:
# Evaluate boundary thresholds for total_sales and detect potential outliers.
# Only the upper tail is checked here (bound='upper'), whereas every other column in this
# notebook uses bound='both'.
# NOTE(review): confirm the asymmetry is intentional; clamp_zero=True presumably only
# affects a lower limit, so it may be a no-op with bound='upper' — verify in outlier_limit_bounds.
outlier_limit_bounds(df_salesforce_clean, 'total_sales', bound='upper', clamp_zero=True)

Unnamed: 0,employee_id,employee_name,region,total_sales,effectiveness
35,36,james_gibson,north,285711.59,0.82
741,742,kenneth_nelson,east,244232.6,0.71
811,812,briana_mcdonald,east,216686.32,0.9
813,814,amanda_carroll,south,247344.63,0.76
1179,1180,jenna_pittman,south,304984.18,0.71
1693,1694,jennifer_ortiz,east,146798.72,0.63
1795,1796,keith_roberts,north,154181.17,0.87
1858,1859,rachel_morgan,east,254252.54,0.81
1906,1907,katie_preston,north,212858.23,0.83
1967,1968,robert_payne,south,187479.37,0.63


In [None]:
# Boundary thresholds / potential outliers for effectiveness, both tails
outlier_limit_bounds(df_salesforce_clean, 'effectiveness', bound='both', clamp_zero=True)

Unnamed: 0,employee_id,employee_name,region,total_sales,effectiveness
7,8,mitchell_thomas,east,71050.8,0.014107
34,35,melissa_rogers,north,27109.95,0.096224
217,218,ryan_hansen,east,38642.52,0.033314
270,271,susan_perkins,west,23865.54,0.05335
431,432,janet_hansen,north,25172.49,0.014067
596,597,toni_hoffman,south,59507.21,0.064532
640,641,alexandra_ho,east,64863.64,0.025719
658,659,danielle_gill,east,46156.18,0.093792
721,722,anthony_mcconnell_jr,west,95953.31,0.052051
739,740,joshua_hines,north,82681.33,0.099967


#### 3.1.6 Descriptive statistics for suppliers_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for lead_time_days (count, mean, std, min, quartiles, max)
df_suppliers_clean['lead_time_days'].describe()

count    2000.000000
mean        8.027000
std         3.731653
min         2.000000
25%         5.000000
50%         8.000000
75%        11.000000
max        14.000000
Name: lead_time_days, dtype: float64

In [None]:
# Summary statistics for rating (count, mean, std, min, quartiles, max)
df_suppliers_clean['rating'].describe()

count    2000.000000
mean        3.989455
std         0.575057
min         3.000000
25%         3.480000
50%         4.000000
75%         4.470000
max         5.000000
Name: rating, dtype: float64

In [None]:
# Evaluate the coefficient of variation of lead_time_days to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_suppliers_clean, 'lead_time_days')

In [None]:
# Coefficient-of-variation check for rating (mean vs. median selection)
evaluate_central_trend(df_suppliers_clean, 'rating')

In [None]:
# Evaluate boundary thresholds for lead_time_days and detect potential outliers on both tails.
# NOTE(review): clamp_zero=True presumably floors the lower limit at zero — confirm in
# outlier_limit_bounds (project helper, not defined in this notebook).
outlier_limit_bounds(df_suppliers_clean, 'lead_time_days', bound='both', clamp_zero=True)

In [None]:
# Boundary thresholds / potential outliers for rating, both tails
outlier_limit_bounds(df_suppliers_clean, 'rating', bound='both', clamp_zero=True)

#### 3.1.7 Descriptive statistics for transactions_clean dataset, quantitative variables.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for units_sold (count, mean, std, min, quartiles, max)
df_transactions_clean['units_sold'].describe()

count    20078.000000
mean         5.133629
std          3.001025
min          1.000000
25%          3.000000
50%          5.000000
75%          7.000000
max         28.000000
Name: units_sold, dtype: float64

In [None]:
# Summary statistics for list_price (count, mean, std, min, quartiles, max)
df_transactions_clean['list_price'].describe()

count    20078.000000
mean        16.469317
std         17.225117
min          0.249144
25%          7.690000
50%         15.270000
75%         22.550000
max        347.700000
Name: list_price, dtype: float64

In [None]:
# Summary statistics for sales_amount (count, mean, std, min, quartiles, max)
df_transactions_clean['sales_amount'].describe()

count    20078.000000
mean        84.112750
std        105.506614
min          0.350000
25%         25.045000
50%         59.760000
75%        116.672500
max       2670.300000
Name: sales_amount, dtype: float64

In [None]:
# Evaluate the coefficient of variation of units_sold to choose between mean and median
# as the measure of central tendency (see the CV interpretation table in this section).
# NOTE(review): evaluate_central_trend is a project helper not defined in this notebook —
# confirm its thresholds match the table.
evaluate_central_trend(df_transactions_clean, 'units_sold')

In [None]:
# Coefficient-of-variation check for list_price (mean vs. median selection)
evaluate_central_trend(df_transactions_clean, 'list_price')

In [None]:
# Coefficient-of-variation check for sales_amount (mean vs. median selection)
evaluate_central_trend(df_transactions_clean, 'sales_amount')

In [None]:
# Evaluate boundary thresholds for units_sold and detect potential outliers on both tails;
# the rows flagged by the helper are displayed below the cell.
# NOTE(review): clamp_zero=True presumably floors the lower limit at zero — confirm in
# outlier_limit_bounds (project helper, not defined in this notebook).
outlier_limit_bounds(df_transactions_clean, 'units_sold', bound='both', clamp_zero=True)

Unnamed: 0,transaction_id,date,product_id,units_sold,customer_id,employee_id,sales_amount,list_price
109,110,2025-02-19 08:00:00+00:00,1785,20,563,496,101.00,5.05
390,389,2024-03-13 07:00:00+00:00,2107,26,3755,1849,339.04,13.04
423,422,2024-04-20 07:00:00+00:00,5452,21,763,1685,157.71,7.51
481,480,2024-08-04 07:00:00+00:00,8840,18,3205,1036,190.98,10.61
504,503,2025-02-10 08:00:00+00:00,9796,18,4681,229,416.16,23.12
...,...,...,...,...,...,...,...,...
20069,8201,2024-05-09 07:00:00+00:00,6940,16,4444,1318,32.32,2.02
20070,19179,2023-12-05 08:00:00+00:00,6442,19,1041,1617,251.37,13.23
20072,15354,2023-07-30 07:00:00+00:00,5227,21,178,1723,378.21,18.01
20074,6893,2025-01-28 08:00:00+00:00,8314,22,2800,1316,492.14,22.37


In [None]:
# Boundary thresholds / potential outliers for list_price, both tails
outlier_limit_bounds(df_transactions_clean, 'list_price', bound='both', clamp_zero=True)

Unnamed: 0,transaction_id,date,product_id,units_sold,customer_id,employee_id,sales_amount,list_price
44,45,2024-11-15 08:00:00+00:00,2713,5,227,203,507.0,101.4
120,121,2024-11-03 07:00:00+00:00,3134,4,4263,1670,832.4,208.1
164,165,2025-05-14 07:00:00+00:00,2086,1,1420,464,222.4,222.4
296,296,2025-01-06 08:00:00+00:00,8269,3,4429,774,317.1,105.7
552,551,2025-04-13 07:00:00+00:00,3134,1,3333,1374,208.1,208.1
...,...,...,...,...,...,...,...,...
19053,19007,2024-10-02 07:00:00+00:00,2218,2,1369,1354,224.8,112.4
19101,19055,2025-02-03 08:00:00+00:00,7085,3,4704,883,317.1,105.7
19419,19373,2023-10-28 07:00:00+00:00,1127,6,1165,905,1328.4,221.4
19494,19448,2024-06-12 07:00:00+00:00,759,4,3336,516,834.4,208.6


In [None]:
# Boundary thresholds / potential outliers for sales_amount, both tails
outlier_limit_bounds(df_transactions_clean, 'sales_amount', bound='both', clamp_zero=True)

Unnamed: 0,transaction_id,date,product_id,units_sold,customer_id,employee_id,sales_amount,list_price
44,45,2024-11-15 08:00:00+00:00,2713,5,227,203,507.00,101.40
54,55,2024-05-18 07:00:00+00:00,1021,9,1697,1330,306.90,34.10
120,121,2024-11-03 07:00:00+00:00,3134,4,4263,1670,832.40,208.10
166,167,2023-12-13 08:00:00+00:00,6510,9,2230,1191,258.39,28.71
201,201,2024-07-19 07:00:00+00:00,788,9,2801,606,296.64,32.96
...,...,...,...,...,...,...,...,...
20061,3765,2024-06-27 07:00:00+00:00,5319,21,4694,318,467.25,22.25
20067,3547,2025-03-14 07:00:00+00:00,3169,20,2778,565,534.40,26.72
20068,9621,2023-12-18 08:00:00+00:00,7824,16,1737,532,465.76,29.11
20072,15354,2023-07-30 07:00:00+00:00,5227,21,178,1723,378.21,18.01
