#### Exploratory Data Analysis (EDA) - Initial Descriptive Statistics

Dataset: 
- _xxx_clean.csv_
- _yyy_clean.csv_
- _zzz_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-MM-DD

# Exploratory Data Analysis – Name XXX Dataset

## __1. Libraries__.

In [None]:
from pathlib import Path
import sys

# Locate the project root: the parent of the current working directory
# (the notebook is expected to run from a folder one level below the root).
project_root = Path.cwd().parent

# Register the project root on sys.path only when it is missing, so that the `src` package can be imported
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

# Import the project's helper functions from src (wildcard import; switch to explicit names for tighter control)
from src import *


from IPython.display import display, HTML
import os
import pandas as pd
import numpy as np

## __2. Path to Data file__.

In [None]:
# Build the path to the cleaned-data folder and load the dataset from it
data_file_path = project_root.joinpath("data", "processed", "clean")

df_xxx_clean = load_dataset_from_csv(
    data_file_path,
    "xxx_clean.csv",
    header='infer',
    parse_dates=['join_date'],
)

# Template for loading the feature-engineered dataset (left disabled):
# data_file_path = project_root / "data" / "processed" / "feature"

# df_xxx_feature = load_dataset_from_csv(data_file_path, "xxx_feature.csv", sep=',', header='infer')

In [None]:
# Format notebook output
# NOTE(review): format_notebook is a project helper brought in by `from src import *`;
# the display settings it applies are not visible in this notebook — confirm in src.
format_notebook()

## __3. Exploratory Data Analysis__.

### 3.0 Casting Data types.

In [None]:
# Call the dtype-casting helpers from features.py and mark missing values with pd.NA.
# Every step reassigns df_xxx_clean so the transformations accumulate on the same
# DataFrame that the following cells (info / describe) inspect.
# NOTE(review): assumes the helpers return the transformed DataFrame — confirm in src.

# missing values to pd.NA
df_xxx_clean = replace_missing_values(df_xxx_clean, include=['column_name'])

# object to string
df_xxx_clean = cast_datatypes(df_xxx_clean, 'string', c_include=['column_name'])

# object to numeric
df_xxx_clean = cast_datatypes(df_xxx_clean, 'numeric', numeric_type='Float64', c_include=['column_name'])

# object to category
df_xxx_clean = cast_datatypes(df_xxx_clean, 'category', c_include=['column_name_01'])

# object to datetime (unparseable values become NaT; result is timezone-aware UTC)
df_xxx_clean['date'] = pd.to_datetime(df_xxx_clean['column_name'], errors='coerce', utc=True)

In [None]:
# Print the column names, non-null counts and dtypes to check the result of the casting step
df_xxx_clean.info()

### 3.1  Descriptive Statistics.

#### 3.1.1 Descriptive statistics for Original datasets.

In [None]:
# Descriptive statistics for df_xxx_clean dataset
# include='all' summarizes every column regardless of dtype, not only the numeric ones
df_xxx_clean.describe(include='all')

#### 3.1.2 Descriptive statistics for xxx_clean dataset, quantitative values.

<table>
  <thead>
    <tr>
      <th>CV (%)</th>
      <th>Interpretation for Coefficient of Variation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><small><strong>0–10%</strong></small></td>
      <td><small><strong>Very low</strong> variability → <strong>very reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>10–20%</strong></small></td>
      <td><small><strong>Moderate</strong> variability → <strong>reliable</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>20–30%</strong></small></td>
      <td><small><strong>Considerable</strong> variability → <strong>somewhat skewed</strong> Mean</small></td>
    </tr>
    <tr>
      <td><small><strong>&gt;30%</strong></small></td>
      <td><small><strong>High</strong> variability → <strong>prefer</strong> Median</small></td>
    </tr>
  </tbody>
</table>


In [None]:
# Summary statistics for the single 'total_spent' column
df_xxx_clean['total_spent'].describe()

In [None]:
# Evaluate the coefficient of variation to select the proper measure of central tendency
# NOTE(review): 'column_name' is a template placeholder; the neighbouring cells analyze
# 'total_spent' — replace it with the column actually under study.
evaluate_central_trend(df_xxx_clean, 'column_name')

In [None]:
# Evaluate boundary thresholds and detect potential outliers
# NOTE(review): presumably bound='both' requests lower and upper limits and clamp_zero=True
# keeps the lower limit from going below zero — confirm against the helper in src.
outlier_limit_bounds(df_xxx_clean, 'total_spent', bound='both', clamp_zero=True)