### Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis (EDA) on raw data or on features generated from that data.

In [None]:
import os 

import dtale
import pandas as pd
from zenml import pipeline
from IPython.display import clear_output

from configs import configs
from configs.parser import ConfigParser
from data_manager.loaders import StructuredData
from steps import (
    data_formatter,
    data_loader,
    features_engineer_creator,
    features_generator,
)

# Rebind every imported step to a copy configured with `enable_cache=True`,
# so all steps used by the pipeline below share the same explicit cache setting.
(
    data_loader,
    data_formatter,
    features_engineer_creator,
    features_generator,
) = (
    step.with_options(enable_cache=True)
    for step in (
        data_loader,
        data_formatter,
        features_engineer_creator,
        features_generator,
    )
)


In [None]:
@pipeline(enable_cache=True)
def load_data() -> StructuredData:
    """Load the data, format it and generate features from it.

    The pipeline chains four steps: ``data_loader``, ``data_formatter``,
    ``features_engineer_creator`` and ``features_generator``. Every step
    receives its settings from a ``ConfigParser`` instance.

    Returns:
        The first output of the ``features_generator`` step; its two other
        outputs are discarded.
    """
    parser = ConfigParser()

    # The loader is fed the result of `general().without_varieties()`,
    # whereas the formatter below receives the plain `general()` config.
    loaded = data_loader(parser.general().without_varieties(), parser.multispectral())
    formatted = data_formatter(loaded, parser.general(), parser.formatter())

    engineer = features_engineer_creator(formatted, parser.features())
    # The generator step yields three outputs; only the first one is returned.
    featurized, _, _ = features_generator(engineer, formatted)
    return featurized

# Point the pipelines at the TOML config file through an environment variable
# (the variable is parsed in the pipelines). This has to be set before the
# pipeline is run below.
os.environ[configs.TOML_ENV_NAME] = str(configs.TOML_DIR / "clf/umap_varieties.toml")
# Run the pipeline to load the data. This is only needed the first time: the
# next cell reads the outputs back from the last successful run.
load_data()
# Wipe the output that the pipeline run printed in this cell.
clear_output()

In [None]:
# Look up the most recent successful run of the `load_data` pipeline.
last_run = load_data.model.last_successful_run
# Load the "data_train_feat" output of that run's "features_generator" step.
data = last_run.steps["features_generator"].outputs["data_train_feat"].load()

In [None]:
# Place `data.data` and `data.meta` side by side (column-wise) in one dataframe.
df = pd.concat([data.data, data.meta], axis="columns")
# Start a D-Tale process for the dataframe and keep a reference to it.
d = dtale.show(df)
# Try to open this process in the server's default browser
# (done through Python's `webbrowser` package).
d.open_browser()

In [None]:
# Shutting down D-Tale process
# d.kill()

In [None]:
import sys
sys.path.insert(0,'..')

import pygwalker as pyg
import pandas as pd
from rich import print

from configs import paths, configs

In [None]:
# Label used to look up the measurement files to explore.
regression_label = "LICOR_leaf"
# Take the entry at index 1 of the paths registered for this label
# (presumably the Excel file, given how it is read below -- TODO confirm).
excel_file = paths.PATHS_MEASUREMENTS[regression_label][1]
print(excel_file)


In [None]:
df = pd.read_excel(excel_file)
walker = pyg.walk(
    df,
    # This JSON file stores the chart state. Click the save button in the UI
    # manually once a chart is finished ('autosave' will be supported in the future).
    spec=str(configs.SAVE_DIR / "walker_spec.json"),
    # With `use_kernel_calc=True`, pygwalker uses DuckDB as its computing engine,
    # which lets you explore bigger datasets (<=100GB).
    use_kernel_calc=True,
    dark="light",
    store_chart_data=True,
    hideDataSourceConfig=False,
)