In [None]:
import os


def move_to_project_root(marker: str = 'configuration/config.yaml') -> None:
    """
    move_to_project_root:
        make the project root the working directory, at most once

    The configuration file is opened further down with a path relative to
    the working directory. A bare ``os.chdir('..')`` climbs one more level
    every time the cell is executed, so re-running it (or starting the
    kernel from the project root) breaks every relative path afterwards.
    The move is therefore skipped when ``marker`` is already reachable
    from the current directory.

    @param marker (str): Path expected to exist relative to the project root
    """
    if not os.path.exists(marker):
        os.chdir('..')


move_to_project_root()

In [None]:
import pandas as pd
import yaml

from src.features.functions_clean_ean import total_clean_ean
from src.features.functions_clean_labels import total_clean_label
from src.features.functions_get_true_label import get_true_label

def import_yaml_config(location: str) -> dict:
    """
    import_yaml_config:
        read a YAML file and hand back its parsed content

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``.

    @param location (str): Path of the YAML file to read
    @return: Whatever ``yaml.safe_load`` produces for the file
    """
    with open(location, mode="r", encoding="utf-8") as yaml_file:
        return yaml.safe_load(yaml_file)

# Project settings; the path is relative to the current working directory.
config = import_yaml_config(location='configuration/config.yaml')

In [None]:
# Data directory prefix and raw Lidl file name, both taken from the config.
# They are joined by plain string concatenation when the file is read.
path_data = config['path']['path_data']
data_lidl_raw = config['data_raw']['lidl_raw']

# Explicit column types for the CSV reader: three text columns, two floats.
# NOTE(review): `ean` is presumably kept as text so that leading zeros
# survive the load — confirm.
dtype = dict(
    ean=str,
    description_ean=str,
    id_famille=str,
    ca=float,
    ca_prix_qte=float,
)

In [None]:
# Load the raw file with the declared column types, then upper-case every
# column name; the following cells refer to the upper-case names.
df = (
    pd.read_csv(path_data + data_lidl_raw, dtype=dtype)
    .rename(columns=lambda name: name.upper())
)

# Cleaning up & reconstructing EANs

In [None]:
# EAN clean-up is delegated to the project helper `total_clean_ean`.
# Per the keyword names, it reads the "EAN" column and writes "EAN_FINAL".
# NOTE(review): `df` is rebound to the result, so re-running this cell
# alone operates on already-processed data — re-run from the load cell.
df = total_clean_ean(
    df,
    ean_column_input="EAN",
    ean_column_output="EAN_FINAL",
    # The exact effect of `drop` is defined by the helper (not shown here);
    # presumably the listed columns are the ones kept — confirm.
    drop=True,
    column_not_to_drop=["DESCRIPTION_EAN", "ID_FAMILLE", "CA", "CA_PRIX_QTE"],
)

# Cleaning up product descriptions

In [None]:
# Description clean-up is delegated to the project helper `total_clean_label`.
# Per the keyword names, it reads "DESCRIPTION_EAN" and writes
# "DESCRIPTION_EAN_FINAL".
# NOTE(review): `df` is rebound to the result, so re-running this cell
# alone operates on already-processed data — re-run from the load cell.
df = total_clean_label(
    df,
    ean_column="EAN_FINAL",
    label_column="DESCRIPTION_EAN",
    label_column_final="DESCRIPTION_EAN_FINAL",
    # Stemming is switched off for this run.
    stem=False,
    # The exact effect of `drop` is defined by the helper (not shown here);
    # presumably the listed columns are the ones kept — confirm.
    drop=True,
    column_not_to_drop=["EAN_FINAL", "ID_FAMILLE", "CA", "CA_PRIX_QTE"],
)

# Adding labels

In [None]:
# Labelling is delegated to the project helper `get_true_label`. Every file
# location is the data directory prefix concatenated with a file name taken
# from the config.
df = get_true_label(
    df,
    # Conversion tables.
    path_conversion_iri=path_data + config["conversion_files"]["conversion_iri"],
    path_conversion_variete=path_data + config["conversion_files"]["conversion_variete"],
    path_conversion_ean_iri=path_data + config["conversion_files"]["conversion_ean_iri"],
    # Field sample.
    path_data_field=path_data + config["data_raw"]["sample_field"],
    # Brand selection.
    brand_column="ENSEIGNE",
    brand="LIDL",
    # Column names handed to the helper.
    # NOTE(review): "CODE_BARRES" and "LIBELLE" are not names produced by
    # the earlier cells (those create "EAN_FINAL" / "DESCRIPTION_EAN_FINAL");
    # presumably they refer to columns of the field sample — confirm.
    iri_column="ID_FAMILLE",
    ean_column="CODE_BARRES",
    label_column="LIBELLE",
)