In [None]:
import os

# Work from the project root so that relative paths such as
# 'configuration/config.yaml' resolve. The guard makes the cell idempotent:
# re-running it (or starting the kernel at the root) does not climb one
# directory further up each time.
if not os.path.isfile(os.path.join('configuration', 'config.yaml')):
    os.chdir('..')

In [None]:
import pandas as pd
import yaml
import parquet
import multiprocessing

from src.models.lcpn_fasttext import get_LCPN_prediction_fasttext
from src.models.flat_fasttext import get_flat_prediction_fasttext
from src.models.evaluation import get_evaluation_model
from src.models.hiclass import get_data_hiclass
from src.visualization.export_data_quarto import (
    export_sankey_data,
    export_accuracy_data
)    

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from hiclass import LocalClassifierPerParentNode

def import_yaml_config(location: str) -> dict:
    """
    import_yaml_config:
        Load a YAML configuration file into a dictionary.

        An empty file (for which ``yaml.safe_load`` returns ``None``) yields
        an empty dict, so the declared return type always holds.

    @param location (str): Path of the YAML file, read as UTF-8
    @return (dict): Parsed content of the file; ``{}`` when the file is empty
    """
    with open(location, "r", encoding="utf-8") as stream:
        config = yaml.safe_load(stream)

    # safe_load gives None for an empty document; normalise it to an empty dict.
    return {} if config is None else config

# Project settings, read once from the YAML file under configuration/.
config = import_yaml_config(location='configuration/config.yaml')

In [None]:
# fastText hyper-parameters handed to the fastText prediction helpers.
params = {
    "dim": 100,                  # Size of word vectors
    "lr": 0.1,                   # Learning rate
    "epoch": 100,                # Number of training epochs
    "wordNgrams": 3,             # Number of word n-grams to consider during training
    "minn": 3,                   # Min length of char ngram
    "maxn": 6,                   # Max length of char ngram
    "minCount": 3,               # Min number of word occurrences
    "bucket": 1000000,           # Number of buckets (limit on the number of character n-grams)
    "loss": "ova",               # Type of loss
    "label_prefix": "__label__", # Label prefix
    # Number of CPU threads: two thirds of the cores, but never fewer than
    # one (the plain 2/3 formula gives 0 on a single-core machine).
    "thread": max(1, 2 * multiprocessing.cpu_count() // 3),
}

In [None]:
# Data location prefix and the preprocessed Lidl file entry, both from the config.
path_data, data_lidl_clean = (
    config['path']['path_data'],
    config['data_preprocessed']['lidl'],
)

In [None]:
# The two config values are concatenated as-is to form the parquet location.
lidl_parquet_location = path_data + data_lidl_clean
df = pd.read_parquet(lidl_parquet_location)

In [None]:
# Keep only the rows that carry a label in TRUE_COICOP.
has_true_coicop = df['TRUE_COICOP'].notna()
df = df[has_true_coicop]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Prediction using _flat_ fastText

In [None]:
# Arguments: train frame, test frame, label column, text column, fastText params.
df_flat_fasttext = get_flat_prediction_fasttext(
    df_train,
    df_test,
    'TRUE_COICOP',
    'DESCRIPTION_EAN_FINAL',
    params,
)

In [None]:
# Compare the FLAT_PRED column against the TRUE_COICOP labels.
get_evaluation_model(
    df_flat_fasttext,
    'TRUE_COICOP',
    'FLAT_PRED',
)

# Prediction using _hierarchical_ fastText

In [None]:
# Same inputs as the flat model: train frame, test frame, label column,
# text column and fastText params.
df_pred_lcpn = get_LCPN_prediction_fasttext(
    df_train,
    df_test,
    'TRUE_COICOP',
    'DESCRIPTION_EAN_FINAL',
    params,
)

In [None]:
# Compare the LCPN_PRED column against the TRUE_COICOP labels.
get_evaluation_model(
    df_pred_lcpn,
    'TRUE_COICOP',
    'LCPN_PRED',
)

# Prediction using HiClass

### Given the limited amount of data available from Lidl, HiClass can be used as an alternative for hierarchical predictions.

In [None]:
# TF-IDF / SVM settings, all read from the 'hyper_parameters_svm' config section.
svm_settings = config['hyper_parameters_svm']
max_df, C, kernel, gamma = (
    svm_settings['max_df'],
    svm_settings['C'],
    svm_settings['kernel'],
    svm_settings['gamma'],
)

In [None]:
# Build the HiClass input frame from the labelled data; the columns selected
# from it in the next cell are the text column and the five label columns.
df_hiclass = get_data_hiclass(
    df,
    'TRUE_COICOP',
)

In [None]:
# Text feature and the five label columns passed to the hierarchical classifier.
label_columns = ['DIVISION', 'GROUPE', 'CLASSE', 'SOUS-CLASSE', 'POSTE']
X, y = df_hiclass['DESCRIPTION_EAN_FINAL'], df_hiclass[label_columns]

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Step 1: TF-IDF over unigrams and bigrams.
tfidf_step = TfidfVectorizer(max_df=max_df, ngram_range=(1, 2))

# Step 2: local-classifier-per-parent-node hierarchy with an SVC as the local
# classifier, using two thirds of the CPU count as n_jobs.
svc_local_classifier = SVC(C=C, kernel=kernel, gamma=gamma)
lcpn_step = LocalClassifierPerParentNode(
    local_classifier=svc_local_classifier,
    n_jobs=int(2 * multiprocessing.cpu_count() / 3),
)

pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', lcpn_step),
])

In [None]:
# Fit on the training split, then predict the test split
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
predictions = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Column 4 of the predictions is paired with the true 'POSTE' labels; the
# test labels get a fresh 0..n-1 index so both series align row by row.
predicted_last_level = pd.DataFrame(predictions)[4]
true_last_level = y_test.reset_index(drop=True)['POSTE']

hiclass_pred = pd.DataFrame(
    {
        'HICLASS_PRED': predicted_last_level,
        'TRUE_COICOP': true_last_level,
    }
)

In [None]:
# Compare the HICLASS_PRED column against the TRUE_COICOP labels.
get_evaluation_model(
    hiclass_pred,
    'TRUE_COICOP',
    'HICLASS_PRED',
)

# Export data for DataViz

In [None]:
# Export the HiClass predictions under the name "accuracy_lidl_hiclass".
export_accuracy_data(
    hiclass_pred,
    file_name="accuracy_lidl_hiclass",
    true_coicop_column="TRUE_COICOP",
    pred_column="HICLASS_PRED",
)

In [None]:
# Export the HiClass predictions under the name "sankey_lidl_hiclass".
export_sankey_data(
    hiclass_pred,
    file_name="sankey_lidl_hiclass",
    true_coicop_column="TRUE_COICOP",
    pred_column="HICLASS_PRED",
)