In [1]:
import sys
import time

import mlflow
import pandas as pd
import yaml
import fasttext

from constants import TEXT_FEATURE
from fasttext_classifier.fasttext_evaluator import FastTextEvaluator
from fasttext_classifier.fasttext_preprocessor import FastTextPreprocessor
from fasttext_classifier.fasttext_trainer import FastTextTrainer
from fasttext_classifier.fasttext_wrapper import FastTextWrapper
from utils import get_root_path


In [2]:
# Notebook configuration: everything a reader might want to tune lives here.
config_path = "config/config_fasttext41.yaml"
DATA_PATH = "../data/extraction_sirene_20220712_harmonised.parquet"
SAMPLE_FRACTION = 0.001  # share of rows kept so the notebook runs quickly
SEED = 0  # fixes the random subsample so that reruns see the same rows

preprocessor = FastTextPreprocessor()
trainer = FastTextTrainer()

print("*** Preprocessing the database...\n")
# Load data, assumed to be stored in a .parquet file
df = pd.read_parquet(DATA_PATH, engine="pyarrow")
# Without `random_state` each run would draw a different subsample, making
# the training log and the accuracies below impossible to reproduce.
df = df.sample(frac=SAMPLE_FRACTION, random_state=SEED)

# Read the experiment settings from the YAML file under the project root.
with open(get_root_path() / config_path, "r") as stream:
    config = yaml.safe_load(stream)
params = config["params"]
categorical_features = config["categorical_features"]
Y = config["Y"][0]  # only the first entry of config["Y"] is used as target
oversampling = config["oversampling"]


*** Preprocessing the database...



In [3]:
# Use the same column name for the level-5 code as the NAF table below.
df = df.rename(columns={"APE_SICORE": "APE_NIV5"})

# The NAF table is looked up in ./data first, then in ../data, so the cell
# works from either working directory.
try:
    df_naf = pd.read_csv(r"./data/naf_extended.csv", dtype=str)
except FileNotFoundError:
    df_naf = pd.read_csv(r"../data/naf_extended.csv", dtype=str)
# Remove the dots from the level 3-5 codes (literal replacement, not a regex).
df_naf[["NIV3", "NIV4", "NIV5"]] = df_naf[["NIV3", "NIV4", "NIV5"]].apply(
    lambda x: x.str.replace(".", "", regex=False)
)

df_naf = df_naf.rename(columns={f"NIV{i}": f"APE_NIV{i}" for i in range(1, 6)})
df_naf = df_naf[[f"APE_NIV{i}" for i in range(1, 6)] + ["LIB_NIV5"]]
# Attach levels 1-4 and the level-5 label to every observation.
df = df.join(df_naf.set_index("APE_NIV5"), on="APE_NIV5")

# Level-5 codes present in the NAF table but absent from the data.
MissingCodes = set(df_naf["APE_NIV5"]) - set(df["APE_NIV5"])

# One synthetic row per missing code: the NAF label is used as the text and
# today's timestamp as the date. `.assign` builds a new frame, whereas writing
# through `.loc` on the filtered slice raises SettingWithCopyWarning.
Fake_obs = df_naf[df_naf.APE_NIV5.isin(MissingCodes)].assign(
    LIB_SICORE=lambda obs: obs["LIB_NIV5"],
    DATE=pd.Timestamp.today(),
)
# NOTE(review): the synthetic rows keep the NAF table's index labels, so the
# combined index may contain duplicates — confirm this is acceptable downstream.
df = pd.concat([df, Fake_obs])

In [6]:
# General preprocessing: keep only the modelling columns, then drop rows
# that lack the target or the text.
variables = [Y, TEXT_FEATURE]
if categorical_features is not None:
    variables.extend(categorical_features)
    # Missing categorical values are replaced by the literal string "NaN".
    df[categorical_features] = df[categorical_features].fillna(value="NaN")

# Every APE_NIV column except the one whose digit is the last character of Y
# (that one is already selected through `variables`).
target_level = Y[-1]
other_levels = [f"APE_NIV{i}" for i in range(1, 6) if str(i) != target_level]
df = df[variables + other_levels]

df = df.dropna(subset=[Y, TEXT_FEATURE])

In [3]:
# Preprocess data
# The project preprocessor receives the filtered frame together with the
# target name, the text column, the categorical columns and the oversampling
# setting read from the config. Its result is unpacked into three objects;
# by their names these are the train set, the test set and a third set
# (`df_gu`) whose content is not visible from this notebook — TODO confirm.
df_train, df_test, df_gu = preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
    oversampling=oversampling,
)

In [4]:
# Train a model on the training split with the parameters read from the
# config, then save it to disk.
# NOTE(review): the model is written to "temp3.bin", but the next cell loads
# "temp.bin" — confirm which file is meant to be evaluated.
model = trainer.train(df_train, Y, TEXT_FEATURE, categorical_features, params)
model.save_model("../data/temp3.bin")

Read 1M words
Number of words:  4722
Number of labels: 245
Progress: 100.0% words/sec/thread:   30052 lr:  0.000000 avg.loss:  0.877309 ETA:   0h 0m 0s


In [4]:
# Load a previously saved model from disk instead of retraining. The file was
# presumably produced by an earlier run of:
#     model = trainer.train(df_train, Y, TEXT_FEATURE, categorical_features, params)
#     model.save_model("../data/temp.bin")
# NOTE(review): when cells run top to bottom this replaces the `model` trained
# in the previous cell (saved as "../data/temp3.bin") with the contents of a
# different file, so the evaluation below scores the loaded model — confirm
# this is intended.
model = fasttext.load_model("../data/temp.bin")




In [89]:
# Evaluation with the project evaluator; wall-clock duration is reported.
print("*** Evaluating the model...\n")
t = time.time()

evaluator = FastTextEvaluator(model)
# The last positional argument (2) is presumably the number of predictions
# requested per observation — confirm against FastTextEvaluator.evaluate.
accuracies = evaluator.evaluate(df_test, Y, TEXT_FEATURE, categorical_features, 2)

elapsed_seconds = round(time.time() - t, 1)
print(f"Evaluation lasted {elapsed_seconds} seconds.\n")

*** Evaluating the model...

Evaluation lasted 2.1 seconds.



In [5]:
import numpy as np

In [87]:
# Evaluation through the notebook's own helpers: aggregate the top-2
# predictions at every level, then compute the accuracies.
# NOTE(review): `get_aggregated_preds` and `compute_accuracies` are defined in
# cells located further down, so this cell fails on a fresh top-to-bottom run.
print("*** Evaluating the model...\n")
t = time.time()

evaluator = FastTextEvaluator(model)
aggregated_preds = get_aggregated_preds(
    df_test, Y, TEXT_FEATURE, categorical_features, 2
)
accuracies = compute_accuracies(aggregated_preds, Y)

elapsed_seconds = round(time.time() - t, 1)
print(f"Evaluation lasted {elapsed_seconds} seconds.\n")

*** Evaluating the model...

Evaluation lasted 1.6 seconds.



In [6]:
def get_preds(
    df,
    y,
    text_feature,
    categorical_features,
    k,
    classifier=None):
    """
    Returns the `k` predictions of a model for every row of
    pd.DataFrame `df`, along with the output probabilities.

    Each row is turned into a single input string: the text feature
    followed by one ``" {feature}_{value}"`` token per categorical
    feature.

    Args:
        df (pd.DataFrame): Evaluation DataFrame.
        y (str): Name of the variable to predict. Not used by this
            function; kept so all helpers share the same signature.
        text_feature (str): Name of the text feature.
        categorical_features (Optional[List[str]]): Names of the
            categorical features.
        k (int): Number of predictions.
        classifier: Object exposing ``predict(texts, k=k)`` that returns
            a pair ``(labels, probabilities)`` holding one sequence per
            input text. Defaults to the notebook-level `model`, which
            must then already be defined.

    Returns:
        Dict[int, List[Tuple[str, Any]]]: For each rank in ``range(k)``,
            one ``(label, probability)`` pair per row of `df`, in row
            order, with the ``__label__`` prefix removed from the label.
    """
    if classifier is None:
        # Backward-compatible default: use the model held in the notebook's
        # global namespace, as the function originally did.
        classifier = model

    extra_features = (
        categorical_features if categorical_features is not None else []
    )

    libs = []
    for _, row in df.iterrows():
        formatted_item = row[text_feature]
        for feature in extra_features:
            formatted_item += f" {feature}_{row[feature]}"
        libs.append(formatted_item)

    labels, probabilities = classifier.predict(libs, k=k)
    # Distinct loop names avoid shadowing the `y` parameter.
    return {
        rank_pred: [
            (item_labels[rank_pred].replace("__label__", ""), item_probas[rank_pred])
            for item_labels, item_probas in zip(labels, probabilities)
        ]
        for rank_pred in range(k)
    }

In [86]:
def get_aggregated_preds(df, y, text_feature, categorical_features, k):
    """
    Returns `df` enriched with the model's `k` predictions, their
    probabilities, and the higher NAF levels of every predicted code.

    Args:
        df (pd.DataFrame): Evaluation DataFrame. Its ``APE_NIV{i}``
            columns (for ``i`` up to the target level) are renamed
            ``ground_truth_{i}`` in the result.
        y (str): Name of the variable to predict; its last character
            is read as the target NAF level.
        text_feature (str): Name of the text feature.
        categorical_features (Optional[List[str]]): Names of the
            categorical features.
        k (int): Number of predictions.

    Returns:
        pd.DataFrame: One row per row of `df`, in the same order and
            with the same index, holding the renamed columns of `df`,
            then ``predictions_{i}_k{r}`` for every level ``i`` up to
            the target level and every rank ``r`` in ``1..k``, then
            ``probabilities_k{r}``.

    Raises:
        FileNotFoundError: If ``naf_extended.csv`` is found neither in
            ``./data`` nor in ``../data``.
    """
    preds = get_preds(df, y, text_feature, categorical_features, k)
    level = int(y[-1])

    predicted_classes = {
        f"predictions_{level}_k{rank_pred+1}": [pred[0] for pred in preds[rank_pred]]
        for rank_pred in range(k)
    }
    probs_prediction = {
        f"probabilities_k{rank_pred+1}": [prob[1] for prob in preds[rank_pred]]
        for rank_pred in range(k)
    }

    # Predictions come back in row order, so they share the index of `df`.
    preds_df = pd.DataFrame(predicted_classes, index=df.index)
    proba_df = pd.DataFrame(probs_prediction, index=df.index)

    try:
        df_naf = pd.read_csv(r"./data/naf_extended.csv", dtype=str)
    except FileNotFoundError:
        df_naf = pd.read_csv(r"../data/naf_extended.csv", dtype=str)

    df_naf[["NIV3", "NIV4", "NIV5"]] = df_naf[["NIV3", "NIV4", "NIV5"]].apply(
        lambda x: x.str.replace(".", "", regex=False)
    )
    # Keep one row per code of the target level. Deduplicating the lookup
    # table (rather than dropping duplicated index labels after the join)
    # keeps the join one-to-one without discarding observations that
    # happen to share an index label.
    naf_levels = df_naf[[f"NIV{i}" for i in range(1, level + 1)]].drop_duplicates(
        subset=f"NIV{level}", keep="first"
    )

    for rank_pred in range(k):
        key = f"predictions_{level}_k{rank_pred+1}"
        lookup = naf_levels.rename(
            columns={
                f"NIV{i}": f"predictions_{i}_k{rank_pred+1}"
                for i in range(1, level + 1)
            }
        ).set_index(key)
        # Left join on the predicted code: adds the higher levels of that code.
        preds_df = preds_df.join(lookup, on=key)

    renamed = df.rename(
        columns={f"APE_NIV{i}": f"ground_truth_{i}" for i in range(1, level + 1)}
    )

    # Attach by position: the rows of `preds_df` and `proba_df` line up with
    # those of `df`, and this stays correct when index labels are not unique
    # (an index-based join would multiply such rows).
    new_columns = {name: preds_df[name].to_numpy() for name in preds_df.columns}
    new_columns.update(
        {name: proba_df[name].to_numpy() for name in proba_df.columns}
    )
    return renamed.assign(**new_columns)


In [8]:
def compute_accuracies(df, y):
    """
    Computes the accuracy of the first-ranked prediction at every level
    of the NAF classification, from level 1 up to the level of `y`.

    Args:
        df (pd.DataFrame): DataFrame holding, for every level ``i`` up
            to the target level, a ``predictions_{i}_k1`` column and a
            ``ground_truth_{i}`` column.
        y (str): Name of the variable to predict; its last character
            is read as the target level.

    Returns:
        Dict[str, float]: Accuracies dictionary, keyed
            ``accuracy_level_{i}``.
    """
    target_level = int(y[-1])

    accuracies = {}
    for current_level in range(1, target_level + 1):
        predicted = df[f"predictions_{current_level}_k1"]
        expected = df[f"ground_truth_{current_level}"]
        # Mean of the element-wise matches = share of correct predictions.
        accuracies[f"accuracy_level_{current_level}"] = np.mean(predicted == expected)

    return accuracies
