In [1]:
import sys
import time

import mlflow
import pandas as pd
import yaml
import fasttext

from constants import TEXT_FEATURE
from fasttext_classifier.fasttext_evaluator import FastTextEvaluator
from fasttext_classifier.fasttext_preprocessor import FastTextPreprocessor
from fasttext_classifier.fasttext_trainer import FastTextTrainer
from fasttext_classifier.fasttext_wrapper import FastTextWrapper
from utils import get_root_path

In [2]:
# Experiment configuration, resolved against the project root returned by
# get_root_path() when it is opened below.
config_path = "config/config_fasttext27.yaml"

preprocessor = FastTextPreprocessor()

print("*** Preprocessing the database...\n")
# Load data, assumed to be stored in a .parquet file.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the config path which is anchored on get_root_path() - confirm this is intended.
df = pd.read_parquet(
    "../data/extraction_sirene_20220712_harmonized_20221014.parquet", engine="pyarrow"
)

# Read the YAML with an explicit encoding so that parsing does not depend on
# the locale default of the machine running the notebook.
with open(get_root_path() / config_path, "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

# Unpack the configuration entries used in this notebook.
params = config["params"]
categorical_features = config["categorical_features"]
Y = config["Y"][0]  # "Y" is a list in the config; only its first entry is used
oversampling = config["oversampling"]

# Preprocess data: the preprocessor returns three frames, bound here to the
# train, test and "gu" names used by the following cells.
df_train, df_test, df_gu = preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
    oversampling=oversampling,
)

*** Preprocessing the database...

	*** 5 missing codes have been added in the database.

	*** 2 missing codes have been added in the train database...



mc cp minio/projet-ape/mlflow-artifacts/6/4e5c4673cbbd412e91456b3443e8dabe/artifacts/default/artifacts/default.bin models/model.bin

In [3]:
# Location of the FastText binary, relative to the kernel's working directory.
model_file = "../models/model.bin"

# Load the trained model and hand it to the evaluator used in the next cells.
model = fasttext.load_model(model_file)
evaluator = FastTextEvaluator(model)



In [4]:
# Both evaluation sets are scored with exactly the same trailing arguments.
# NOTE(review): the final 5 is presumably the number of predictions kept per
# row - confirm against FastTextEvaluator.get_aggregated_preds.
shared_args = (Y, TEXT_FEATURE, categorical_features, 5)

gu = evaluator.get_aggregated_preds(df_gu, *shared_args)
test = evaluator.get_aggregated_preds(df_test, *shared_args)

In [None]:
# Identifier embedded in the prediction file names. It matches the identifier
# in the artifact path the model binary was copied from; keeping it in one
# place stops the two file names drifting apart when another model is evaluated.
run_id = "4e5c4673cbbd412e91456b3443e8dabe"

# Write both prediction tables as CSV (default to_csv options, so the index
# is written as the first column).
test.to_csv(f"../data/predictions_test_{run_id}.csv")
gu.to_csv(f"../data/predictions_GU_{run_id}.csv")

mc cp data/predictions_test_4e5c4673cbbd412e91456b3443e8dabe.csv minio/projet-ape/data/predictions_test_4e5c4673cbbd412e91456b3443e8dabe.csv

mc cp data/predictions_GU_4e5c4673cbbd412e91456b3443e8dabe.csv minio/projet-ape/data/predictions_GU_4e5c4673cbbd412e91456b3443e8dabe.csv