In [None]:
import sys
import time

import mlflow
import pandas as pd
import yaml
import fasttext

from constants import TEXT_FEATURE
from fasttext_classifier.fasttext_evaluator import FastTextEvaluator
from fasttext_classifier.fasttext_preprocessor import FastTextPreprocessor
from fasttext_classifier.fasttext_trainer import FastTextTrainer
from fasttext_classifier.fasttext_wrapper import FastTextWrapper
from utils import get_root_path

In [None]:
# Location of the YAML run configuration; it is joined onto get_root_path()
# below, whereas the parquet input is addressed relative to the working directory.
config_path = "config/config_fasttext39.yaml"
preprocessor = FastTextPreprocessor()

print("*** Preprocessing the database...\n")

# The input table is stored as a parquet file and read with the pyarrow engine.
df = pd.read_parquet(
    "../data/extraction_sirene_20220712_harmonised.parquet",
    engine="pyarrow",
)

# Parse the YAML configuration, then pull out the entries used in this notebook.
with open(get_root_path() / config_path, "r") as stream:
    config = yaml.safe_load(stream)

params = config["params"]
categorical_features = config["categorical_features"]
# Only element 0 of the configured "Y" entry is used as the target name.
Y = config["Y"][0]
# Oversampling is fixed here; the "oversampling" entry of the config
# (config["oversampling"]) is deliberately not read.
oversampling = dict(threshold=1)

# Run the preprocessor; its result is unpacked into three objects
# (train split, test split, and the "gu" set used by later cells).
df_train, df_test, df_gu = preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
    oversampling=oversampling,
)

mc cp minio/projet-ape/mlflow-artifacts/1/5490ebb3b62a43e494517f819cf20322/artifacts/default/artifacts/default.bin models/model.bin

In [None]:
# Path of the fastText binary, relative to the working directory. The file is
# expected to exist already (see the `mc cp` command listed earlier in the notebook).
model_path = "../models/model.bin"

# Load the model and hand it to the project's evaluator.
model = fasttext.load_model(model_path)
evaluator = FastTextEvaluator(model)

In [None]:
# Aggregated predictions for the GU set and for the test set, computed with
# identical arguments. The generator is consumed left to right by the tuple
# unpacking, so the GU call runs first and the test call second.
# NOTE(review): the trailing positional 5 is interpreted by
# FastTextEvaluator.get_aggregated_preds — confirm its meaning there.
gu, test = (
    evaluator.get_aggregated_preds(frame, Y, TEXT_FEATURE, categorical_features, 5)
    for frame in (df_gu, df_test)
)

In [None]:
# Write both prediction tables to CSV under ../data, test set first and GU set
# second. The file names embed the identifier 5490ebb3b62a43e494517f819cf20322,
# which also appears in the `mc cp` commands of this notebook.
for table, csv_path in (
    (test, "../data/predictions_test_5490ebb3b62a43e494517f819cf20322.csv"),
    (gu, "../data/predictions_GU_5490ebb3b62a43e494517f819cf20322.csv"),
):
    table.to_csv(csv_path)

mc cp data/predictions_test_5490ebb3b62a43e494517f819cf20322.csv minio/projet-ape/data/predictions_test_5490ebb3b62a43e494517f819cf20322.csv

mc cp data/predictions_GU_5490ebb3b62a43e494517f819cf20322.csv minio/projet-ape/data/predictions_GU_5490ebb3b62a43e494517f819cf20322.csv