In [2]:
import sys
import time

import mlflow
import pandas as pd
import yaml
import fasttext

from constants import TEXT_FEATURE
from fasttext_classifier.fasttext_evaluator import FastTextEvaluator
from fasttext_classifier.fasttext_preprocessor import FastTextPreprocessor
from fasttext_classifier.fasttext_trainer import FastTextTrainer
from fasttext_classifier.fasttext_wrapper import FastTextWrapper
from utils import get_root_path


In [3]:
config_path = "config/config_fasttext27.yaml"

preprocessor = FastTextPreprocessor()

# Read the run configuration first: it is a small file, so a missing key or a
# malformed YAML document is reported before the (much larger) parquet load.
# The encoding is pinned explicitly; without it open() falls back to the
# platform locale, which can mis-decode non-ASCII characters in the config.
with open(get_root_path() / config_path, "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)
params = config["params"]  # not used by the cells visible in this notebook excerpt
categorical_features = config["categorical_features"]
Y = config["Y"][0]  # only the first entry of the "Y" list is used as the target
oversampling = config["oversampling"]

print("*** Preprocessing the database...\n")
# Load data, assumed to be stored in a .parquet file
df = pd.read_parquet("../data/extraction_sirene_20220712_harmonised.parquet", engine="pyarrow")

# Preprocess data: the preprocessor returns three frames, unpacked here as
# train, test and "gu" sets that the later cells predict on.
df_train, df_test, df_gu = preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
    oversampling=oversampling,
)

*** Preprocessing the database...



- Model 1
    `mc cp minio/projet-ape/mlflow-artifacts/1/533b0455ad9e4ba2abc3806a6a5867d6/artifacts/default/artifacts/default.bin models/model1.bin`

- Model 2
    `mc cp minio/projet-ape/mlflow-artifacts/1/d59e23a319fe40e0afed1c79172ba9bd/artifacts/default/artifacts/default.bin models/model2.bin`

- Model 3
    `mc cp minio/projet-ape/mlflow-artifacts/1/62b2c9295f3a48b3b81872e6db560bd5/artifacts/default/artifacts/default.bin models/model3.bin`

- Model 4
    `mc cp minio/projet-ape/mlflow-artifacts/1/cdde7a391d0c423499b64ccdf5ec941a/artifacts/default/artifacts/default.bin models/model4.bin`

In [5]:
# Model 4: read the fastText binary from disk, then wrap it in an evaluator
# so the prediction cells further down can call it.
model4_bin = "../models/model4.bin"
model4 = fasttext.load_model(model4_bin)
evaluator4 = FastTextEvaluator(model4)



In [6]:
# Model 2: read the fastText binary from disk, then wrap it in an evaluator
# so the prediction cells further down can call it.
model2_bin = "../models/model2.bin"
model2 = fasttext.load_model(model2_bin)
evaluator2 = FastTextEvaluator(model2)



In [7]:
def _aggregate_gu_and_test(evaluator):
    """Return ``(gu_preds, test_preds)`` produced by one evaluator.

    Calls ``get_aggregated_preds`` on ``df_gu`` first and ``df_test`` second,
    forwarding the same target, text feature and categorical features each
    time. The trailing ``5`` is passed through unchanged.
    NOTE(review): presumably the number of predictions kept per row - confirm
    against ``FastTextEvaluator.get_aggregated_preds``.
    """
    return tuple(
        evaluator.get_aggregated_preds(frame, Y, TEXT_FEATURE, categorical_features, 5)
        for frame in (df_gu, df_test)
    )


# Same evaluation order as before: model 2 (gu, test), then model 4 (gu, test).
gu2, test2 = _aggregate_gu_and_test(evaluator2)
gu4, test4 = _aggregate_gu_and_test(evaluator4)

In [9]:
# Write every prediction table to ../data as CSV. Each file name carries the
# same hexadecimal identifier that appears in the corresponding model's
# artifact path (model 2 -> d59e..., model 4 -> cdde...).
prediction_exports = (
    (test2, "../data/predictions_test_d59e23a319fe40e0afed1c79172ba9bd.csv"),
    (gu2, "../data/predictions_GU_d59e23a319fe40e0afed1c79172ba9bd.csv"),
    (test4, "../data/predictions_test_cdde7a391d0c423499b64ccdf5ec941a.csv"),
    (gu4, "../data/predictions_GU_cdde7a391d0c423499b64ccdf5ec941a.csv"),
)
for predictions, destination in prediction_exports:
    predictions.to_csv(destination)

```sh
mc cp data/predictions_test_cdde7a391d0c423499b64ccdf5ec941a.csv minio/projet-ape/data/predictions_test_cdde7a391d0c423499b64ccdf5ec941a.csv

mc cp data/predictions_GU_cdde7a391d0c423499b64ccdf5ec941a.csv minio/projet-ape/data/predictions_GU_cdde7a391d0c423499b64ccdf5ec941a.csv
```