In [1]:
import s3fs
import hvac
import os

# Vault client authenticated with the token found in the environment.
client = hvac.Client(
    url='https://vault.lab.sspcloud.fr',
    token=os.environ['VAULT_TOKEN'],
)

# Full location of the secret; the part before the first '/' is passed as
# the KV mount point and the remainder as the path inside that mount.
secret = os.environ['VAULT_MOUNT'] + os.environ['VAULT_TOP_DIR'] + '/s3_creds'
mount_point, secret_path = secret.split('/', 1)
secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path,
    mount_point=mount_point,
)

# Export the key pair stored in the secret as AWS_* environment variables.
s3_creds = secret_dict['data']['data']
os.environ["AWS_ACCESS_KEY_ID"] = s3_creds['ACCESS_KEY']
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_creds['SECRET_KEY']

# Drop any session token already present (no error when it is absent);
# presumably so a stale token is not combined with the keys above.
os.environ.pop('AWS_SESSION_TOKEN', None)

In [65]:
import pickle
import json
import mlflow
import os
import sys
import re
import unidecode
from nltk.corpus import stopwords as ntlk_stopwords
from nltk.stem.snowball import SnowballStemmer
sys.path.append("../src/page_selection/")
from tqdm import tqdm
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split
from utils import (
    clean_page_content,
    extract_document_content,
    fit_transform_vectorizer,
    train_random_forest,
    get_numeric_char_rate
)
from model_wrapper import RandomForestWrapper

In [18]:
def clean_page_content2(page_content: str) -> str:
    """
    From a raw page content input as a string, return
    a clean string of stemmed tokens.

    Steps, in order: line breaks are turned into spaces, punctuation
    is stripped, words are lower-cased, French stopwords are dropped,
    the remaining words are stemmed with the French Snowball stemmer
    and accents are removed with unidecode. Digits are kept.

    Args:
        page_content (str): Content of a page.

    Returns:
        str: Cleaned tokens joined by single spaces (an empty string
        when no token survives the cleaning).
    """
    # Turn line breaks into spaces. Deleting them outright would glue the
    # last word of a line to the first word of the next one.
    content = re.sub(r"[\r\n]+", " ", page_content)
    # Remove punctuation
    content = re.sub(r"[^\w\s]", "", content)

    # Convert to lower case
    words = [word.lower() for word in content.split()]

    # Remove stopwords and stem; a set makes each membership test O(1)
    stopwords = set(ntlk_stopwords.words("french"))
    stemmer = SnowballStemmer(language="french")
    stemmed_words = [
        stemmer.stem(word)
        for word in words
        if word not in stopwords
    ]

    # Remove accents
    return " ".join(unidecode.unidecode(word) for word in stemmed_words)

In [6]:
# S3 filesystem handle targeting the MinIO endpoint.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://minio.lab.sspcloud.fr'}
)

# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load objects from a trusted bucket.
with fs.open(
    "s3://projet-extraction-tableaux/data/df_trainrf.pickle", 'rb'
) as pickle_file:
    df = pickle.load(pickle_file)

In [76]:
# Take the entry labelled 1 of the `text` column as a sample page
# on which to try the cleaning function.
text = df.text[1]
text

"GREFFE DU TRIBUNAL DE COMMERCE …… D'AMIENS Date : 16/07/2020 DEPOT DES COMPTES ANNUELS n° de dépôt : B2020/002813 n° de gestion : 1958B70037 n° SIREN : 005 820 378 RCS Amiens Le greffier du Tribunal de Commerce d'Amiens certifie avoir procédé le 16/07/2020 à un dépôt annexé au dossier du registre du commerce et des sociétés de : ENTREPRISE DEMOUSELLE Rue du Château d'Eau ZONE INDUSTRIELLE 80100 ABBEVILLE date de clôture : 31/12/2019 Ce dépôt comprend les documents comptables prévus par la législation en vigueur. Concernant les évènements RCS suivants : Dépôt des comptes annuels"

In [83]:
# Run the cleaning function on the sample page and display the result.
text_clean = clean_page_content2(text)
text_clean

'greff tribunal commerc damien dat 16072020 depot compt annuel depot b2020002813 gestion 1958b70037 siren 005 820 378 rc amien greffi tribunal commerc damien certif avoir proced 16072020 depot annex dossi registr commerc societ entrepris demousel ru chateau deau zon industriel 80100 abbevill dat clotur 31122019 depot comprend docu comptabl prevus legisl vigueur concern even rc suiv depot compt annuel'

In [84]:
# Numeric-character rate of the cleaned sample. The helper comes from
# `utils`; presumably it returns the share of digit characters - confirm.
get_numeric_char_rate(text_clean)

0.16569767441860464

In [81]:
def load_extra_labeled_data(
    path="s3://projet-extraction-tableaux/data/df_trainrf.pickle",
    n_rows=5,
):
    """
    Load labelled pages from a pickled DataFrame and derive, for each
    page, its cleaned text and numeric-character rate.

    The file is opened through the notebook-level `fs` filesystem.

    Args:
        path (str): Location of the pickled DataFrame. The DataFrame
            must expose `text` and `tableau_f_et_p` columns.
        n_rows (int | None): Number of leading rows to keep. Defaults
            to 5; pass None to keep every row.

    Returns:
        tuple: `(flat_corpus, valid_labels, num_rates)` where
        `flat_corpus` is the list of pages cleaned with
        `clean_page_content2`, `valid_labels` the list of values of
        the `tableau_f_et_p` column and `num_rates` the list of
        `get_numeric_char_rate` results for each cleaned page.
    """
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only point `path` at trusted objects.
    with fs.open(path, "rb") as f:
        labeled_df = pickle.load(f)

    if n_rows is not None:
        labeled_df = labeled_df.head(n_rows)

    flat_corpus = [clean_page_content2(page) for page in labeled_df.text]
    valid_labels = list(labeled_df.tableau_f_et_p)
    # The rate is computed on the cleaned text, not on the raw page.
    num_rates = [get_numeric_char_rate(content) for content in flat_corpus]

    return flat_corpus, valid_labels, num_rates

In [82]:
# Build the cleaned corpus, the labels and the numeric-character rates
# from the labelled data.
flat_corpus, valid_labels, num_rates = load_extra_labeled_data()

In [85]:
# Cleaned text of the second loaded page, for comparison with `text_clean`.
flat_corpus[1]

'greff tribunal commerc damien dat 16072020 depot compt annuel depot b2020002813 gestion 1958b70037 siren 005 820 378 rc amien greffi tribunal commerc damien certif avoir proced 16072020 depot annex dossi registr commerc societ entrepris demousel ru chateau deau zon industriel 80100 abbevill dat clotur 31122019 depot comprend docu comptabl prevus legisl vigueur concern even rc suiv depot compt annuel'

In [86]:
# Numeric-character rate of the second loaded page.
num_rates[1]

0.16569767441860464