In [1]:
import s3fs
import hvac
import os

# Read the S3 credentials stored in Vault and expose them through the
# AWS_* environment variables of this process.
client = hvac.Client(
    url='https://vault.lab.sspcloud.fr',
    token=os.environ['VAULT_TOKEN'],
)

# The full secret location is "<VAULT_MOUNT><VAULT_TOP_DIR>/s3_creds": everything
# before the first "/" is used as the mount point, the rest as the secret path.
secret = f"{os.environ['VAULT_MOUNT']}{os.environ['VAULT_TOP_DIR']}/s3_creds"
mount_point, secret_path = secret.split('/', 1)

secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path,
    mount_point=mount_point,
)
s3_creds = secret_dict['data']['data']
os.environ["AWS_ACCESS_KEY_ID"] = s3_creds['ACCESS_KEY']
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_creds['SECRET_KEY']

# Drop any session token that may already be set; nothing to do when it is absent.
os.environ.pop('AWS_SESSION_TOKEN', None)

In [2]:
import pickle
import json
import mlflow
import os
import sys
# Extend the module search path with project directories, relative to the
# notebook's working directory, so that the project-local imports below resolve.
# NOTE(review): presumably `utils` and `model_wrapper` live under one of these
# directories - confirm against the repository layout.
sys.path.append("../src/")
sys.path.append("../")
sys.path.append("../src/page_selection/")
from tqdm import tqdm
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split
from utils import (
    clean_page_content,
    extract_document_content,
    fit_transform_vectorizer,
    train_random_forest,
)
from model_wrapper import RandomForestWrapper

In [4]:
# TODO: clean up
# Load the labels file. The cells below iterate it as a mapping from a file name
# to a list of numeric labels.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
with open("../data/updated_labels_filtered.json", "r", encoding="utf-8") as fp:
    labels = json.load(fp)

In [5]:
# Select the documents to train on and gather their labels in one flat list,
# in the same order as the selected documents.
labeled_file_names = []
valid_labels = []

for file_name, file_labels in labels.items():
    table_count = sum(file_labels)
    if not table_count > 0:
        # Documents without any table are left out
        continue
    labeled_file_names.append(file_name)
    valid_labels.extend(file_labels)

# Number of documents that were kept
i = len(labeled_file_names)

In [6]:
# Display the names of the documents kept by the filter (at least one table).
labeled_file_names

['CA_322804147_3801_2004B00488_2019_B2020010356.pdf',
 'CA_316025394_9301_1988B01227_2019_11700.pdf',
 'CA_325166577_7401_2008B80133_2019_B2020012662-1.pdf',
 'CA_311962443_5402_2003B00803_2019_4173.pdf',
 'CA_334057361_9201_2014B06634_2019_40642.pdf',
 'CA_303284269_9301_2006B03196_2019_21880.pdf',
 'CA_322152851_7802_1989B00865_2019_3617.pdf',
 'CA_311362313_7801_1977B00667_2019_8833.pdf',
 'CA_057505562_3801_1957B00556_2019_B2020010102.pdf',
 'CA_329481196_7803_1998B01253_2019_9604.pdf',
 'CA_312379076_9201_1980B18083_2019_19932.pdf',
 'CA_331633545_6601_1985B00059_2019_B2020007499.pdf',
 'CA_311243794_7501_1977B07407_2019_99467.pdf',
 'CA_300265253_0901_1974B00013_2019_B2020001602.pdf',
 'CA_312212301_9201_1997B01668_2019_26180.pdf',
 'CA_300573193_9201_2011B05919_2019_21179.pdf',
 'CA_319730339_1402_1982B00122_2019_4905.pdf',
 'CA_303197586_9731_1973B00016_2019_1820.pdf',
 'CA_303091227_9712_1974B00127_2019_B2020001314.pdf',
 'CA_328619721_5910_1984B40008_2019_9923.pdf',
 'CA_3202

In [7]:
# Container filled by the extraction loop with one list of cleaned pages per document.
corpus = []

# Storage prefix under which the labeled PDF files are looked up.
S3_DOCUMENT_PREFIX = "projet-extraction-tableaux/raw-comptes/CS_extrait/"

# Turn bare file names into full storage paths. Names that already carry the
# prefix are left untouched, so running this cell more than once does not
# prepend the prefix repeatedly and produce invalid paths.
labeled_file_names = [
    file_name
    if file_name.startswith(S3_DOCUMENT_PREFIX)
    else S3_DOCUMENT_PREFIX + file_name
    for file_name in labeled_file_names
]

In [8]:
# Build the corpus: one entry per document, each entry being the list of that
# document's cleaned page contents, in page order.
# The corpus is reset here so that re-running this cell rebuilds it from scratch
# instead of appending every document a second time, which would break the
# one-to-one pairing between pages and `valid_labels` used for the split.
corpus = []
for file_name in tqdm(labeled_file_names):
    page_list = extract_document_content(file_name, resolution=50)
    clean_document_content = [clean_page_content(page) for page in page_list]
    corpus.append(clean_document_content)

100%|██████████| 36/36 [04:13<00:00,  7.05s/it]


In [10]:
# Display the cleaned page contents of the first document in the corpus.
corpus[0]

['vid',
 'g r e f f e u r i b u a e o e r e e g r e o b edat 14082020depot compt annuel depot b2020010356 gestion 2004b00488 siren rc grenoblel greffi tribunal commerc grenobl certif avoir proced depot annex dossi registr commerc societ fromager letoile38680 saintjustdeclaixdat clotur 31122019ce depot comprend docu comptabl prevus legisl vigueurconcern even rc suiv depot compt annuel',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'vid',
 'sas fromager letoil i bilan a bilan actif rubriqu mont brut amort prov net net capital souscr non appel immobilis incorporel frais detabl frais developp concess brevet droit similair fond commercial autr immobilis incorporel avanc acompt immo incorporel immobilis corporel terrain construct install techniqu mat outillag autr immobilis corporel immobilis cour avanc acompt immobilis financier 

In [11]:
# Flatten the per-document page lists into a single list of pages, keeping
# document order first and page order second.
flat_corpus = []
for document_pages in corpus:
    flat_corpus.extend(document_pages)

# Fit the vectorizer on all pages and transform them in one go.
vectorizer, vectorized_corpus = fit_transform_vectorizer(flat_corpus)

In [13]:
# Display the page at index 1 of the flattened corpus.
flat_corpus[1]

'g r e f f e u r i b u a e o e r e e g r e o b edat 14082020depot compt annuel depot b2020010356 gestion 2004b00488 siren rc grenoblel greffi tribunal commerc grenobl certif avoir proced depot annex dossi registr commerc societ fromager letoile38680 saintjustdeclaixdat clotur 31122019ce depot comprend docu comptabl prevus legisl vigueurconcern even rc suiv depot compt annuel'

In [14]:
# Display the vectorized row at index 1, i.e. the same position as the page shown from flat_corpus.
vectorized_corpus[1]

<1x24497 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [15]:
# Hold out 20% of the pages for evaluation; the fixed seed keeps the split
# identical from one run to the next.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_corpus,
    valid_labels,
    **split_options,
)

In [16]:
# Display the training part of the vectorized corpus produced by the split.
X_train

<1733x24497 sparse matrix of type '<class 'numpy.float64'>'
	with 96419 stored elements in Compressed Sparse Row format>

In [17]:
# Hyper-parameters handed to the project's random forest training helper.
# NOTE(review): no seed is passed here; whether train_random_forest fixes one
# itself cannot be seen from this notebook - confirm if reproducibility matters.
params = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

# The helper returns three values: the classifier, a description and the training time.
training_output = train_random_forest(params, X_train, y_train)
clf, clf_descr, train_time = training_output

In [18]:
# Display the classifier object returned by train_random_forest.
clf

In [19]:
# Time the prediction over the held-out pages
prediction_start = time()
pred = clf.predict(X_test)
test_time = time() - prediction_start

# Share of held-out pages whose predicted label equals the true label
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)

In [20]:
# Display the accuracy measured on the held-out test split.
score

0.9746543778801844