In [1]:
import s3fs
import hvac
import os

# Read the S3 key pair from Vault and publish it through the AWS_* environment
# variables.
client = hvac.Client(
    url='https://vault.lab.sspcloud.fr',
    token=os.environ['VAULT_TOKEN'],
)

# Everything before the first '/' is the mount point; the rest is the secret path.
secret = os.environ['VAULT_MOUNT'] + os.environ['VAULT_TOP_DIR'] + '/s3_creds'
mount_point, secret_path = secret.split('/', 1)
secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path, mount_point=mount_point
)

s3_creds = secret_dict['data']['data']
os.environ["AWS_ACCESS_KEY_ID"] = s3_creds['ACCESS_KEY']
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_creds['SECRET_KEY']

# Remove any AWS_SESSION_TOKEN already present; a missing variable is not an error.
os.environ.pop('AWS_SESSION_TOKEN', None)

In [2]:
import pickle
import json
import mlflow
import os
import sys
sys.path.append("../src/")
sys.path.append("../")
sys.path.append("../src/page_selection/")
from tqdm import tqdm
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split
from utils import (
    clean_page_content,
    extract_document_content,
    fit_transform_vectorizer,
    train_random_forest,
)
from model_wrapper import RandomForestWrapper

### Nos données annotées

In [None]:
# TODO: clean up
# Load the annotation file; it is consumed below as a mapping
# file name -> list of labels.
with open("../data/updated_labels_filtered.json", "r") as labels_file:
    labels = json.load(labels_file)

In [None]:
# Select the annotated documents to keep and flatten their labels into one
# list, preserving document order.
labeled_file_names = []
valid_labels = []

for file_name, file_labels in labels.items():
    # Keep documents with at least 1 table (i.e. the sum of the labels is positive)
    if sum(file_labels) > 0:
        labeled_file_names.append(file_name)
        valid_labels.extend(file_labels)

In [None]:
corpus = []

# Prepend the storage prefix to every file name. Names that already carry the
# prefix are left untouched, so re-running this cell does not prefix them twice.
S3_PREFIX = "projet-extraction-tableaux/raw-comptes/CS_extrait/"
labeled_file_names = [
    file_name if file_name.startswith(S3_PREFIX) else S3_PREFIX + file_name
    for file_name in labeled_file_names
]

In [None]:
# Start from an empty corpus so that re-running this cell does not append
# every document a second time (which would break the alignment with
# valid_labels).
corpus = []
for file_name in tqdm(labeled_file_names):
    page_list = extract_document_content(file_name, resolution=200)
    # One list of cleaned page contents per document.
    corpus.append([clean_page_content(page) for page in page_list])

In [None]:
# Inspect the cleaned pages of the first document.
corpus[0]

In [None]:
# Flatten the per-document page lists into a single list of pages.
flat_corpus = [page for document in corpus for page in document]

In [None]:
# Peek at the first five flattened pages.
flat_corpus[:5]

In [None]:
# Peek at the first five labels.
valid_labels[:5]

In [None]:
# Sanity check: there should be exactly one label per flattened page.
len(flat_corpus) == len(valid_labels)

### Données Adem + Laura

In [3]:
# Load the pickled training DataFrame from the MinIO S3 endpoint.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load objects from a trusted bucket.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://minio.lab.sspcloud.fr'}
)
with fs.open("s3://projet-extraction-tableaux/data/df_trainrf.pickle", 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Display the loaded DataFrame.
df

Unnamed: 0,siren,pagenumber,text,tableau_f_et_p,accOCR,tableauPDF
0,5820378,1,RCS : AMIENS Code greffe : 8002 Documents comp...,0,857764666501213,1
1,5820378,2,GREFFE DU TRIBUNAL DE COMMERCE …… D'AMIENS Dat...,0,916485136349996,1
2,5820378,3,...,0,,1
3,5820378,4,...,0,,1
4,5820378,5,...,0,,1
...,...,...,...,...,...,...
34906,998823504,45,4.9. Analyse et échéance des dettes\r\r\n...,0,,1
34907,998823504,46,- de remboursement de pénalités et intérêts...,0,,1
34908,998823504,47,6.2.2. ...,0,,1
34909,998823504,48,ADECCO FRANCE Société par actions simplifiée a...,0,917648922486163,1


In [None]:
# Page texts and their labels from the DataFrame, as plain lists.
flat_corpus2 = list(df["text"])
valid_labels2 = list(df["tableau_f_et_p"])

In [None]:
# Sanity check: one label per page text.
len(flat_corpus2) == len(valid_labels2)

In [None]:
# Peek at the first five page texts.
flat_corpus2[:5]

In [None]:
# Peek at the first five labels.
valid_labels2[:5]

In [None]:
# Clean every page text. The list is rebuilt from df.text rather than from
# flat_corpus2 itself, so re-running this cell never cleans already-cleaned text.
flat_corpus2 = [clean_page_content(page) for page in df.text]

In [None]:
# Peek at the first five page texts after cleaning.
flat_corpus2[:5]

#### Alternative way of splitting train / test

In [None]:
# Split the sirens that have at least one page labelled 1 into a train group
# and a test group. The seed is fixed (same value as the other splits in this
# notebook) so the split is reproducible across runs.
train_siren, test_siren = train_test_split(
    df[df.tableau_f_et_p == 1]["siren"].unique(),
    random_state=42,
)

In [None]:
# Rows whose siren belongs to each group; rows whose siren is in neither
# group are left out of both frames.
train_df = df.loc[df["siren"].isin(train_siren)]
test_df = df.loc[df["siren"].isin(test_siren)]

In [None]:
# Training texts and labels as plain lists.
# NOTE(review): the text is used as stored in df; clean_page_content is not
# applied here -- confirm that this is intended.
train_corpus = list(train_df["text"])
train_labels = list(train_df["tableau_f_et_p"])

In [None]:
# Build the vectorizer from the training texts only and keep their vectorized form.
vectorizer, vectorized_corpus = fit_transform_vectorizer(train_corpus)

In [None]:
# Test texts and labels as plain lists.
# NOTE(review): as for the training texts, clean_page_content is not applied
# here -- confirm that this is intended.
test_corpus = list(test_df["text"])
test_labels = list(test_df["tableau_f_et_p"])

# Reuse the vectorizer built on the training texts; it is not refit here.
vectorized_test = vectorizer.transform(test_corpus)

In [None]:
# Training classifier
params = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

# Fit on the vectorized training texts; the helper also returns a description
# and the training time.
clf, clf_descr, train_time = train_random_forest(params, vectorized_corpus, train_labels)

In [None]:
# Test time: wall-clock duration of the prediction call
t0 = time()
pred = clf.predict(vectorized_test)
test_time = time() - t0

# Score: same four metrics, computed in the same order, against the test labels
accuracy, f1, precision, recall = (
    scorer(test_labels, pred)
    for scorer in (
        metrics.accuracy_score,
        metrics.f1_score,
        metrics.precision_score,
        metrics.recall_score,
    )
)
cm = metrics.confusion_matrix(test_labels, pred)

In [None]:
# Display the four scores computed in the previous cell.
accuracy, f1, precision, recall

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Heatmap of the confusion matrix `cm`, drawn on an explicitly created
# figure/axes pair rather than through the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# labels, title and ticks
# (trailing semicolon keeps the notebook from echoing the return value)
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
);

### Flattening, vectorizing

On procède différemment ici, mais il y a un problème : le vectorizer est ajusté (fit) sur l'ensemble du corpus, données de test comprises, ce qui ne devrait pas être le cas (fuite de données entre train et test).

In [None]:
# The vectorizer is built from the whole corpus here, before the train/test
# split performed in the next cell, so pages that end up in the test set take
# part in building it.
vectorizer, vectorized_corpus = fit_transform_vectorizer(flat_corpus)

In [None]:
# 80/20 split of the already-vectorized corpus and its labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_corpus, valid_labels, test_size=0.2, random_state=42
)

In [None]:
# Display the training part of the vectorized corpus.
X_train

In [None]:
# Training classifier
params = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

# Fit on the training split; the helper also returns a description and the
# training time.
clf, clf_descr, train_time = train_random_forest(params, X_train, y_train)

In [None]:
# Display the trained classifier.
clf

In [None]:
# Test time: wall-clock duration of the prediction call
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0

# Score: same four metrics, computed in the same order, against y_test
accuracy, f1, precision, recall = (
    scorer(y_test, pred)
    for scorer in (
        metrics.accuracy_score,
        metrics.f1_score,
        metrics.precision_score,
        metrics.recall_score,
    )
)
cm = metrics.confusion_matrix(y_test, pred)

In [None]:
# Display the four scores computed in the previous cell.
accuracy, f1, precision, recall

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Heatmap of the confusion matrix `cm`, drawn on an explicitly created
# figure/axes pair rather than through the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# labels, title and ticks
# (trailing semicolon keeps the notebook from echoing the return value)
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
);

### Test updated train script

In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [20]:
from utils import load_labeled_data, load_extra_labeled_data

# Load both labelled datasets and append the extra pages to the main corpus.
flat_corpus, valid_labels = load_labeled_data()
flat_corpus_extra, valid_labels_extra = load_extra_labeled_data()
flat_corpus += flat_corpus_extra
# NOTE(review): this appends valid_labels to itself instead of appending
# valid_labels_extra -- looks like a typo. The next cell trims the list back
# before adding the extra labels, so the two cells must be changed together.
valid_labels += valid_labels

100%|██████████| 36/36 [03:21<00:00,  5.60s/it]


In [21]:
# Rebuild the label list so that it lines up with flat_corpus, whatever was
# appended to valid_labels in the loading cell. The number of base labels is
# derived from the corpus length instead of halving the list, which makes this
# cell safe to re-run (halving a second time would silently drop labels).
n_base_labels = len(flat_corpus) - len(valid_labels_extra)
valid_labels = list(valid_labels[:n_base_labels]) + list(valid_labels_extra)

# Fail here, with a clear message, rather than later inside train_test_split.
assert len(valid_labels) == len(flat_corpus), "labels and pages are misaligned"

In [22]:
# 80/20 split of the page contents and their labels, with a fixed seed; the
# split happens before any vectorizer is fit.
train_corpus, test_corpus, y_train, y_test = train_test_split(
    flat_corpus, valid_labels, test_size=0.2, random_state=42
)

In [23]:
# Add new feature : rate of numeric characters
# NOTE(review): get_numeric_char_rate is not imported in any visible cell --
# confirm which module defines it and import it before this cell.
#
# The rates are computed directly from the already-split corpora, so each value
# is aligned with its page by construction instead of relying on a second
# train_test_split call reproducing exactly the same shuffle.
train_num_rates = [get_numeric_char_rate(content) for content in train_corpus]
test_num_rates = [get_numeric_char_rate(content) for content in test_corpus]

In [24]:
# Number of training rates; should equal the number of training pages shown in the next cell.
len(train_num_rates)

29662

In [25]:
# Number of training pages.
len(train_corpus)

29662

In [26]:
# Peek at the first five training pages.
train_corpus[:5]

['respons lentrepris e pi lot g e e a e a r h e e h i q u e co pl i a e e r e polit outil dialogu part pren mis oeuvr demarch rse group accor sappui dialogu frequent part pren essentiel dialogu inherent conduit activ group procedur group dialogu part pren precis champ dappliqu mis oeuvr respons tous niveau group mondial regional local ains moyen utilis renforc dialogu rendr compt part pren nombreux partenariat accor entretient ex ecpat purprojet energy observ community conserv fund afric the camp participent egal cet polit dialogu gouvern lethiqu complianc rse deploi anim u25cf prend connaiss annuel rapport comit chart ethiqu rse ethiqu rse present chart sappliqu lensembl collabor comit daud complianc risqu tient group accor mond cestadir lensembl moin trois reunion an chaqu reunion lopportun collabor permanent occasionnel societ accor tout mesur necessair dun point deploi societ quel control cet chart egal disposit complianc polit anticorrupt applic collabor permanent occasionnel syst

In [27]:
# Peek at the first five numeric-character rates of the training set.
train_num_rates[:5]

[0.008210180623973728,
 0.0,
 0.004850444624090542,
 0.18649193548387097,
 0.007512019230769231]

In [30]:
# Spot check: recompute the rate of the third training page, to compare with train_num_rates[2].
get_numeric_char_rate(train_corpus[2])

0.004850444624090542

In [31]:
# Build the vectorizer from the training pages only and keep their vectorized form.
vectorizer, X_train = fit_transform_vectorizer(train_corpus)

# Training classifier
params = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

In [33]:
# Display the vectorized training pages.
X_train

<29662x174618 sparse matrix of type '<class 'numpy.float64'>'
	with 3298548 stored elements in Compressed Sparse Row format>

In [36]:
from scipy import sparse
import numpy as np

In [37]:
# Turn the rates into a single column and append it to the right of X_train.
# The stacked matrix is only displayed here, not stored.
num_rate_column = np.array(train_num_rates).reshape(-1, 1)
sparse.hstack((X_train, num_rate_column))

<29662x174619 sparse matrix of type '<class 'numpy.float64'>'
	with 3319600 stored elements in COOrdinate format>