# Construction des fichiers pour la définition du pipeline de processing textuel

Ce notebook permet d'assembler les fichiers text_process.csv des datasets 1, 3 et 4 en un seul fichier, qui nous servira pour la définition du pipeline de processing textuel.


In [2]:
# Switch the working directory to the parent of the folder the notebook is
# started from, so that the relative "data/..." paths used in the following
# cells are resolved from there (presumably the project root -- confirm).
import os
# IPython magic: returns the current working directory.
current_dir = %pwd
project_dir = os.path.dirname(current_dir)
# IPython magic: change directory; `$project_dir` expands the Python variable.
%cd $project_dir

/data/dhryniewski/DataScientest/doc-classifier


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Dans le dossier "data", nous allons créer un dossier "processed" dans lequel nous stockerons les fichiers.

In [3]:
import os
import pandas as pd

# Make sure the output folder for the assembled files is available and
# report whether it had to be created.
processed_dir = "data/processed"
if os.path.exists(processed_dir):
    print("Folder already exists: 'data/processed'")
else:
    os.makedirs(processed_dir)
    print("Folder created: 'data/processed'")

Folder already exists: 'data/processed'


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Ce code permet de concaténer les 3 fichiers text_process.csv en ajoutant la colonne 'data', qui indique le jeu de données d'origine de chaque image.


In [4]:
# Load the text_process.csv of each dataset, tag every row with the name of
# the dataset it comes from, stack the frames and save the result.
dataset_names = ["data_01", "data_03", "data_04"]
tagged_frames = [
    pd.read_csv(f"data/raw/{name}/text_process.csv").assign(data=name)
    for name in dataset_names
]

frame_words = pd.concat(tagged_frames, axis=0, ignore_index=True)
frame_words.to_csv("data/processed/words.csv", index=False)

frame_words.head()

Unnamed: 0,words,lang,lang_score,file_name,category,data
0,hazleton laboratory anerca fmc inuoioe ohon mi...,en,0.857141,image_0000297,invoice,data_01
1,ragnar rylander invoice consultant fee travel ...,en,0.857138,image_0000313,invoice,data_01
2,please remit box chicago invoice invoice date ...,en,0.999994,image_0000505,invoice,data_01
3,purchase requisition originator copt sheet rtf...,en,0.999995,image_0000269,invoice,data_01
4,hbi healthy building international inc covingt...,en,0.999995,image_0000020,invoice,data_01


Ce code permet de concaténer les 3 fichiers text_process.csv en ne gardant que les colonnes 'words' et 'category' et en ajoutant la colonne 'count', qui correspond au nombre de mots dans le texte.


In [5]:
# Build a frame holding only the text and its label, plus the number of
# whitespace-separated tokens found in each text.
dataset_names = ["data_01", "data_03", "data_04"]
counted_frames = []

for name in dataset_names:
    subset = pd.read_csv(
        f"data/raw/{name}/text_process.csv", usecols=['words', 'category']
    )
    # Missing texts become empty strings so that they count as zero words.
    subset['words'] = subset['words'].fillna('')
    word_counts = subset['words'].apply(lambda text: len(text.split()))
    # 'count' is inserted as the second column of the frame.
    subset.insert(loc=1, column='count', value=word_counts)
    counted_frames.append(subset)

frame_count = pd.concat(counted_frames, axis=0, ignore_index=True)
# Note: this frame is not written to "data/processed/count_words.csv".
frame_count.head()

Unnamed: 0,words,count,category
0,hazleton laboratory anerca fmc inuoioe ohon mi...,76,invoice
1,ragnar rylander invoice consultant fee travel ...,12,invoice
2,please remit box chicago invoice invoice date ...,105,invoice
3,purchase requisition originator copt sheet rtf...,138,invoice
4,hbi healthy building international inc covingt...,69,invoice


Sélection des catégories définies:

In [6]:
# Keep only the rows whose label is one of the retained categories and
# renumber them from 0.
selected_categories = [
    'passeport', 'national_identity_card', 'email',
    'invoice', 'scientific_publication', 'handwritten',
]
in_selection = frame_count["category"].isin(selected_categories)
data = frame_count[in_selection].copy().reset_index(drop=True)
data

Unnamed: 0,words,count,category
0,hazleton laboratory anerca fmc inuoioe ohon mi...,76,invoice
1,ragnar rylander invoice consultant fee travel ...,12,invoice
2,please remit box chicago invoice invoice date ...,105,invoice
3,purchase requisition originator copt sheet rtf...,138,invoice
4,hbi healthy building international inc covingt...,69,invoice
...,...,...,...
7104,give name inate burg van stad dorp nldspeci nl...,12,national_identity_card
7105,romania identitate romania nume manole tdentit...,28,national_identity_card
7106,issue authority mun bucuresti sec mare idrousp...,8,national_identity_card
7107,roumanie romania didentite identitate identity...,34,national_identity_card


Création d'un dictionnaire avec les mots les plus communs pour chaque catégorie.

In [8]:
from collections import Counter
from src import save_jsonfile

# For every category, count how often each word occurs across all of its
# documents and keep the (word, count) pairs ranked from most to least
# frequent. The ranking is then saved as JSON.
dic_counter = {}
for cat in data['category'].unique():
    df_cat = data[data['category'] == cat]
    cat_count = Counter()
    for text in df_cat['words'].values:
        # split() with no separator collapses runs of whitespace and yields
        # nothing for an empty text. Splitting the joined texts on ' ' instead
        # would count the empty string as a word for every empty document or
        # doubled space, and that token could end up among the top keywords.
        cat_count.update(text.split())
    dic_counter[cat] = cat_count.most_common()

save_jsonfile('data/processed/most_common_words.json', dic_counter, indent=4)

Création de la structure de chaque document, en calculant la diversité lexicale et la densité des mots-clefs pour chaque catégorie et pour les 5, 10, 25 et 50 mots les plus communs.

In [9]:
from src import calculate_lexical_diversity, calculate_keyword_density

# Per-document features: one lexical-diversity column, then one
# keyword-density column for each (category, top-k) pair, where the keywords
# are the k first entries of that category's ranking in dic_counter.
data['lexical_diversity'] = data['words'].apply(calculate_lexical_diversity)

top_k_values = [5, 10, 25, 50]
for category in data['category'].unique():
    ranked_words = dic_counter[category]
    for top_k in top_k_values:
        keywords = [entry[0] for entry in ranked_words[:top_k]]
        column_name = f"keyword_{category}_{top_k}"
        data[column_name] = data['words'].apply(
            calculate_keyword_density, keywords=keywords
        )

data.to_csv("data/processed/words_structure.csv", index=False)

In [13]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,words,count,category,lexical_diversity,keyword_invoice_5,keyword_invoice_10,keyword_invoice_25,keyword_invoice_50,keyword_handwritten_5,keyword_handwritten_10,...,keyword_scientific_publication_25,keyword_scientific_publication_50,keyword_passeport_5,keyword_passeport_10,keyword_passeport_25,keyword_passeport_50,keyword_national_identity_card_5,keyword_national_identity_card_10,keyword_national_identity_card_25,keyword_national_identity_card_50
0,hazleton laboratory anerca fmc inuoioe ohon mi...,76,invoice,0.894737,0.039474,0.092105,0.210526,0.25,0.039474,0.039474,...,0.026316,0.039474,0.0,0.0,0.013158,0.013158,0.0,0.0,0.026316,0.026316
1,ragnar rylander invoice consultant fee travel ...,12,invoice,1.0,0.166667,0.166667,0.25,0.416667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,please remit box chicago invoice invoice date ...,105,invoice,0.67619,0.085714,0.114286,0.2,0.247619,0.028571,0.028571,...,0.0,0.028571,0.009524,0.009524,0.019048,0.019048,0.009524,0.009524,0.019048,0.019048
3,purchase requisition originator copt sheet rtf...,138,invoice,0.702899,0.050725,0.07971,0.130435,0.173913,0.028986,0.043478,...,0.014493,0.014493,0.043478,0.043478,0.057971,0.065217,0.043478,0.043478,0.043478,0.043478
4,hbi healthy building international inc covingt...,69,invoice,0.782609,0.072464,0.115942,0.188406,0.217391,0.0,0.0,...,0.0,0.0,0.028986,0.028986,0.057971,0.057971,0.028986,0.028986,0.028986,0.028986
