# Construction des fichiers pour la définition du pipeline de processing textuel

Ce notebook assemble les fichiers `text_process.csv` des datasets 1, 3 et 4 en un seul fichier, qui nous servira à définir le pipeline de processing textuel.


In [1]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in, so that the relative paths used by the later cells
# ("data/raw/...", "data/processed/...") resolve.
# The target is computed only on the first execution: previously, re-running
# this cell climbed one more directory level each time, which silently broke
# every relative path used afterwards.
if "project_dir" not in globals():
    current_dir = os.getcwd()
    project_dir = os.path.dirname(current_dir)
os.chdir(project_dir)
print(project_dir)

/data/dhryniewski/DataScientest/doc-classifier


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Dans le dossier "data", nous allons créer un dossier "processed" dans lequel nous stockerons les fichiers.

In [2]:
import os
import pandas as pd

# Make sure the output folder for the generated files exists, and report
# whether it had to be created.
processed_dir = "data/processed"
if os.path.exists(processed_dir):
    message = "Folder already exists: 'data/processed'"
else:
    os.makedirs(processed_dir)
    message = "Folder created: 'data/processed'"
print(message)

Folder already exists: 'data/processed'


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Ce code concatène les 3 fichiers `text_process.csv` en ajoutant la colonne 'data', qui indique le jeu de données d'origine de chaque image.


In [3]:
# Load each dataset's text_process.csv, tag every row with the dataset it was
# read from (column 'data'), and stack the three tables into a single frame.
dataset_names = ["data_01", "data_03", "data_04"]
tagged_frames = [
    pd.read_csv(f"data/raw/{name}/text_process.csv").assign(data=name)
    for name in dataset_names
]

frame_words = pd.concat(tagged_frames, axis=0, ignore_index=True)
# Persist the combined table without writing the positional index.
frame_words.to_csv("data/processed/words.csv", index=False)

frame_words.head()

Unnamed: 0,words,lang,lang_score,file_name,category,data
0,hazleton laboratory anerca fmc inuoioe ohon mi...,en,0.857141,image_0000297,invoice,data_01
1,ragnar rylander invoice consultant fee travel ...,en,0.857138,image_0000313,invoice,data_01
2,please remit box chicago invoice invoice date ...,en,0.999994,image_0000505,invoice,data_01
3,purchase requisition originator copt sheet rtf...,en,0.999995,image_0000269,invoice,data_01
4,hbi healthy building international inc covingt...,en,0.999995,image_0000020,invoice,data_01


Ce code concatène les 3 fichiers `text_process.csv` en ne gardant que les colonnes 'words' et 'category', et en ajoutant la colonne 'count' qui correspond au nombre de mots dans le texte.


In [4]:
def _load_words_with_count(dataset):
    """Read the 'words' and 'category' columns of one dataset's text_process.csv.

    Missing 'words' values are replaced by an empty string, and a 'count'
    column holding the number of whitespace-separated tokens of 'words' is
    inserted at position 1.
    """
    subset = pd.read_csv(f"data/raw/{dataset}/text_process.csv",
                         usecols=['words', 'category'])
    subset['words'] = subset['words'].fillna('')
    n_tokens = subset['words'].apply(lambda text: len(text.split()))
    subset.insert(loc=1, column='count', value=n_tokens)
    return subset


frame_count = pd.concat(
    [_load_words_with_count(name) for name in ["data_01", "data_03", "data_04"]],
    axis=0,
    ignore_index=True,
)
# The export of this table is currently disabled:
#frame_count.to_csv("data/processed/count_words.csv", index=False)
frame_count.head()

Unnamed: 0,words,count,category
0,hazleton laboratory anerca fmc inuoioe ohon mi...,76,invoice
1,ragnar rylander invoice consultant fee travel ...,12,invoice
2,please remit box chicago invoice invoice date ...,105,invoice
3,purchase requisition originator copt sheet rtf...,138,invoice
4,hbi healthy building international inc covingt...,69,invoice


Sélection des catégories définies:

In [18]:
# Keep only the rows whose category is one of the selected labels, then
# renumber the rows from 0.
selected_categories = [
    'passeport', 'national_identity_card', 'email',
    'invoice', 'scientific_publication', 'handwritten',
]
is_selected = frame_count["category"].isin(selected_categories)
data = frame_count.loc[is_selected].reset_index(drop=True)
data

Unnamed: 0,words,count,category
0,hazleton laboratory anerca fmc inuoioe ohon mi...,76,invoice
1,ragnar rylander invoice consultant fee travel ...,12,invoice
2,please remit box chicago invoice invoice date ...,105,invoice
3,purchase requisition originator copt sheet rtf...,138,invoice
4,hbi healthy building international inc covingt...,69,invoice
...,...,...,...
7104,give name inate burg van stad dorp nldspeci nl...,12,national_identity_card
7105,romania identitate romania nume manole tdentit...,28,national_identity_card
7106,issue authority mun bucuresti sec mare idrousp...,8,national_identity_card
7107,roumanie romania didentite identitate identity...,34,national_identity_card


## Séparation des données en jeux d'entraînement et de test

In [19]:
from sklearn.model_selection import train_test_split

seed = 42  # fixed random state so that the split is reproducible
data['words'] = data['words'].fillna('')

# Separate the label column from the remaining columns.
target = data['category']
features = data.drop(columns='category')

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=seed, test_size=0.2)

# Re-attach each label series to its features and renumber the rows from 0.
df_train, df_test = (
    pd.concat([X_part, y_part], axis=1).reset_index(drop=True)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)

Création d'un dictionnaire avec les mots les plus communs pour chaque catégorie.

In [26]:
from collections import Counter
from src import save_jsonfile


def _ranked_words(texts):
    """Return (word, count) pairs for ``texts``, most frequent first.

    Each text is split on single spaces; the empty tokens this produces
    (consecutive spaces, empty documents) are discarded.
    """
    tally = Counter()
    for text in texts:
        tally.update(text.split(' '))
    tally.pop("", None)
    return tally.most_common()


# One ranked word list per category, computed on the training rows only.
dic_counter = {
    cat: _ranked_words(df_train.loc[df_train['category'] == cat, 'words'])
    for cat in df_train['category'].unique()
}

save_jsonfile('data/processed/most_common_words.json', dic_counter, indent=4)

Création de la structure de chaque document : calcul de la diversité lexicale et, pour chaque catégorie, de la densité des mots-clefs, ces derniers étant les 5, 10, 25 ou 50 mots les plus communs de la catégorie.

In [27]:
from src import calculate_lexical_diversity, calculate_keyword_density

# Both splits receive exactly the same feature columns, in the same order.
splits = (df_train, df_test)

for frame in splits:
    frame['lexical_diversity'] = frame['words'].apply(calculate_lexical_diversity)

# For every category, use its k most common training words as the keyword
# list and add one keyword-density column per (category, k) pair.
for cat in df_train['category'].unique():
    ranked = dic_counter[cat]
    for k in [5, 10, 25, 50]:
        top_words = [word for word, _ in ranked[:k]]
        column = f"keyword_{cat}_{k}"
        for frame in splits:
            frame[column] = frame['words'].apply(
                calculate_keyword_density, keywords=top_words)

df_train.to_csv("data/processed/words_structure_train.csv", index=False)
df_test.to_csv("data/processed/words_structure_test.csv", index=False)

In [28]:
# Preview the first five rows of the training features.
df_train.head(n=5)

Unnamed: 0,words,count,category,lexical_diversity,keyword_passeport_5,keyword_passeport_10,keyword_passeport_25,keyword_passeport_50,keyword_email_5,keyword_email_10,...,keyword_scientific_publication_25,keyword_scientific_publication_50,keyword_handwritten_5,keyword_handwritten_10,keyword_handwritten_25,keyword_handwritten_50,keyword_national_identity_card_5,keyword_national_identity_card_10,keyword_national_identity_card_25,keyword_national_identity_card_50
0,latvija latvia gode ste passport berzina anna ...,22,passeport,0.909091,0.136364,0.136364,0.136364,0.181818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.136364,0.181818,0.227273
1,original karen chaikin pmusa com send tuesday ...,48,email,0.854167,0.020833,0.041667,0.041667,0.041667,0.041667,0.104167,...,0.0625,0.083333,0.020833,0.041667,0.0625,0.104167,0.020833,0.020833,0.020833,0.020833
2,please return one copy payment covington burl ...,80,invoice,0.8625,0.0125,0.0125,0.05,0.05,0.05,0.0625,...,0.0625,0.0875,0.0375,0.0625,0.125,0.1375,0.0125,0.0125,0.0125,0.0125
3,appendix strbasku file copy identification pur...,181,scientific_publication,0.856354,0.0,0.0,0.0,0.027624,0.0,0.0,...,0.060773,0.099448,0.005525,0.005525,0.016575,0.044199,0.0,0.0,0.0,0.027624
4,medical school develop commercial liver lily w...,350,scientific_publication,0.848571,0.0,0.0,0.008571,0.011429,0.0,0.002857,...,0.008571,0.034286,0.002857,0.028571,0.048571,0.06,0.0,0.002857,0.008571,0.008571


In [29]:
# Preview the first five rows of the test features.
df_test.head(n=5)

Unnamed: 0,words,count,category,lexical_diversity,keyword_passeport_5,keyword_passeport_10,keyword_passeport_25,keyword_passeport_50,keyword_email_5,keyword_email_10,...,keyword_scientific_publication_25,keyword_scientific_publication_50,keyword_handwritten_5,keyword_handwritten_10,keyword_handwritten_25,keyword_handwritten_50,keyword_national_identity_card_5,keyword_national_identity_card_10,keyword_national_identity_card_25,keyword_national_identity_card_50
0,cal cel page bill date philip morris usa marlb...,60,invoice,0.966667,0.016667,0.016667,0.033333,0.066667,0.016667,0.016667,...,0.0,0.016667,0.0,0.0,0.016667,0.05,0.033333,0.033333,0.033333,0.033333
1,jackson priscilla gomer bert suggs wae ruffin ...,42,email,0.857143,0.095238,0.095238,0.095238,0.166667,0.190476,0.214286,...,0.0,0.02381,0.02381,0.02381,0.047619,0.095238,0.095238,0.095238,0.095238,0.095238
2,lietuvi republic lithuania ref bliqu lituanie ...,26,passeport,0.961538,0.115385,0.115385,0.153846,0.153846,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.076923
3,outgoing mal crc contrac business administrati...,96,invoice,0.677083,0.0,0.0,0.0,0.010417,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625
4,vol acc interstitial preumonitis associate ble...,198,scientific_publication,0.848485,0.0,0.0,0.005051,0.005051,0.0,0.0,...,0.065657,0.10101,0.005051,0.005051,0.015152,0.030303,0.0,0.0,0.0,0.0
