# Prepare the datasets

## data449_text.csv

We use the data set `data449_cats`, exported directly from Label Studio in CSV format. It contains the labelled data. Our objective is to reduce it to the `text` column only, without any labels.

In [11]:
import pandas as pd 

# Load the raw Label Studio export, keep only its "text" column and
# write that single-column table to the processed-data folder.
raw_export = pd.read_csv('../data/raw/data449_cats.csv')

df = raw_export.loc[:, ["text"]].copy()

df.to_csv('../data/processed/data449_text.csv', index=False)

## full_data_link_legifrance_id.xlsx

In [8]:
import pandas as pd

# Columns retained from the raw Legifrance link workbook.
kept_columns = ["UrlLegifrance", "Entreprise", "Siret", "Naf732", "Date Texte", "Fichier"]

# "code" is the 12-character substring of "Fichier" at positions 101-112; the
# long-text cells later merge on it.
# NOTE(review): presumably every "Fichier" value shares the same 101-character
# prefix - confirm against the raw workbook.
df = (
    pd.read_excel(r"../data/raw/full_data_link_legifrance.xlsx")
    .loc[:, kept_columns]
    .assign(code=lambda frame: frame["Fichier"].str.slice(101, 113))
)

# Persist the same table in both CSV and Excel form.
df.to_csv('../data/processed/full_data_link_legifrance_id.csv', index=False)
df.to_excel('../data/processed/full_data_link_legifrance_id.xlsx', index=False)

## short_long_texts.csv

In [9]:
import pandas as pd
import os
import glob

We import the shortened texts.

In [10]:
# Load the short-text workbook, then keep the rows labelled 0 through 495
# (``.loc`` slices include both ends) and the three columns listed below.
short_text_columns = ["code", "entreprise", "text"]
short_texts_raw = pd.read_excel(r'../data/raw/short_texts.xlsx')
df_short_text = short_texts_raw.loc[0:495, short_text_columns]

In [11]:
# Display the first five rows of the short-text table.
df_short_text.head(n=5)

Unnamed: 0,code,entreprise,text
0,T03022004649,ROYAL CANIN SAS,négociation annuelle obligatoire 2022-23. les ...
1,T02622004554,BOIRON FRERES,accord collectif d’entreprise à l’issue de la...
2,T06222008590,UNITED PETFOOD FRANCE-UPF,négociations annuelles obligatoires 2022 les ...
3,T03322011613,LDC AQUITAINE,"la direction, a échangé avec les organisations..."
4,T59L22018637,COCA COLA PRODUCTION,accord relatif à la négociation annuelle oblig...


We build the long text database

In [35]:
# Load the processed Legifrance link table and keep only the rows whose
# "code" also appears in the short-text table (inner join on "code").
df_id = pd.read_csv(r'../data/processed/full_data_link_legifrance_id.csv')

df_text_link = df_id.merge(df_short_text, how="inner", on="code")

In [36]:
# Keep four columns, then rename "Fichier" to "url_proapp" and "text" to
# "text_short" in a single pass.
df_text_link = df_text_link.loc[:, ["code", "entreprise", "text", "Fichier"]].rename(
    columns={"Fichier": "url_proapp", "text": "text_short"}
)

In [37]:
# Write the proapp links to Liste.txt, one link per line, so they can be
# downloaded in bulk. The proapp links point directly at a local server; anyone
# else should use the Legifrance URL together with the download.py code instead.

list_proapp = df_text_link["url_proapp"].tolist()

with open("../data/text/docx/Liste.txt", "w", encoding="utf-8") as link_file:
    link_file.writelines("%s\n" % link for link in list_proapp)

# The files were then downloaded with an .exe that is not provided here.

In [None]:
# Convert all the downloaded files to txt, then clean them, by running the two
# project scripts in order through the shell (IPython `!` syntax).
# NOTE(review): `python.exe` presumably assumes a Windows setup with Python on
# the PATH, and may not be the interpreter the kernel runs on - confirm.

!python.exe "../src/convert_to_txt.py"
!python.exe "../src/clean_txt.py"

In [38]:
# Now we can store the text in a csv file and match them by their code

texts = glob.glob("../data/text/txt/*.txt")

lis_of_text = []
code = []

for text in texts:
    with open(text, "r", encoding="utf-8") as f:
        lis_of_text.append(f.read())
        code.append(text[17:29])

df_long_text = pd.DataFrame({"code":code, "text":lis_of_text})

Unnamed: 0,code,text
0,A59V17002488,negociation annuelle obligatoire 2017 la n...
1,T00122004154,proces verbal d’accord negociation annuell...
2,T00122004256,accord annuel d’entreprise mars 2022 entre ...
3,T00122004270,negociation annuelle 2022 protocole d’acco...
4,T00122004501,negociation annuelle obligatoire 2022 proces v...


In [44]:
# Inner-join the short and long texts on "code"; the two clashing "text"
# columns come out as "text_short" and "text_long". The result is saved as CSV.

df_text = df_short_text.merge(
    df_long_text,
    how="inner",
    on="code",
    suffixes=("_short", "_long"),
)
df_text.to_csv('../data/processed/short_long_texts.csv', index=False)