# Analyse des données de la campagne - JMS 2025

Ce notebook permet d'expérimenter et analyser des données de la campagne de labellisation.

In [33]:
import os
import s3fs

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

from typing import List

In [34]:
def load_excel_from_fs(fs, file_path):
    """Load an Excel file from the file system."""
    try:
        with fs.open(file_path) as f:
            return pd.read_excel(f, dtype=str)
    except Exception as e:
        logging.error(f"Failed to load file {file_path}: {e}")
        raise

    
def format_mapping_table(mapping_table: str) -> str:
    columns_mapping = {
        "NAFold-code\n(code niveau sous-classe de la nomenclature actuelle)": "naf08_niv5",
        "NACEold-code\n(niveau classe)": "naf08_niv4",
        "NAFold-intitulé\n(niveau sous-classe)": "lib_naf08_niv5",
        "NACEnew-code\n(niveau classe)": "naf25_niv4",
        "NAFnew-code\n(code niveau sous-classe de la nomenclature 2025, correspondance logique avec les NAFold-codes)": "naf25_niv5",
        "NAFnew-intitulé\n(niveau sous-classe)": "lib_naf25_niv5",
    }

    return (
        mapping_table.iloc[:, [1, 3, 2, 10, 5, 11]]
        .rename(columns=columns_mapping)
        .assign(
            naf08_niv5=mapping_table.iloc[:, 1],
            naf08_niv4=mapping_table.iloc[:, 3],
            naf25_niv4=mapping_table.iloc[:, 5],
            naf25_niv5=mapping_table.iloc[:, 10],
        )
        .copy()
    )


def clean_dots_simple(
    df: pd.DataFrame, 
    exclude_cols: List[str] = None
) -> pd.DataFrame:
    """
    Supprime les points '.' des colonnes de type 'object' (chaînes de caractères) 
    dans un DataFrame, en excluant les colonnes spécifiées.
    """
    cols_to_clean = df.select_dtypes(include='object').columns
    if exclude_cols:
        cols_to_clean = [col for col in cols_to_clean if col not in exclude_cols]
        
    df.loc[:, cols_to_clean] = df[cols_to_clean].apply(
        lambda col: col.str.replace('.', '', regex=False)
    )
    
    return df

In [31]:
fs = s3fs.S3FileSystem(
        client_kwargs={"endpoint_url": os.getenv("S3_ENDPOINT")},
        key=os.getenv("AWS_ACCESS_KEY_ID"),
        secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )

In [64]:
annotated_data = pd.read_parquet('s3://projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/preprocessed/training_data_NAF2025.parquet', filesystem=fs)
unclassifiable_data = pd.read_parquet('s3://projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/preprocessed/unclassifiable_data_NAF2025.parquet', filesystem=fs)

NAF_mapping = load_excel_from_fs(fs,'s3://projet-ape/NAF-revision/table-correspondance-naf2025.xls')
NAF_mapping = format_mapping_table(NAF_mapping)
NAF_mapping = clean_dots_simple(NAF_mapping)

NAF_mapping_agg_niv5 = NAF_mapping.groupby('naf08_niv5')['naf25_niv5'].agg(list).reset_index()
NAF_mapping_agg_niv5.rename(columns={'naf25_niv5': 'liste_naf25_niv5'}, inplace=True)
NAF_mapping_agg_niv4 = NAF_mapping.groupby(['naf08_niv4', 'naf08_niv5'])['naf25_niv4'].agg(lambda x: list(set(x))).reset_index()
NAF_mapping_agg_niv4.rename(columns={'naf25_niv4': 'liste_naf25_niv4'}, inplace=True)

## Analyse de la difficulté

In [68]:
# Jointure 1 : Ajout de la liste NAF 2025 Niv 5
data = pd.merge(
    annotated_data,
    NAF_mapping_agg_niv5,
    left_on='NAF2008_code', 
    right_on='naf08_niv5',
    how='left'
)

# Jointure 2 : Ajout de la liste NAF 2025 Niv 4
data = pd.merge(
    data,
    NAF_mapping_agg_niv4,
    left_on='NAF2008_code', 
    right_on='naf08_niv5',
    how='left'
)
data
data = data.drop(columns=['NAF2008_code', 
                          'naf08_niv5_x',
                          'naf08_niv5_y'])
data['len_naf25_niv5'] = data['liste_naf25_niv5'].apply(lambda x: len(x) if isinstance(x, list) else 0)
data['len_naf25_niv4'] = data['liste_naf25_niv4'].apply(lambda x: len(x) if isinstance(x, list) else 0)


In [69]:
data

Unnamed: 0,liasse_numero,libelle,evenement_type,liasse_type,activ_surf_et,activ_nat_et,activ_nat_lib_et,activ_sec_agri_et,cj,date,mode_calcul_ape,apet_manual,rating,liste_naf25_niv5,naf08_niv4,liste_naf25_niv4,len_naf25_niv5,len_naf25_niv4
0,J00066144973,"Culture de plantes à épices, aromatiques, médi...",05P,A,,,,Production de PPAM - Production de Petits fruits,,2024-04-09,AUTO,0128Y,1,"[0113Y, 0128Y]",0128,"[0113, 0128]",2,2
1,J00018686568,Elevage d'autres bovins et buffles,01P,A,,,,je souhaite reprendre du terrain agricole fami...,,2023-04-03,AUTO,0142Y,1,"[0141Y, 0142Y]",0142,"[0141, 0142]",2,2
2,J00056753361,Entretien et soin de chevaux,01P,A,,,,Entretien et soin de chevaux,,2024-02-12,FASTTEXT,0162Y,0,"[0147Y, 0162Y]",0162,"[0162, 0147]",2,2
3,J00083929216,Elevage d'autres animaux,01P,A,,,,Apiculture: élevage d'abeilles et vente de pro...,,2024-07-21,AUTO,0148G,1,"[0147Y, 0148G, 0148H, 0148J]",0149,"[0148, 0147]",4,2
4,J00036623890,Marin pêcheur à la ligne et aux casiers,01P,X,,,,,,2023-10-10,AUTO,0311Y,1,"[0311Y, 0330Y]",0311,"[0330, 0311]",2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30910,J00054893581,Vente de biens non règlementés à des particuliers,07M,I,,,,,3220,2024-04-17,FASTTEXT,4712H,0,"[4711L, 4712H, 4791Y, 4792H, 4792J]",4791,"[4712, 4711, 4791, 4792]",5,4
30911,J00065324865,"création , édition et vente à distance de Jeux...",07M,I,,,,,3220,2024-04-05,AUTO,5821Y,1,"[4721Y, 4722Y, 4723Y, 4724Y, 4725Y, 4726Y, 472...",4791,"[4721, 4791, 4752, 4755, 4740, 4751, 4723, 476...",36,29
30912,J00093483055,Vente de produits électroniques.Les ventes se ...,07M,I,,,,,3220,2024-09-19,AUTO,4778H,1,"[4711L, 4712H, 4791Y, 4792H, 4792J]",4791,"[4712, 4711, 4791, 4792]",5,4
30913,J00094385044,Ventes de biens non règlementés par Internet,07M,I,,,,,3220,2024-09-23,AUTO,4712H,1,"[4711L, 4712H, 4791Y, 4792H, 4792J]",4791,"[4712, 4711, 4791, 4792]",5,4
