In [1]:
import os
from pathlib import Path

import pandas as pd
import xmltodict

def create_dict_from_xml(chemin_fichier):
    """Read an XML file and return its content parsed by ``xmltodict``.

    Args:
        chemin_fichier: Path of the XML file; it is opened as UTF-8 text.

    Returns:
        The parsed document, built with plain ``dict`` objects
        (``dict_constructor=dict``).
    """
    with open(chemin_fichier, encoding='utf8') as xml_file:
        xml_text = xml_file.read()
    return xmltodict.parse(xml_text, dict_constructor=dict)

In [2]:
"""
  Parse the dictionary of complex types and return the result as a DataFrame (one row per type, with its enumeration mapping)
"""

def _parse_complex_type(annexe_type : dict):
    list_records = []
    all_complex_type = annexe_type['xs:schema']['xs:complexType']
    i=0
    for complex_type in all_complex_type:
        temp_dict = dict()
        nom_type_complexe = complex_type['@name']
        if isinstance(complex_type['xs:attribute'], dict):
            for element in complex_type['xs:attribute']['xs:simpleType']['xs:restriction']['xs:enumeration']:
                temp_dict[element['@value']] = element.get('xs:annotation', {}).get('xs:documentation', element['@value'])
        
        result_dict = {
            "type" : nom_type_complexe,
            "enum" : temp_dict
        }
        list_records.append(result_dict)
    
    df = pd.DataFrame.from_records(list_records)

    return df

def _generate_complex_type_df(chemin : Path) -> pd.DataFrame:
    """Load the XSD file at ``chemin`` and tabulate its complex types.

    Args:
        chemin: Path of the XSD file to read.

    Returns:
        The DataFrame produced by ``_parse_complex_type`` for that file.
    """
    return _parse_complex_type(create_dict_from_xml(chemin))

In [3]:
"""
  Generate documentation for the data codes of the annexes
"""

def _parse_annexe_fields_documentation(class_annexe : dict) -> pd.DataFrame:
    elements = class_annexe['xs:sequence']['xs:element']
    list_records = []
    dict_champs = dict()
    nom_annexe = class_annexe["@name"][1:]
    
    for element in elements:
        documentation = element['xs:annotation']['xs:documentation']
        if isinstance(documentation, str):
            libelle = documentation
            description = documentation
        else:
            libelle = element['xs:annotation']['xs:documentation']['z:libelle']
            description = element['xs:annotation']['xs:documentation'].get('z:description')
        dict_champs = {
            "nom_annexe" : nom_annexe,
            "nom_champ" : element["@name"],
            "type" : element["@type"],
            "libelle" : libelle,
            "description" : description,
        }
        list_records.append(dict_champs)
    
    df = pd.DataFrame.from_records(list_records)
    df["description"] = df["description"].str.replace(r'^<[^<>]*>', '', regex=True)
    df["description"] = df["description"].str.replace(r'^\s*<ul>', '', regex=True)
    df["description"] = df["description"].str.replace(r'^\s*<li>', '', regex=True)
    df["description"] = df["description"].str.replace(r'<ul>', ' : ', regex=True)
    df["description"] = df["description"].str.replace(r'<li>', ' - ', regex=True)
    df["description"] = df["description"].str.replace(r'<[^<>]*>', ' ', regex=True)
    df["description"] = df["description"].str.replace(r'\s\s+', ' ', regex=True)

    
    return df

def _merge_annexe_type_and_documentation(chemin_annexe: Path, complex_type_df: pd.DataFrame) -> pd.DataFrame:
    """Document the fields of one annexe schema and attach their enumerations.

    Args:
        chemin_annexe: Path of the annexe XSD file.
        complex_type_df: Complex-type table to join onto the field table.

    Returns:
        The field documentation of the annexe, left-merged with
        ``complex_type_df``. No join key is given, so pandas joins on the
        columns the two frames have in common.
    """
    schema = create_dict_from_xml(chemin_annexe)['xs:schema']
    # The field definitions are taken from the second complexType of the file.
    fields_df = _parse_annexe_fields_documentation(schema['xs:complexType'][1])
    return fields_df.merge(complex_type_df, how='left')



In [4]:
from pathlib import Path
# Root folder under which the "SchemaDocBudg/*.xsd" schema files are looked up.
PATH_TO_SCHEMA = Path("./download/")

def _get_list_annexes_path():
    """List the paths of the annexe schemas included by ``Class_Annexes.xsd``.

    Returns:
        One path under ``PATH_TO_SCHEMA / "SchemaDocBudg"`` per ``xs:include``
        of ``Class_Annexes.xsd``, except the first include, which is skipped.
    """
    includes = create_dict_from_xml(PATH_TO_SCHEMA.joinpath("SchemaDocBudg/Class_Annexes.xsd"))["xs:schema"]['xs:include']
    # NOTE(review): the first include is discarded — presumably it is the
    # shared type file rather than an annexe; confirm against the XSD.
    includes.pop(0)
    return [
        PATH_TO_SCHEMA.joinpath(f"SchemaDocBudg/{include['@schemaLocation']}")
        for include in includes
    ]


def _parse_all_annexes_fields_documentation() -> pd.DataFrame:
    """Collect the field documentation of every annexe schema.

    Returns:
        The per-annexe tables produced by
        ``_merge_annexe_type_and_documentation`` stacked into one DataFrame
        indexed by ``nom_champ``. The annexe listed last comes first in the
        result. When no annexe is listed, an empty frame with the expected
        columns is returned.
    """
    # Errors that have to be handled manually:
    # - the "signatures" annexe, whose xs:complexType tags are in the opposite
    #   order from the other annexes: the first xs:complexType block has to be
    #   copied below the second one.
    # - the "emprunt" annexe, field "IndSousJacentDtVote", which has two
    #   documentation tags and therefore produces a list (only such case).
    annexes_paths = _get_list_annexes_path()
    annexe_complexe_types = _generate_complex_type_df(PATH_TO_SCHEMA.joinpath("SchemaDocBudg/CommunAnnexe.xsd"))

    # Build every per-annexe frame first and concatenate once: calling
    # pd.concat inside the loop copies the accumulated rows at every step.
    frames = [
        _merge_annexe_type_and_documentation(annexe_path, annexe_complexe_types)
        for annexe_path in annexes_paths
    ]
    if not frames:
        empty = pd.DataFrame(
            columns=["nom_annexe", "nom_champ", "type", "libelle", "description", "enum"]
        )
        return empty.set_index('nom_champ')

    # Reversed so the row order stays "last annexe first". This matters when a
    # field name appears in several annexes and the result is turned into a
    # dict keyed by field name.
    frames.reverse()
    return pd.concat(frames).set_index('nom_champ')


def generate_annexe_data_documentation_csv(path_to_export : Path):
    """Export the documentation of all annexe fields to a CSV file.

    Args:
        path_to_export: Destination of the CSV file.

    The table is indexed by ``nom_champ``, so the index is written out:
    without it the exported rows would no longer say which field they
    document. The frame is written directly rather than through
    ``generate_csv``, because that name is defined twice in this notebook
    with incompatible signatures.
    """
    df_to_csv = _parse_all_annexes_fields_documentation()
    df_to_csv.to_csv(path_to_export, index=True)
    

def generate_csv(path_to_export : Path, df_to_csv : pd.DataFrame):
    """Write ``df_to_csv`` to ``path_to_export`` as CSV, leaving out the index.

    NOTE(review): a later cell defines another ``generate_csv`` taking
    ``(df, path, annexe_fields)``; once that cell has run it replaces this one.
    """
    df_to_csv.to_csv(path_or_buf=path_to_export, index=False)

In [5]:
import psycopg2

def connection_pg2():
    """Open a psycopg2 connection to the budget database.

    Connection settings are read from the environment (``PGDATABASE``,
    ``PGUSER``, ``PGPASSWORD``, ``PGHOST``, ``PGPORT``). The non-secret
    settings fall back to the values this notebook used before; the password
    has no fallback so that it is never stored in the notebook.

    Returns:
        The connection returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    # NOTE(review): a password used to be hard-coded here; it should be
    # considered exposed and rotated.
    # When PGPASSWORD is unset, ``password`` is None — presumably psycopg2 then
    # leaves password lookup to libpq (e.g. ~/.pgpass); confirm in your setup.
    return psycopg2.connect(
        database=os.environ.get("PGDATABASE", "actes_budgetaire"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD"),
        host=os.environ.get("PGHOST", "ab-postgres"),
        port=os.environ.get("PGPORT", "5432"),
    )

def get_data(connexion, annexe, year, nature_dec, siret='21640260200017'):
    """Fetch the annexe rows of one collectivity for a given year and document kind.

    Args:
        connexion: Open database connection handed to ``pd.read_sql_query``.
        annexe: Value matched against ``annexe.type_annexe``.
        year: Value matched against ``documentbudgetaire.exercice``.
        nature_dec: Value matched against ``documentbudgetaire.nature_dec``.
        siret: Value matched against
            ``documentbudgetaire.fk_siret_collectivite``. Defaults to the
            SIRET that was previously hard-coded in the query.

    Returns:
        A DataFrame with ``annexe_id`` followed by every column of the joined
        ``annexe``, ``documentbudgetaire`` and ``collectivite`` tables.
    """
    # Values are passed as bound parameters instead of being pasted into the
    # SQL text, so a quote in an argument cannot alter the query.
    query = """SELECT annexe.id as annexe_id, *  FROM annexe
    JOIN documentbudgetaire
        ON documentbudgetaire.id = annexe.fk_id_document_budgetaire
    JOIN collectivite
        ON collectivite.siret_coll = documentbudgetaire.fk_siret_collectivite
    WHERE annexe.type_annexe = %(annexe)s
    AND documentbudgetaire.exercice = %(year)s
    AND documentbudgetaire.nature_dec = %(nature_dec)s
    AND documentbudgetaire.fk_siret_collectivite = %(siret)s
    """
    # Every value is sent as text, exactly as the former quoted literals were.
    params = {
        "annexe": str(annexe),
        "year": str(year),
        "nature_dec": str(nature_dec),
        "siret": str(siret),
    }
    return pd.read_sql_query(query, connexion, params=params)

# Columns kept from the query result before the annexe JSON is exploded.
needed_columns = [
    'annexe_id',
    'fk_id_document_budgetaire',
    'exercice',
    'type_annexe',
    'json_annexe',
    'siret_etablissement',
    'libelle',
    'nomenclature',
    'nature_dec',
    'nature_vote',
    'type_budget',
    'siret_coll',
    'libelle_collectivite',
    'nature_collectivite',
]


def keep_needed_columns(needed_columns : list, df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns absent from ``needed_columns``.

    Args:
        needed_columns: Names of the columns to keep.
        df: Source frame; it is not modified.

    Returns:
        A new DataFrame holding only the columns of ``df`` whose name is in
        ``needed_columns``, in their original order.
    """
    unwanted = [column for column in df.columns if column not in needed_columns]
    return df.drop(columns=unwanted)

import app.backend.timer as timer
import app.backend.config as conf

# Annexe configuration taken from the project settings; it is indexed by
# annexe name below.
data = conf.CHAMPS_ANNEXES


def annexe(nom):
    """Return the entry of ``data`` stored under the key ``nom``."""
    return data[nom]


EMPRUNT_FIELD = annexe("DATA_EMPRUNT")

In [9]:
# 1 - retrieve the special fields (those that carry an enumeration)
# What about obsolete data?
# TODO: handle the CodMotifContrAgent case (nested documentation)
to_replace = (
    _parse_all_annexes_fields_documentation()["enum"]
    .dropna()
    .to_dict()
)

In [6]:
import json
import copy

def _all_annexe_columns(df, annexe_fields):
    all_columns = copy.deepcopy(annexe_fields)
    columns = list(df.columns)
    for i in columns:
        all_columns.append(i)
    return all_columns

@timer.timed
def explode_annexe_json_into_rows_first_way(df, annexe_fields):
    """Explode the ``json_annexe`` column into one row per JSON record (groupby variant).

    Args:
        df: Frame with an ``annexe_id`` column and a ``json_annexe`` column
            holding a JSON array of objects as text. It is not modified.
        annexe_fields: Names of the fields expected in the JSON records.

    Returns:
        A DataFrame with one row per JSON record, whose columns are
        ``annexe_fields`` followed by the columns of ``df`` other than
        ``json_annexe``. Only the first row of each ``annexe_id`` group is
        exploded.
    """
    all_columns = [
        column for column in _all_annexe_columns(df, annexe_fields)
        if column != "json_annexe"
    ]
    # json.loads instead of eval: the column is JSON text coming from the
    # database, and eval would execute it as Python code (and fail on
    # null/true/false).
    parsed = df.assign(json_annexe=df["json_annexe"].apply(json.loads))
    temp = parsed.groupby('annexe_id').json_annexe.apply(lambda x: pd.DataFrame(x.values[0])).reset_index()
    df_result = temp.merge(parsed.drop(columns="json_annexe"), on='annexe_id')
    return df_result.reindex(columns=all_columns)
  
@timer.timed
def explode_annexe_json_into_rows_second_way(df, annexe_fields):
    """Explode the ``json_annexe`` column into one row per JSON record (explode variant).

    Args:
        df: Frame with an ``annexe_id`` column and a ``json_annexe`` column
            holding a JSON array of objects as text. It is not modified.
        annexe_fields: Names of the fields expected in the JSON records.

    Returns:
        A DataFrame with one row per JSON record, whose columns are
        ``annexe_fields`` followed by the columns of ``df`` other than
        ``json_annexe``. Rows whose JSON array is empty produce no output row.
    """
    all_columns = [
        column for column in _all_annexe_columns(df, annexe_fields)
        if column != "json_annexe"
    ]
    # json.loads instead of eval: the column is JSON text coming from the
    # database, and eval would execute it as Python code.
    s = df.set_index('annexe_id').json_annexe.apply(json.loads).explode()
    # explode() yields NaN for an empty list; such entries are not records.
    s = s.dropna()
    temp = pd.DataFrame(s.tolist(), index=s.index).reset_index()
    df_result = temp.merge(df.drop(columns="json_annexe"), on='annexe_id')
    return df_result.reindex(columns=all_columns)

@timer.timed
def explode_annexe_json_into_rows_third_way_old(df, annexe_fields, path: Path, to_replace):
    """Explode ``json_annexe`` row by row and append each result to a CSV file.

    Args:
        df: Frame with a ``json_annexe`` column holding a JSON array as text.
        annexe_fields: Field names every non-empty JSON record must expose;
            missing ones are added with a ``None`` value.
        path: CSV file the exploded rows are appended to (no header is written).
        to_replace: Unused; kept so existing calls keep working.

    Returns:
        None. The output is the CSV file.
    """
    for _, row in df.iterrows():
        # Parsed records are kept only for the current row; storing them all
        # in a dict held the whole data set in memory for nothing.
        records = json.loads(row["json_annexe"])
        for element in records:
            if element:
                for field in annexe_fields:
                    element.setdefault(field, None)

        # Repeat the document-level columns on every exploded record.
        df_temp = pd.json_normalize(records).assign(**row.drop("json_annexe"))
        # append_csv requires the column selection; None writes every column.
        append_csv(df_temp, path, None)

@timer.timed
def explode_annexe_json_into_rows_third_way(df, annexe_fields, path: Path, to_replace, needed_columns):
    """Explode ``json_annexe`` row by row into a CSV file with decoded values.

    Args:
        df: Frame with a ``json_annexe`` column holding a JSON array as text,
            plus the document-level columns to repeat on every record.
        annexe_fields: Field names every non-empty JSON record must expose;
            missing ones are added with a ``None`` value.
        path: Destination CSV file. The first row of ``df`` is written through
            ``generate_csv`` (with header), the following ones through
            ``append_csv``.
        to_replace: Replacement mapping handed to ``DataFrame.replace``.
        needed_columns: Document-level column names; together with
            ``annexe_fields`` (and without ``json_annexe``) they define the
            columns and column order of the CSV.

    Returns:
        None. The output is the CSV file.
    """
    # Computed once up front: it used to be defined only when a row carried
    # the index label 0, which left it unbound for any other index.
    all_columns = [column for column in needed_columns if column != "json_annexe"]
    all_columns = all_columns + list(annexe_fields)

    for position, (_, row) in enumerate(df.iterrows()):
        records = json.loads(row["json_annexe"])
        for element in records:
            if element:
                for field in annexe_fields:
                    element.setdefault(field, None)

        # Repeat the document-level columns on every exploded record.
        df_temp = pd.json_normalize(records).assign(**row.drop("json_annexe"))
        try:
            df_temp = df_temp.replace(to_replace)
        except TypeError:
            # Best effort: keep the undecoded values and show which annexe
            # could not be decoded.
            print(df_temp["annexe_id"])

        # An empty or partial JSON payload may lack some expected columns;
        # add them empty so every row is written with the same layout.
        for column in all_columns:
            if column not in df_temp.columns:
                df_temp[column] = None

        if position == 0:
            generate_csv(df_temp, path, all_columns)
        else:
            append_csv(df_temp, path, all_columns)

def generate_csv(df, path: Path, annexe_fields):
    """Create the CSV file at ``path`` with a header row and the rows of ``df``.

    Args:
        df: Frame to write.
        path: Destination file; an existing file is overwritten.
        annexe_fields: Columns to write, in that order.

    Returns:
        Whatever ``DataFrame.to_csv`` returns (``None`` when writing to a path).
    """
    # mode='w': this call starts the file. Appending here stacked a second
    # header and duplicate rows each time the export was re-run.
    return df.to_csv(path, mode='w', index=False, columns=annexe_fields, header=True)

def append_csv(df, path: Path, annexe_fields):
    """Append the rows of ``df`` to the CSV file at ``path`` without a header.

    Args:
        df: Frame to write.
        path: Destination file, opened in append mode.
        annexe_fields: Columns to write, in that order.

    Returns:
        Whatever ``DataFrame.to_csv`` returns.
    """
    csv_options = dict(mode='a', index=False, columns=annexe_fields, header=False)
    return df.to_csv(path, **csv_options)

In [40]:
# Annexes present in the configuration but with no file of the same name
# (text before the first ".") in ./csv/2020.
elements = [e.split(".")[0] for e in os.listdir(Path("./csv/2020"))]
annexes = list(data.keys())
annexes_manquantes = [item for item in annexes if item not in elements]


In [7]:
# psycopg2's ``with connection:`` block only ends the transaction; it does not
# close the connection, so it is closed explicitly here.
conn = connection_pg2()
try:
    dat = get_data(connexion=conn, annexe="DATA_EMPRUNT", year=2020, nature_dec='09')
finally:
    conn.close()
dat



Unnamed: 0,annexe_id,id,type_annexe,json_annexe,fk_id_document_budgetaire,siret_etablissement,list_annexes,id.1,libelle,code_insee,...,num_dec,nature_vote,type_budget,id_etabl_princ,json_budget,fk_siret_collectivite,siret_coll,libelle_collectivite,nature_collectivite,departement
0,76910,76910,DATA_EMPRUNT,"[{""CodTypEmpr"": ""01"", ""CodProfilAmort"": ""X"", ""...",13473,21640260200017,"[DATA_EMPRUNT, DATA_TRESORERIE, DATA_TIERS, DA...",13473,BUDGET PRINCIPAL,64260,...,,3,P,,"[{""Nature"": ""001"", ""Fonction"": ""01"", ""ContNat""...",21640260200017,21640260200017,01 VILLE DE HENDAYE,COMMUNE dont la population est de 3500 habitan...,


In [11]:
import app.backend.config as conf

# Folder receiving one CSV file per annexe; created on demand so the export
# also works on a fresh checkout.
PATH_TO_EXPORT = Path("./csv/2020/hendaye")
PATH_TO_EXPORT.mkdir(parents=True, exist_ok=True)

needed_columns = ['annexe_id', 'fk_id_document_budgetaire', 'exercice', 'type_annexe', 'json_annexe','siret_etablissement', 'libelle', 'nomenclature', 'nature_dec', 'nature_vote','type_budget', 'siret_coll', 'libelle_collectivite', 'nature_collectivite']
data = conf.CHAMPS_ANNEXES

# Human-readable labels for the document-level codes.
to_replace["nature_dec"] = {
    "01": "Budget primitif",
    "02": "Décision modificative",
    "03": "Budget supplémentaire",
    "09": "Compte administratif",
}
to_replace["type_budget"] = {"P": "Principal", "A": "Annexe"}

# One connection for the whole export, closed at the end: psycopg2's
# ``with connection:`` only ends the transaction and leaves the connection open.
conn = connection_pg2()
try:
    # The loop variable is not called ``annexe`` so that it does not overwrite
    # the ``annexe()`` helper defined earlier in the notebook.
    for nom_annexe, champs_annexe in data.items():
        with conn:
            dat = get_data(connexion=conn, annexe=nom_annexe, year=2020, nature_dec='09')
        df = keep_needed_columns(needed_columns, dat)
        if df.empty:
            continue
        print(nom_annexe)
        try:
            explode_annexe_json_into_rows_third_way(df,
                                                    champs_annexe,
                                                    f"{PATH_TO_EXPORT.joinpath(nom_annexe)}.csv",
                                                    to_replace,
                                                    needed_columns)
        except NameError as error:
            # Still best-effort (the remaining annexes are exported), but the
            # failure is reported instead of being hidden.
            print(f"{nom_annexe}: export failed ({error!r})")
finally:
    conn.close()

2022-11-29 11:10:26,152 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.04s


DATA_EMPRUNT


2022-11-29 11:10:26,488 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_TRESORERIE


2022-11-29 11:10:27,649 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_TIERS


2022-11-29 11:10:28,857 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.02s
2022-11-29 11:10:29,017 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_CONCOURS
DATA_RECETTE_AFFECTEE


2022-11-29 11:10:29,691 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_FORMATION


2022-11-29 11:10:32,492 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.02s


DATA_CONSOLIDATION


2022-11-29 11:10:33,465 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_ORGANISME_GROUP


2022-11-29 11:10:34,636 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.02s


DATA_PATRIMOINE


2022-11-29 11:10:35,431 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.04s


DATA_PERSONNEL


2022-11-29 11:10:36,297 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.02s


DATA_VENTILATION


2022-11-29 11:10:37,182 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s
2022-11-29 11:10:37,351 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_AMORTISSEMENT_METHODE
DATA_PROVISION


2022-11-29 11:10:38,060 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_SIGNATURE


2022-11-29 11:10:38,567 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_SIGNATAIRE


2022-11-29 11:10:39,246 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.01s


DATA_ETAB_SERVICE


2022-11-29 11:10:40,133 DEBUG -- explode_annexe_json_into_rows_third_way ran in 0.02s


DATA_SOMMAIRE
