In [5]:
import os
import pandas as pd
import pyarrow.dataset as ds

# Root folder of the table inside the local data lake
table_path = "C:/Users/julie/Documents/Streaming/data_lake/TRANSACTIONS_PENDING"

# Open the parquet files found under the root, decoding hive-style partition folders
dataset = ds.dataset(table_path, format="parquet", partitioning="hive")

# Materialise the whole dataset as a pandas DataFrame
df = dataset.to_table().to_pandas()

# Preview of the first rows
print("Aperçu des données :")
print(df.head())

# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
print("\nInformations DataFrame :")
df.info()

# Descriptive statistics (describe() defaults to the numeric columns)
print("\nStatistiques descriptives :")
print(df.describe())

# Number of missing values per column
print("\nValeurs manquantes par colonne :")
print(df.isna().sum())

Aperçu des données :
  TRANSACTION_ID                    TIMESTAMP  \
0   TXN-4aeb6c15  2025-04-19T07:20:12.305600Z   
1   TXN-cdf1eb54  2025-04-17T08:57:51.305600Z   
2   TXN-05db4e32  2025-04-14T00:25:23.305600Z   
3   TXN-3b1bb2d2  2025-05-08T11:38:25.312495Z   
4   TXN-c9f2d7da  2025-04-27T03:14:24.305600Z   

                     USER_ID_HASHED                  USER_NAME_HASHED  \
0  54f57e09864b54ed59a5cc633673dde4  194cd2f711cebc782ada13e259bf61dd   
1  3ff77438146870ee5dcd120f13dec323  01c3012a4b4c3a3ed2aeefefafa38cc5   
2  385f7d23d00928a441e647da673b2c06  88700f697fe98f2bf0370a7114d5bacf   
3  7271fbbbc55316d6aab210f25eeda95c  5447aa4026288039fccf95cf270282b3   
4  2b4f14972a7ae2a887a3bdaade7bf126  262031397020fd8df478ec13b4b096c5   

  PRODUCT_ID  AMOUNT CURRENCY TRANSACTION_TYPE   STATUS            CITY  ...  \
0   PROD-669  873.04      JPY          payment  pending        Salvador  ...   
1   PROD-506  416.12      EUR           refund  pending       São Paulo  ...   
2   P

In [11]:
import pyarrow.dataset as ds
import pandas as pd
import numpy as np
import json

# Load the dataset (parquet files under the table root, hive-style partitions)
table_path = "C:/Users/julie/Documents/Streaming/data_lake/TRANSACTIONS_PENDING"
dataset = ds.dataset(table_path, format="parquet", partitioning="hive")
df = dataset.to_table().to_pandas()

# Example column subset. NOTE: it is declared but not applied here; the
# frame that gets paginated below is the full, unprojected DataFrame.
columns = ["TRANSACTION_ID", "AMOUNT", "STATUS"]
df_proj = df

# Pagination window: page 1 (1-based), 10 rows per page
page_size, page = 10, 1
start = page_size * (page - 1)
end = start + page_size
page_df = df_proj.iloc[slice(start, end)]

# Fonction nettoyage complète
def clean_df_for_json(df: pd.DataFrame) -> pd.DataFrame:
    """Return a JSON-serialisable copy of ``df``.

    The input frame is never modified; all work happens on a new frame, so
    the function is safe to call on a slice/view of a larger DataFrame.

    Transformations applied:
      * ``+inf`` / ``-inf`` become missing values;
      * every datetime column (naive or timezone-aware, any time zone or
        resolution) is converted to strings using pandas' default string
        conversion, with ``NaT`` treated as missing;
      * all columns are cast to ``object`` and every missing value
        (``NaN`` / ``NaT`` / ``None``) is replaced by ``None`` so that
        ``json.dumps`` emits ``null``.

    Args:
        df: Frame to clean. Left untouched.

    Returns:
        A new object-dtype DataFrame with the same index and columns.
    """
    # Non in-place replace returns a new frame, so the caller's data is not touched.
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    for col in cleaned.columns:
        series = cleaned[col]
        # Detect datetimes by kind rather than by an exact dtype string so that
        # other time zones and non-nanosecond units are handled too.
        if not pd.api.types.is_datetime64_any_dtype(series):
            continue
        # Blank out the positions that were NaT using the original null mask
        # instead of string-replacing 'NaT' (whose behaviour with an explicit
        # None depends on the pandas version).
        cleaned[col] = series.astype(str).mask(series.isna())

    # object dtype is required for None to be stored as-is
    cleaned = cleaned.astype(object)

    # Turn every remaining missing marker into None
    return cleaned.where(pd.notnull(cleaned), None)

# Clean the current page so that it only holds JSON-friendly values
page_df_clean = clean_df_for_json(page_df)

print(page_df_clean)

# Attempt the JSON conversion; report the error instead of stopping the notebook
try:
    records = page_df_clean.to_dict(orient="records")
    json_data = json.dumps(records)
    print("Conversion JSON réussie !")
    print(json_data)
except Exception as exc:
    print("Erreur lors de la conversion JSON :", exc)


  TRANSACTION_ID                    TIMESTAMP  \
0   TXN-4aeb6c15  2025-04-19T07:20:12.305600Z   
1   TXN-cdf1eb54  2025-04-17T08:57:51.305600Z   
2   TXN-05db4e32  2025-04-14T00:25:23.305600Z   
3   TXN-3b1bb2d2  2025-05-08T11:38:25.312495Z   
4   TXN-c9f2d7da  2025-04-27T03:14:24.305600Z   
5   TXN-8e7c7031  2025-04-26T08:05:23.312495Z   
6   TXN-50d21d0c  2025-05-07T14:13:09.312495Z   
7   TXN-492b36c3  2025-04-24T10:00:22.312495Z   
8   TXN-c61925f2  2025-04-25T06:41:30.305600Z   

                     USER_ID_HASHED                  USER_NAME_HASHED  \
0  54f57e09864b54ed59a5cc633673dde4  194cd2f711cebc782ada13e259bf61dd   
1  3ff77438146870ee5dcd120f13dec323  01c3012a4b4c3a3ed2aeefefafa38cc5   
2  385f7d23d00928a441e647da673b2c06  88700f697fe98f2bf0370a7114d5bacf   
3  7271fbbbc55316d6aab210f25eeda95c  5447aa4026288039fccf95cf270282b3   
4  2b4f14972a7ae2a887a3bdaade7bf126  262031397020fd8df478ec13b4b096c5   
5  71deaaea048fc460df652343efe92b98  72626c2e45e149d005a0d408e80dcfea  