In [92]:
import functools
import os
import sqlite3
import time

import h5py as hdf5
import pandas as pd

# CSV file read by the loading helpers, and the columns kept when sampling it.
path_file = "sales_data.csv"
use_cols = ['customer_id', 'product_id', 'quantity', 'price']

# Compact dtypes handed to pd.read_csv when the file is loaded in chunks.
dtypes = {
    'transaction_id': 'int32',
    'customer_id': 'int32',
    'product_id': 'int16',
    'quantity': 'int16',
    'price': 'float32',
}

# Rows per chunk: `chunk_size` is passed to load_file_with_chunks,
# `chunk_size_rows` is used by combine_and_calcul_total.
chunk_size = 1000
chunk_size_rows = 10

# Module-level frames that the functions below fill in through `global`.
data = pd.DataFrame()
data_transaction = pd.DataFrame()

def check_fun_error(fun):
    """Decorate ``fun`` so that any exception is reported and the run is stopped.

    The wrapper forwards its arguments to ``fun`` and returns the result.
    If ``fun`` raises an ``Exception``, a message containing the function
    name and the error text is printed and ``SystemExit(1)`` is raised.

    Note: when ``fun`` is a generator function, the guarded call only creates
    the generator object; exceptions raised later, while it is being
    iterated, are not intercepted by this wrapper.
    """
    @functools.wraps(fun)  # keep fun's name/docstring on the returned wrapper
    def wrapper(*args, **kwargs):
        try:
            return fun(*args, **kwargs)
        except Exception as e:
            print(f"❌ An error occurred in function '{fun.__name__}': {str(e)}")
            # Raise SystemExit directly instead of calling ``exit``: that helper
            # is installed by the ``site`` module for interactive sessions and
            # is not meant to be relied on inside program code.
            raise SystemExit(1) from e
    return wrapper


def timing(func):
    """Decorate ``func`` so that each call prints how long it took.

    The wrapper forwards its arguments to ``func``, prints the elapsed time in
    seconds (four decimals) together with the function name, and returns
    whatever ``func`` returned. Exceptions raised by ``func`` propagate
    unchanged and no timing line is printed in that case.
    """
    @functools.wraps(func)  # keep func's name so outer decorators report it correctly
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Temps d'exécution de {func.__name__}: {elapsed:.4f} secondes")
        return result
    return wrapper

def connect_to_db(func):
    """Decorate ``func`` so that it runs with an open connection to ``sales.db``.

    The wrapper opens the SQLite database ``sales.db``, creates a cursor and
    passes both to ``func`` as the keyword arguments ``connection`` and
    ``cursor``. When ``func`` returns normally its changes are committed and
    its result is returned. When an ``Exception`` is raised, the error is
    printed, pending changes are rolled back and ``None`` is returned.
    The connection is closed in every case where it was opened.
    """
    @functools.wraps(func)  # keep func's name so outer decorators report it correctly
    def wrapper(*args , **kwargs) :
        # Pre-bind so the cleanup below is safe even if connect() itself fails.
        connection = None
        try :
            connection = sqlite3.connect('sales.db')
            kwargs['connection'] = connection
            kwargs['cursor'] = connection.cursor()
            print("connect success")
            result = func(*args , **kwargs)
            # Commit only after func succeeded, never after a failure.
            connection.commit()
            return result
        except Exception as e :
            print(f"❌ An error occurred in function '{func.__name__}': {str(e)}")
            if connection is not None:
                # Discard partial writes made before the error.
                connection.rollback()
            return None
        finally :
            if connection is not None:
                print("connect close")
                connection.close()
    return wrapper
            
                 
@check_fun_error
def load_file_with_chunks(path_file, chunk_size) -> None:
    """Load a 10% random sample of the CSV at ``path_file`` into the global ``data``.

    The file is read ``chunk_size`` rows at a time, restricted to the columns
    in ``use_cols`` and typed with ``dtypes``. Each chunk is sampled once
    (``frac=0.1``, ``random_state=2``) and the samples are appended, with a
    fresh index, to whatever rows ``data`` already holds.

    A status line is always printed at the end, depending on whether ``data``
    contains any rows. Exceptions from reading the file propagate to the
    caller after that status line.
    """
    global data
    try :
        # Sample every chunk exactly once and concatenate a single time.
        # Re-sampling the accumulated frame on each iteration would thin the
        # earlier chunks again and again, and concatenating inside the loop
        # copies all rows on every pass.
        sampled_chunks = [
            ch.sample(frac=0.1, random_state=2)
            for ch in pd.read_csv(path_file, usecols=use_cols, dtype=dtypes, chunksize=chunk_size)
        ]
        if len(data) > 0:
            # Keep rows that were already loaded ahead of the new samples.
            sampled_chunks.insert(0, data)
        if sampled_chunks:
            data = pd.concat(sampled_chunks, ignore_index=True)
    finally :
        if len(data) > 0:
            print("✅ Data loaded successfully.")
        else:
            print("⚠️ Something went wrong, data is empty.")


@check_fun_error
@timing
def convert_to_feather_or_parquet(type_convert) :
    """Write the global ``data`` frame to ``data.feather`` or ``data.parquet``.

    ``type_convert`` is matched case-insensitively against ``"feather"`` and
    ``"parquet"``. The return value is whatever the pandas writer returns.
    For any other value a message is printed and ``None`` is returned, so an
    unsupported format no longer passes silently.
    """
    type_convert = type_convert.lower()
    if type_convert == "feather":
        print("You convert to feather ")
        return data.to_feather('data.feather')
    if type_convert == "parquet" :
        print("You convert to parquet  ")
        return data.to_parquet('data.parquet')
    print("Unsupported file type. Please choose from 'feather' or 'parquet'.")
    return None
    
@check_fun_error
@timing
def read_files_feather_parquet_csv(type_file):
    """Read one of the fixed data files and return it as a DataFrame.

    ``type_file`` is matched case-insensitively: ``"feather"`` reads
    ``data.feather``, ``"parquet"`` reads ``data.parquet`` and ``"csv"`` reads
    ``sales_data.csv``. Any other value prints a hint and returns ``None``.
    """
    # format name -> (announcement, pandas reader, file to read)
    readers = {
        "feather": ("You are reading a Feather file", pd.read_feather, "data.feather"),
        "parquet": ("You are reading a Parquet file", pd.read_parquet, "data.parquet"),
        "csv": ("You are reading a CSV file", pd.read_csv, "sales_data.csv"),
    }
    selected = readers.get(type_file.lower())
    if selected is None:
        print("Unsupported file type. Please choose from 'feather', 'parquet', or 'csv'.")
        return None
    announcement, reader, source_path = selected
    print(announcement)
    return reader(source_path)

@check_fun_error
def compare_size_files():
    """Print the on-disk size of the CSV, Feather and Parquet files and name the largest.

    Sizes are read with ``os.path.getsize`` for ``sales_data.csv``,
    ``data.feather`` and ``data.parquet``. If exactly one file is strictly
    larger than the other two, its message is printed; otherwise the
    "equivalent sizes" message is printed.
    """
    # File sizes in bytes.
    csv_bytes = os.path.getsize('sales_data.csv')
    feather_bytes = os.path.getsize('data.feather')
    parquet_bytes = os.path.getsize('data.parquet')

    # Report each size.
    print(f"Taille du fichier CSV: {csv_bytes:.2f} octets")
    print(f"Taille du fichier Feather: {feather_bytes:.2f} octets")
    print(f"Taille du fichier Parquet: {parquet_bytes:.2f} octets")

    # A file "wins" only when it alone holds the maximum size.
    verdict_by_size = {
        "Le fichier CSV est le plus volumineux.": csv_bytes,
        "Le fichier Feather est le plus volumineux.": feather_bytes,
        "Le fichier Parquet est le plus volumineux.": parquet_bytes,
    }
    largest = max(verdict_by_size.values())
    winners = [verdict for verdict, size in verdict_by_size.items() if size == largest]
    if len(winners) == 1:
        print(winners[0])
    else:
        print("Plusieurs fichiers ont des tailles équivalentes.")
        
@check_fun_error
def file_hdf5( file_name , data , data_transaction_supp_100) :
    """Write two datasets into a new HDF5 file at ``file_name``.

    ``data`` is stored under ``sales_sample`` and
    ``data_transaction_supp_100`` under ``sales_high_transaction``. The file
    is opened in ``"w"`` mode. On success ``ok`` is printed; if anything
    raises an ``Exception`` the error text is printed instead and the function
    returns normally.
    """
    datasets = (
        ('sales_sample', data),
        ('sales_high_transaction', data_transaction_supp_100),
    )
    try :
        with hdf5.File(file_name, "w") as archive:
            for dataset_name, payload in datasets:
                archive.create_dataset(dataset_name, data=payload)
        print("ok")
    except Exception as err :
        print(f"❌ An error occurred in function : {str(err)}")

@check_fun_error
def read_first_five_rows_from_hdf5(file_name):
    """Open the HDF5 file ``file_name`` read-only and print the first five rows of ``sales_sample``."""
    with hdf5.File(file_name, 'r') as archive:
        sample = archive['sales_sample']
        print(sample[:5])
        
@check_fun_error
def read_file_from_rows (file_path , chunk_size) :
    """Yield the CSV at ``file_path`` as successive DataFrame chunks.

    ``chunk_size`` is passed to ``pd.read_csv`` as ``chunksize``. The reader is
    used as a context manager so that it is closed when iteration finishes or
    when the generator is closed before the file has been fully read.

    Note: because this is a generator function, ``check_fun_error`` only
    guards the creation of the generator; errors raised while reading reach
    the code that iterates over it.
    """
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            yield chunk

@check_fun_error
def combine_and_calcul_total() :
    """Collect rows of ``sales_data.csv`` with ``quantity > 10`` and print their total value.

    The file is read through ``read_file_from_rows`` in chunks of
    ``chunk_size_rows`` rows. The matching rows are stored in the global
    ``data_transaction`` together with a ``total_value`` column
    (``price * quantity``). The first ten rows and the sum of
    ``total_value`` are printed.

    ``data_transaction`` is rebuilt on every call, so running this function
    again does not duplicate rows or inflate the total.
    """
    global data_transaction
    # Filter each chunk, then concatenate once: concatenating inside the loop
    # would copy all accumulated rows on every iteration.
    matching_chunks = [
        chunk[chunk.quantity > 10]
        for chunk in read_file_from_rows('sales_data.csv'  , chunk_size_rows)
    ]
    if matching_chunks:
        data_transaction = pd.concat(matching_chunks, ignore_index=True)
    else:
        # Nothing was read: keep the two columns used below so the
        # computation still works and yields an empty result.
        data_transaction = pd.DataFrame(columns=['price', 'quantity'])

    data_transaction['total_value'] = data_transaction['price'] * data_transaction['quantity']
    total = data_transaction['total_value'].sum()
    print("Data : \n", data_transaction.head(10))
    print("Total : \n", total)

@check_fun_error
@connect_to_db
def convert_csv_to_sql_db(connection , cursor = None):
    """Write the global ``data_transaction`` frame to the ``sales`` table.

    ``connection`` and ``cursor`` are supplied by ``connect_to_db``. An
    existing ``sales`` table is replaced and the DataFrame index is not
    written.
    """
    data_transaction.to_sql(
        'sales',
        connection,
        if_exists='replace',
        index=False,
    )

@check_fun_error
@connect_to_db
def select_transaction_group_by_eur(cursor ,connection = None ):
    """Return every row of ``sales`` whose region is ``'East'`` and whose price exceeds 50.

    ``cursor`` and ``connection`` are supplied by ``connect_to_db``. The
    result is the list produced by ``fetchall()``.
    """
    query = """ SELECT * FROM sales WHERE region='East' and price > 50   """
    return cursor.execute(query).fetchall()
      
        
# Load the sample only when `data` does not hold any rows yet.
if not len(data):
    load_file_with_chunks(path_file=path_file, chunk_size=chunk_size)

# Rows of the sample whose price is above 100.
data_transaction_supp_100 = data.loc[data.price > 100]

# Optional steps, currently disabled: format conversion, HDF5 round trip,
# timed reads of each format, and file size comparison.
# print("#" * 60 )
# convert_to_feather_or_parquet("feather")

# print("#" * 60 )
# convert_to_feather_or_parquet("parquet")

# print("#" * 60 )
# file_hdf5('sales_data.h5' ,  data=data , data_transaction_supp_100=data_transaction_supp_100)
# read_first_five_rows_from_hdf5('sales_data.h5')
# df = read_files_feather_parquet_csv("feather")


# print("#" * 60 )
# df = read_files_feather_parquet_csv("parquet")
# print("#" * 60 )
# df = read_files_feather_parquet_csv("csv")

# print(df.head(10))

# compare_size_files()

# print(data.info())

# Filter the CSV, store the result in SQLite, then query it back.
combine_and_calcul_total()
convert_csv_to_sql_db()
print(select_transaction_group_by_eur())




✅ Data loaded successfully.
Data : 
    transaction_id  customer_id  product_id  quantity   price transaction_date  \
0               2         6831         260        17  817.46       2023-02-28   
1               3         8325         677        19  105.57       2020-10-11   
2               4         5339         932        11   29.94       2024-07-17   
3               5         3868         546        12   35.08       2022-07-05   
4               6         9910         721        15  625.86       2024-06-30   
5               7         4165         782        17   12.08       2023-06-26   
6              12         2814         532        19  892.34       2021-06-22   
7              13         2085         354        16  918.40       2021-12-09   
8              15         2577         301        17   32.87       2023-10-08   
9              16         3343         509        17  840.91       2020-11-14   

    region  total_value  
0  Central     13896.82  
1    South      200

In [None]:
import h5py
import pandas as pd
import numpy as np


# Small in-memory sample of transactions used to demonstrate HDF5 storage.
# It is named `sample_records` rather than `data` so that running this cell
# does not replace the `data` DataFrame used by the first cell's functions.
sample_records = {
    'TransactionID': [1, 2, 3, 4, 5],
    'Prix': [50, 150, 80, 120, 200],
    'Quantité': [1, 2, 1, 3, 1]
}

# Build the frame and set the column types with numpy dtypes in one step.
normal_data = pd.DataFrame(sample_records).astype({
    'TransactionID': np.int32,
    'Prix': np.float64,
    'Quantité': np.int32,
})

# Keep the transactions whose price is greater than 100.
data_transaction_supp_100 = normal_data[normal_data['Prix'] > 100]

# Create an HDF5 file and store both tables in it.
# The frames are converted to plain arrays explicitly; column labels are
# therefore not part of what is written.
with h5py.File("sales_data.h5", "w") as hd5_file:
    hd5_file.create_dataset('sales_sample', data=normal_data.to_numpy())
    hd5_file.create_dataset('sales_high_transaction', data=data_transaction_supp_100.to_numpy())

with h5py.File('sales_data.h5', 'r') as hdf:
    # Read the first five rows of the 'sales_sample' table.
    print(hdf['sales_sample'][0:5])