In [None]:
!pip install faker 

In [None]:
!pip install tqdm

## Modelo simples para gerar dados aleatórios com faker

In [7]:
# Gera dados aleatórios 
import faker
import os
import sys
import time
from tqdm import tqdm
import random

# Cria um objeto Faker
fake = faker.Faker()

def format_time(segundos):
    """
    Formata os milisegundos em hora:minuto:segundo
    """
    if segundos < 60:
        return f"{segundos:.3f} segundos"
    elif segundos < 3600:
        minutos, segundos = divmod(segundos, 60)
        return f"{int(minutos)} minutos {int(segundos)} segundos"
    else:
        horas, remainder = divmod(seconds, 3600)
        minutos, segundos = divmod(remainder, 60)
        if minutos == 0:
            return f"{int(horas)} horas {int(segundos)} segundos"
        else:
            return f"{int(horas)} horas {int(minutos)} minutos {int(segundos)} segundos"

# Função para gerar os dados de teste
def generate_test_data(num_rows):
  """
  Gera dados de teste e os escreve em um arquivo.

  Argumentos:
    num_rows: Número de linhas a serem geradas.

  Retorno:
    None.
  """
  with open("data/measurements1.txt", "w", encoding="utf-8") as file:
    for _ in tqdm(range(num_rows)):
      # Gera dados aleatórios
      station_name = fake.city()
      temperature = round(fake.random_element([random.uniform(-50, 50), random.uniform(50, 100)]), 1)

      # Escreve a linha no arquivo
      file.write(f"{station_name};{temperature}\n")

# Solicita o número de linhas ao usuário
num_rows = int(input("Digite o número de linhas a serem geradas: "))

# Gera os dados de teste
start_time = time.time()
generate_test_data(num_rows)
end_time = time.time()

# Exibe o tempo real de geração
tempo_em_segundos = end_time - start_time
print(f"Tempo real de geração:{format_time(tempo_em_segundos)}")

# Converte o tamanho do arquivo para megabytes
file_size = os.path.getsize("data/measurements1.txt")
file_size_mb = file_size / (1024 * 1024)

# Exibe o tamanho do arquivo em megabytes
print(f"Tamanho real do arquivo: {file_size_mb:.2f} Megabytes")

Digite o número de linhas a serem geradas:  1000000


100%|██████████████████████████████████████████| 1000000/1000000 [01:57<00:00, 8520.61it/s]

Tempo real de geração:1 minutos 57 segundos
Tamanho real do arquivo: 17.36 Megabytes





## Modelo para gerar dados aleatórios com faker e multiprocessing.

In [8]:
import faker
import os
import sys
import time
import random
from tqdm import tqdm
from multiprocessing import Pool

# Cria um objeto Faker
fake = faker.Faker()

# Formata o tempo.
def format_time(segundos):
    """
    Formata os milisegundos em hora:minuto:segundo
    """
    if segundos < 60:
        return f"{segundos:.3f} segundos"
    elif segundos < 3600:
        minutos, segundos = divmod(segundos, 60)
        return f"{int(minutos)} minutos {int(segundos)} segundos"
    else:
        horas, remainder = divmod(seconds, 3600)
        minutos, segundos = divmod(remainder, 60)
        if minutos == 0:
            return f"{int(horas)} horas {int(segundos)} segundos"
        else:
            return f"{int(horas)} horas {int(minutos)} minutos {int(segundos)} segundos"

# Função para gerar dados de teste
def generate_test_data(num_rows, chunk_size):
  """
  Gera dados de teste e os escreve em um arquivo.

  Argumentos:
    num_rows: Número de linhas a serem geradas.
    chunk_size: Tamanho do bloco de dados a ser gerado por cada thread.

  Retorno:
    None.
  """
  with open("data/measurements.txt", "w", encoding="utf-8") as file:
    for i in tqdm(range(0, num_rows, chunk_size)):
      # Gera um bloco de dados
      data_chunk = []
      for _ in range(chunk_size):
        station_name = fake.city()
        temperature = round(fake.random_element([random.uniform(-50, 50), random.uniform(50, 100)]), 1)
        data_chunk.append(f"{station_name};{temperature}\n")

      # Escreve o bloco de dados no arquivo
      file.writelines(data_chunk)

# Solicita o número de linhas ao usuário
num_rows = int(input("Digite o número de linhas a serem geradas: "))

# Define o tamanho do bloco de dados
chunk_size = 10_000  # Ajuste este valor de acordo com a sua memória disponível

# Número de threads
num_workers = os.cpu_count()

# Gera os dados de teste em paralelo
start_time = time.time()
with Pool(num_workers) as pool:
  pool.starmap(generate_test_data, [(num_rows, chunk_size)])
end_time = time.time()

# Exibe o tempo real de geração
tempo_em_segundos = end_time - start_time
print(f"Tempo real de geração:{format_time(tempo_em_segundos)}")

# Exibe o tamanho real do arquivo
file_size = os.path.getsize("data/measurements.txt")

# Converte o tamanho do arquivo para megabytes
file_size_mb = file_size / (1024 * 1024)

# Exibe o tamanho do arquivo em megabytes
print(f"Tamanho real do arquivo: {file_size_mb:.2f} Megabytes")

Digite o número de linhas a serem geradas:  1000000


100%|████████████████████████████████████████████████████| 100/100 [01:46<00:00,  1.07s/it]


Tempo real de geração:1 minutos 47 segundos
Tamanho real do arquivo: 17.36 Megabytes


## Scrip python para gerar dados aleatórios
* **O script foi retirado do desafio [The One Billion Row Challenge](https://github.com/gunnarmorling/1brc), originalmente proposto para Java.**

In [5]:
# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java

import os
import sys
import random
import time


def check_args(file_args):
    """
    Sanity checks out input and prints out usage if input is not a positive integer
    """
    try:
        if len(file_args) != 2 or int(file_args[1]) <= 0:
            raise Exception()
    except:
        print("Usage:  create_measurements.sh <positive integer number of records to create>")
        print("        You can use underscore notation for large number of records.")
        print("        For example:  1_000_000_000 for one billion")
        exit()


def build_weather_station_name_list():
    """
    Grabs the weather station names from example data provided in repo and dedups
    """
    station_names = []
    with open('data/weather_stations.csv', 'r') as file:
        file_contents = file.read()
    for station in file_contents.splitlines():
        if "#" in station:
            next
        else:
            station_names.append(station.split(';')[0])
    return list(set(station_names))


def convert_bytes(num):
    """
    Convert bytes to a human-readable format (e.g., KiB, MiB, GiB)
    """
    for x in ['bytes', 'KiB', 'MiB', 'GiB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def format_elapsed_time(seconds):
    """
    Format elapsed time in a human-readable format
    """
    if seconds < 60:
        return f"{seconds:.3f} seconds"
    elif seconds < 3600:
        minutes, seconds = divmod(seconds, 60)
        return f"{int(minutes)} minutes {int(seconds)} seconds"
    else:
        hours, remainder = divmod(seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        if minutes == 0:
            return f"{int(hours)} hours {int(seconds)} seconds"
        else:
            return f"{int(hours)} hours {int(minutes)} minutes {int(seconds)} seconds"


def estimate_file_size(weather_station_names, num_rows_to_create):
    """
    Tries to estimate how large a file the test data will be
    """
    total_name_bytes = sum(len(s.encode("utf-8")) for s in weather_station_names)
    avg_name_bytes = total_name_bytes / float(len(weather_station_names))

    # avg_temp_bytes = sum(len(str(n / 10.0)) for n in range(-999, 1000)) / 1999
    avg_temp_bytes = 4.400200100050025

    # add 2 for separator and newline
    avg_line_length = avg_name_bytes + avg_temp_bytes + 2

    human_file_size = convert_bytes(num_rows_to_create * avg_line_length)

    return f"Estimated max file size is:  {human_file_size}."


def build_test_data(weather_station_names, num_rows_to_create):
    """
    Generates and writes to file the requested length of test data
    """
    start_time = time.time()
    coldest_temp = -99.9
    hottest_temp = 99.9
    station_names_10k_max = random.choices(weather_station_names, k=10_000)
    batch_size = 10000 # instead of writing line by line to file, process a batch of stations and put it to disk
    chunks = num_rows_to_create // batch_size
    print('Building test data...')

    try:
        with open("data/measurements1.txt", 'w') as file:
            progress = 0
            for chunk in range(chunks):
                
                batch = random.choices(station_names_10k_max, k=batch_size)
                prepped_deviated_batch = '\n'.join([f"{station};{random.uniform(coldest_temp, hottest_temp):.1f}" for station in batch]) # :.1f should quicker than round on a large scale, because round utilizes mathematical operation
                file.write(prepped_deviated_batch + '\n')
                
                # Update progress bar every 1%
                if (chunk + 1) * 100 // chunks != progress:
                    progress = (chunk + 1) * 100 // chunks
                    bars = '=' * (progress // 2)
                    sys.stdout.write(f"\r[{bars:<50}] {progress}%")
                    sys.stdout.flush()
        sys.stdout.write('\n')
    except Exception as e:
        print("Something went wrong. Printing error info and exiting...")
        print(e)
        exit()
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    file_size = os.path.getsize("data/measurements1.txt")
    human_file_size = convert_bytes(file_size)
 
    print("Test data successfully written to data/measurements1.txt")
    print(f"Actual file size:  {human_file_size}")
    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")


def main():
    """
    main program function
    """
    num_rows_to_create = 1_000_000
    weather_station_names = []
    weather_station_names = build_weather_station_name_list()
    print(estimate_file_size(weather_station_names, num_rows_to_create))
    build_test_data(weather_station_names, num_rows_to_create)
    print("Test data build complete.")


if __name__ == "__main__":
    main()
exit()

### Conclusão
* **O modelo para gerar dados com o faker se saiu extremamente lento em comparação ao script do [Gunnar Morling](https://github.com/gunnarmorling) acima.**

## Gerando 1 bilhão de linhas:

In [6]:
# gerando 1 bilhão de linhas
num_rows_to_create = 1_000_000_000
weather_station_names = []
weather_station_names = build_weather_station_name_list()
print(estimate_file_size(weather_station_names, num_rows_to_create))
build_test_data(weather_station_names, num_rows_to_create)
print("Test data build complete.")




Estimated max file size is:  14.8 GiB.
Building test data...
Test data successfully written to data/measurements.txt
Actual file size:  14.9 GiB
Elapsed time: 15 minutes 3 seconds
Test data build complete.


## Decoradores

In [2]:
# Criando um decorator para calcular o tempo de processamento
def timer(func):
    import time
    # Formata o tempo.
    def format_time(segundos: int): 
        """
        Formata os milisegundos em hora:minuto:segundo
        """
        if segundos < 60:
            return f"{segundos:.3f} segundos"
        elif segundos < 3600:
            minutos, segundos = divmod(segundos, 60)
            return f"{int(minutos)} minutos {int(segundos)} segundos"
        else:
            horas, remainder = divmod(seconds, 3600)
            minutos, segundos = divmod(remainder, 60)
            if minutos == 0:
                return f"{int(horas)} horas {int(segundos)} segundos"
            else:
                return f"{int(horas)} horas {int(minutos)} minutos {int(segundos)} segundos"  

    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        tempo_em_segundos = end - start       
        # Exibe o tempo real de geração    
        print(f"{func.__name__} Tempo de processamento:{format_time(tempo_em_segundos)}")   
        #print(type(format_time(tempo_em_segundos)))
        return result
    return wrapper


## Leitura com pandas

In [None]:
# Instalando a lib pandas.
!poetry add pandas

In [1]:
# Script para min, max e mean de um bilhão de linhas com pandas.
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
from tqdm import tqdm  # importa o tqdm para barra de progresso

CONCURRENCY = cpu_count()

total_linhas = 1_000_000_000  # Total de linhas conhecido
chunksize = 100_000_000  # Define o tamanho do chunk
filename = "data/measurements.txt"  # Certifique-se de que este é o caminho correto para o arquivo

# Formata o tempo.
def format_time(segundos):
    """
    Formata os milisegundos em hora:minuto:segundo
    """
    if segundos < 60:
        return f"{segundos:.3f} segundos"
    elif segundos < 3600:
        minutos, segundos = divmod(segundos, 60)
        return f"{int(minutos)} minutos {int(segundos)} segundos"
    else:
        horas, remainder = divmod(seconds, 3600)
        minutos, segundos = divmod(remainder, 60)
        if minutos == 0:
            return f"{int(horas)} horas {int(segundos)} segundos"
        else:
            return f"{int(horas)} horas {int(minutos)} minutos {int(segundos)} segundos"

def process_chunk(chunk):
    # Agrega os dados dentro do chunk usando Pandas
    aggregated = chunk.groupby('station')['measure'].agg(['min', 'max', 'mean']).reset_index()
    return aggregated

def create_df_with_pandas(filename, total_linhas, chunksize=chunksize):
    total_chunks = total_linhas // chunksize + (1 if total_linhas % chunksize else 0)
    results = []

    with pd.read_csv(filename, sep=';', header=None, names=['station', 'measure'], chunksize=chunksize) as reader:
        # Envolvendo o iterador com tqdm para visualizar o progresso
        with Pool(CONCURRENCY) as pool:
            for chunk in tqdm(reader, total=total_chunks, desc="Processando"):
                # Processa cada chunk em paralelo
                result = pool.apply_async(process_chunk, (chunk,))
                results.append(result)

            results = [result.get() for result in results]

    final_df = pd.concat(results, ignore_index=True)

    final_aggregated_df = final_df.groupby('station').agg({
        'min': 'min',
        'max': 'max',
        'mean': 'mean'
    }).reset_index().sort_values('station')

    return final_aggregated_df

if __name__ == "__main__":
    import time

    print("Iniciando o processamento do arquivo.")
    start_time = time.time()
    df = create_df_with_pandas(filename, total_linhas, chunksize)
    end_time = time.time()
    tempo_em_segundos = end_time - start_time
    print(df.head())
    # Exibe o tempo real de geração    
    print(f"Tempo de processamento:{format_time(tempo_em_segundos)}")


Iniciando o processamento do arquivo.


Processando:  60%|█████████████████████████▏                | 6/10 [05:24<03:40, 55.25s/it]IOStream.flush timed out
Processando:  70%|█████████████████████████████▍            | 7/10 [06:20<02:46, 55.35s/it]IOStream.flush timed out
Processando:  90%|█████████████████████████████████████▊    | 9/10 [08:11<00:55, 55.40s/it]IOStream.flush timed out
Processando: 100%|█████████████████████████████████████████| 10/10 [09:20<00:00, 56.07s/it]


       station   min   max      mean
0     Aabenraa -99.9  99.9 -0.061535
1       Aalten -99.9  99.9 -0.161205
2    Abadiânia -99.9  99.9 -0.051514
3     Abalessa -99.9  99.9  0.211034
4  Abangaritos -99.9  99.9  0.066196
Tempo de processamento:-1709939632.573 segundos


## Leitura com Polars

In [None]:
# Instalando a lib polars
!poetry add polars

In [3]:
import polars as pl
@timer
def create_polars_df(): 
    pl.Config.set_streaming_chunk_size(5000000)
    # Leitura do arquivo CSV e definição do schema
    return (pl.scan_csv("data/measurements.txt", separator=";", has_header=False,
                        schema={"station": pl.String, "measure": pl.Float64})
                        .group_by("station").agg(
                                                 max_temp=pl.col("measure").max(),
                                                 min_temp=pl.col("measure").min(),
                                                 mean_temp=pl.col("measure").mean()
                                                ).sort("station").collect(streaming=True)
           )   
if __name__ == "__main__":    
    df = create_polars_df()
    print(df)
   


create_polars_df Tempo de processamento:26.558 segundos
shape: (8_836, 4)
┌────────────────┬──────────┬──────────┬───────────┐
│ station        ┆ max_temp ┆ min_temp ┆ mean_temp │
│ ---            ┆ ---      ┆ ---      ┆ ---       │
│ str            ┆ f64      ┆ f64      ┆ f64       │
╞════════════════╪══════════╪══════════╪═══════════╡
│ Aabenraa       ┆ 99.9     ┆ -99.9    ┆ -0.062295 │
│ Aalten         ┆ 99.9     ┆ -99.9    ┆ -0.161752 │
│ Abadiânia      ┆ 99.9     ┆ -99.9    ┆ -0.052772 │
│ Abalessa       ┆ 99.9     ┆ -99.9    ┆ 0.211089  │
│ Abangaritos    ┆ 99.9     ┆ -99.9    ┆ 0.063772  │
│ …              ┆ …        ┆ …        ┆ …         │
│ ’Aïn Abessa    ┆ 99.9     ┆ -99.9    ┆ -0.013338 │
│ ’Aïn Azel      ┆ 99.9     ┆ -99.9    ┆ -0.040762 │
│ ’Aïn Roua      ┆ 99.9     ┆ -99.9    ┆ -0.026898 │
│ ’s-Gravenzande ┆ 99.9     ┆ -99.9    ┆ -0.080973 │
│ ’s-Heerenberg  ┆ 99.9     ┆ -99.9    ┆ 0.168265  │
└────────────────┴──────────┴──────────┴───────────┘


## Leitura com Duckdb

In [None]:
!poetry add duckdb

In [7]:
import duckdb
@timer
def create_duckdb():
    duckdb.sql("""
        SELECT station,
            MIN(measure) AS min_temperature,
            CAST(AVG(measure) AS DECIMAL()) AS mean_temperature,
            MAX(measure) AS max_temperature
        FROM read_csv("data/measurements.txt", AUTO_DETECT=FALSE, sep=';', columns={'station':VARCHAR, 'measure': 'DECIMAL'})
        GROUP BY station
        ORDER BY station
    """).show()

if __name__ == "__main__":
    create_duckdb()   

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────────┬─────────────────┬──────────────────┬─────────────────┐
│     station     │ min_temperature │ mean_temperature │ max_temperature │
│     varchar     │  decimal(18,3)  │  decimal(18,3)   │  decimal(18,3)  │
├─────────────────┼─────────────────┼──────────────────┼─────────────────┤
│ Aabenraa        │         -99.900 │           -0.062 │          99.900 │
│ Aalten          │         -99.900 │           -0.162 │          99.900 │
│ Abadiânia       │         -99.900 │           -0.053 │          99.900 │
│ Abalessa        │         -99.900 │            0.211 │          99.900 │
│ Abangaritos     │         -99.900 │            0.064 │          99.900 │
│ Abano Terme     │         -99.900 │            0.172 │          99.900 │
│ Abaré           │         -99.900 │           -0.267 │          99.900 │
│ Abbeville       │         -99.900 │            0.074 │          99.900 │
│ Abbiategrasso   │         -99.900 │           -0.195 │          99.900 │
│ Abbots Langley  │      

In [9]:
import duckdb
@timer
def create_duckdb(): 
    conn = duckdb.connect(':memory:')
    start_time = time.time()
    display(conn.execute("""
            SELECT station,
                MIN(measure) AS min_temperature,
                CAST(AVG(measure) AS DECIMAL()) AS mean_temperature,
                MAX(measure) AS max_temperature
            FROM read_csv("data/measurements.txt", AUTO_DETECT=FALSE, sep=';', columns={'station':VARCHAR, 'measure': 'DECIMAL'})
            GROUP BY station
            ORDER BY station
        """).df())
    
if __name__ == "__main__":
    create_duckdb()   

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,station,min_temperature,mean_temperature,max_temperature
0,Aabenraa,-99.9,-0.062,99.9
1,Aalten,-99.9,-0.162,99.9
2,Abadiânia,-99.9,-0.053,99.9
3,Abalessa,-99.9,0.211,99.9
4,Abangaritos,-99.9,0.064,99.9
...,...,...,...,...
8831,’Aïn Abessa,-99.9,-0.013,99.9
8832,’Aïn Azel,-99.9,-0.041,99.9
8833,’Aïn Roua,-99.9,-0.027,99.9
8834,’s-Gravenzande,-99.9,-0.081,99.9


create_duckdb Tempo de processamento:19.675 segundos


In [9]:
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,measurements


In [10]:
display(conn.execute('Select * from measurements limit 10').df())

Unnamed: 0,station,measure
0,Uppalapādu,-29.9
1,Kaman,-34.0
2,Tamu,-40.2
3,Quillabamba,50.6
4,Jabuticabal,15.8
5,Kilibo,-98.4
6,Tillaivilāgam,48.3
7,Ensenada,97.4
8,Indio,64.6
9,Buxton,-54.7


In [30]:
conn.close()

## Dask - min, max e mean em 1 bilhão de linhas

In [None]:
#necessário Instalar.
!pip install dask-expr

In [20]:
%%time
import dask
import dask.dataframe as dd
from tqdm import tqdm
dask.config.set({'dataframe.query-planning': True})
# Ler o arquivo txt diretamente em um DataFrame Dask
df = dd.read_csv('data/measurements.txt', delimiter=';', header=None, names=['City', 'Temperature'])
# min, max, e mean pela cidade ordenado pelo index
df.groupby('City').agg({'Temperature': ['max','min','mean']}).compute().sort_index()

CPU times: user 12min 50s, sys: 37.9 s, total: 13min 28s
Wall time: 6min 27s


Unnamed: 0_level_0,Temperature,Temperature,Temperature
Unnamed: 0_level_1,max,min,mean
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Aalborg,99.9,-99.9,0.177352
Aalten,99.9,-99.9,-0.107256
Aartselaar,99.9,-99.9,0.098415
Aasiaat,99.9,-99.9,-0.001204
Abaetetuba,99.9,-99.9,0.315703
...,...,...,...
‘Aqrah,99.9,-99.9,0.388061
’Ali Ben Sliman,99.9,-99.9,-0.074811
’Ayn Bni Mathar,99.9,-99.9,0.024268
’s-Gravendeel,99.9,-99.9,0.094909


In [1]:
import pandas as pd
import dask
dask.config.set({'dataframe.query-planning': True})

<dask.config.set at 0x7fdf0b4c51d0>

In [7]:
%%time
display(df.head())

Unnamed: 0,City,Temperature
0,Uppalapādu,-29.9
1,Kaman,-34.0
2,Tamu,-40.2
3,Quillabamba,50.6
4,Jabuticabal,15.8


CPU times: user 793 ms, sys: 59.1 ms, total: 853 ms
Wall time: 849 ms


In [8]:
import time
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
start_time = time.time()
# Ler o arquivo txt diretamente em um DataFrame Dask
df = dd.read_csv('data/measurements.txt', delimiter=';', header=None, names=['City', 'Temperature'])
# min, max, e mean pela cidade ordenado pelo index
print(df.groupby('City').
agg({'Temperature': ['max','min','mean']}).
compute().
sort_index())
end_time = time.time() - start_time
print(f"tempo de processamento com Dask:{end_time:.2f} segundos")

                 min_temperature  max_temperature  avg_temperature
City                                                              
Aalborg                    -99.9             99.9         0.177352
Aalten                     -99.9             99.9        -0.107256
Aartselaar                 -99.9             99.9         0.098415
Aasiaat                    -99.9             99.9        -0.001204
Abaetetuba                 -99.9             99.9         0.315703
...                          ...              ...              ...
‘Aqrah                     -99.9             99.9         0.388061
’Ali Ben Sliman            -99.9             99.9        -0.074811
’Ayn Bni Mathar            -99.9             99.9         0.024268
’s-Gravendeel              -99.9             99.9         0.094909
’s-Gravenzande             -99.9             99.9         0.122677

[8912 rows x 3 columns]


In [9]:
type(df)

dask.dataframe.core.DataFrame

In [11]:
%%time
df_agrupado = df.groupby('City')

CPU times: user 336 µs, sys: 25 µs, total: 361 µs
Wall time: 376 µs


In [17]:
df_agrupado['Temperature'].min().compute()

City
Aalborg           -99.9
Aalten            -99.9
Aartselaar        -99.9
Aasiaat           -99.9
Abaetetuba        -99.9
                   ... 
‘Aqrah            -99.9
’Ali Ben Sliman   -99.9
’Ayn Bni Mathar   -99.9
’s-Gravendeel     -99.9
’s-Gravenzande    -99.9
Name: Temperature, Length: 8912, dtype: float64

In [None]:
import time
    start_time = time.time()
    create_duckdb()
    took = time.time() - start_time

    print(f"Duckdb Took: {took:.2f} sec")

In [4]:
conn.execute("""
        SELECT station,
            MIN(measure) AS min_temperature,
            CAST(AVG(measure) AS DECIMAL()) AS mean_temperature,
            MAX(measure) AS max_temperature
        FROM read_csv("data/measurements.txt", AUTO_DETECT=FALSE, sep=';', columns={'station':VARCHAR, 'measure': 'DECIMAL'})
        GROUP BY station
        ORDER BY station
    """).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,station,min_temperature,mean_temperature,max_temperature
0,Aalborg,-99.9,0.177,99.9
1,Aalten,-99.9,-0.107,99.9
2,Aartselaar,-99.9,0.098,99.9
3,Aasiaat,-99.9,-0.001,99.9
4,Abaetetuba,-99.9,0.316,99.9
...,...,...,...,...
8907,‘Aqrah,-99.9,0.388,99.9
8908,’Ali Ben Sliman,-99.9,-0.075,99.9
8909,’Ayn Bni Mathar,-99.9,0.024,99.9
8910,’s-Gravendeel,-99.9,0.095,99.9


In [18]:
%time df.groupby('City').agg({'Temperature': ['max','min','mean']}).compute().sort_index()

CPU times: user 12min 47s, sys: 39.3 s, total: 13min 27s
Wall time: 6min 26s


Unnamed: 0_level_0,Temperature,Temperature,Temperature
Unnamed: 0_level_1,max,min,mean
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Aalborg,99.9,-99.9,0.177352
Aalten,99.9,-99.9,-0.107256
Aartselaar,99.9,-99.9,0.098415
Aasiaat,99.9,-99.9,-0.001204
Abaetetuba,99.9,-99.9,0.315703
...,...,...,...
‘Aqrah,99.9,-99.9,0.388061
’Ali Ben Sliman,99.9,-99.9,-0.074811
’Ayn Bni Mathar,99.9,-99.9,0.024268
’s-Gravendeel,99.9,-99.9,0.094909


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Inicializar uma sessão Spark
spark = SparkSession.builder \
    .appName("Temperature Analysis") \
    .getOrCreate()

# Ler o arquivo CSV diretamente em um DataFrame Spark
df = spark.read.option("header", "false").option("delimiter", ";").csv("data/measurement.txt") \
    .toDF("City", "Temperature")

# Converter a coluna 'Temperature' para tipo numérico
df = df.withColumn("Temperature", col("Temperature").cast("float"))

# Calcular estatísticas usando Spark SQL
statistics = df.groupBy("City") \
    .agg({"Temperature": "min", "Temperature": "max", "Temperature": "avg"}) \
    .withColumnRenamed("min(Temperature)", "Min Temperature") \
    .withColumnRenamed("max(Temperature)", "Max Temperature") \
    .withColumnRenamed("avg(Temperature)", "Avg Temperature")

# Ordenar as estatísticas pela cidade
statistics_sorted = statistics.orderBy("City")

# Mostrar as estatísticas
statistics_sorted.show()

# Encerrar a sessão Spark
spark.stop()


                                                                                

+--------------+--------------------+
|          City|     Avg Temperature|
+--------------+--------------------+
|       Aalborg| 0.17735169638322507|
|        Aalten|-0.10725623892898627|
|    Aartselaar| 0.09841461554829589|
|       Aasiaat|-0.00120383189600...|
|    Abaetetuba| 0.31570318422244587|
|       Abaiara|-0.10785455016378943|
|Abasingammedda| 0.24278637758252167|
|          Abaí|  0.2265865596195253|
|        Abaíra|-0.07328732648552085|
|    Abbotsford|-0.15673792767205766|
|      Abdulino|-0.05927886913367...|
|  Abelardo Luz|-0.16398459605845808|
|      Abergele|0.011267956863166902|
|      Abertawe| 0.27357638913117077|
|         Abhia| -0.0993893431241328|
|         Abiko| -0.0918752473903614|
|       Abilene| -0.1804415891195874|
|       Aboisso|-0.43771594039828343|
|         Aboso| 0.03154420554692955|
|         Abram|-0.14073774133254255|
+--------------+--------------------+
only showing top 20 rows



In [2]:
!poetry add pyspark

Using version [39;1m^3.5.1[39;22m for [36mpyspark[39m

[34mUpdating dependencies[39m
[2K[34mResolving dependencies...[39m [39;2m(18.6s)[39;22m://files.pythonhosted.org/packages/73/e5/c9eb78cc982dafb7b5834bc5c368fe596216c8b9f7c4b4ffa104c4d2ab8f/pyspark-3.5.1.tar.gz  99%[39m [39;2m(10.6s)[39;22m[34mResolving dependencies...[39m [36mDownloading https://files.pythonhosted.org/packages/73/e5/c9eb78cc982dafb7b5834bc5c368fe596216c8b9f7c4b4ffa104c4d2ab8f/pyspark-3.5.1.tar.gz  29%[39m [39;2m(2.9s)[39;22m[34mResolving dependencies...[39m [36mDownloading https://files.pythonhosted.org/packages/73/e5/c9eb78cc982dafb7b5834bc5c368fe596216c8b9f7c4b4ffa104c4d2ab8f/pyspark-3.5.1.tar.gz  59%[39m [39;2m(5.6s)[39;22m[34mResolving dependencies...[39m [36mDownloading https://files.pythonhosted.org/packages/73/e5/c9eb78cc982dafb7b5834bc5c368fe596216c8b9f7c4b4ffa104c4d2ab8f/pyspark-3.5.1.tar.gz  86%[39m [39;2m(8.0s)[39;22m[34mResolving dependencies...[39m [39;2m(12.5s)[39;22

In [None]:
import polars as pl

# Ler o arquivo CSV diretamente em um DataFrame Polars
df = pl.read_csv('data/measurement.txt', has_header=True)

# Calcular estatísticas
statistics = df.groupby('City').agg(
    pl.min(df['Temperature']).alias('Min Temperature'),
    pl.max(df['Temperature']).alias('Max Temperature'),
    pl.avg(df['Temperature']).alias('Avg Temperature')
)

# Ordenar as estatísticas pela cidade
statistics_sorted = statistics.sort('City')

# Exibir as estatísticas
print(statistics_sorted)



In [3]:
import dask.dataframe as dd

# Ler o arquivo CSV diretamente em um DataFrame Dask
df = dd.read_csv('data/measurement.txt', delimiter=';', header=None, names=['City', 'Temperature'])

# Converter a coluna 'Temperature' para tipo numérico
df['Temperature'] = df['Temperature'].astype(float)

# Calcular estatísticas
statistics = df.groupby('City').agg(
    min_temperature=('Temperature', 'min'),
    max_temperature=('Temperature', 'max'),
    avg_temperature=('Temperature', 'mean')
).compute()

# Ordenar as estatísticas pela cidade
statistics_sorted = statistics.sort_index()

# Exibir as estatísticas
print(statistics_sorted)


In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


                 min_temperature  max_temperature  avg_temperature
City                                                              
Aalborg                    -99.9             99.9         0.177352
Aalten                     -99.9             99.9        -0.107256
Aartselaar                 -99.9             99.9         0.098415
Aasiaat                    -99.9             99.9        -0.001204
Abaetetuba                 -99.9             99.9         0.315703
...                          ...              ...              ...
‘Aqrah                     -99.9             99.9         0.388061
’Ali Ben Sliman            -99.9             99.9        -0.074811
’Ayn Bni Mathar            -99.9             99.9         0.024268
’s-Gravendeel              -99.9             99.9         0.094909
’s-Gravenzande             -99.9             99.9         0.122677

[8912 rows x 3 columns]


In [5]:
import vaex
import time

def main(filename):
    start_time = time.time()

    # Leitura do arquivo CSV utilizando Vaex
    df = vaex.from_csv(filename, names=['city', 'temperature'], sep=';')

    # Cálculo das estatísticas
    min_temperature = df.groupby(df['city']).agg({'temperature': 'min'})
    max_temperature = df.groupby(df['city']).agg({'temperature': 'max'})
    mean_temperature = df.groupby(df['city']).agg({'temperature': 'mean'})

    # Concatenação dos resultados
    combined_results = vaex.concat([min_temperature, max_temperature, mean_temperature], axis=1)
    combined_results.columns = ['min_temperature', 'max_temperature', 'mean_temperature']

    # Exibição dos resultados
    print(combined_results)

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {format_time(execution_time)}")

def format_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)

if __name__ == "__main__":
    filename = "data/measurements.txt"
    main(filename)


ModuleNotFoundError: No module named 'vaex'

In [2]:
!poetry add dask

Using version [39;1m^2024.2.1[39;22m for [36mdask[39m

[34mUpdating dependencies[39m
[2K[34mResolving dependencies...[39m [39;2m(1.0s)[39;22m

[39;1mPackage operations[39;22m: [34m7[39m installs, [34m0[39m updates, [34m0[39m removals

  [34;1m•[39;22m [39mInstalling [39m[36mlocket[39m[39m ([39m[39;1m1.0.0[39;22m[39m)[39m: [34mPending...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mtoolz[39m[39m ([39m[39;1m0.12.1[39;22m[39m)[39m: [34mPending...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mzipp[39m[39m ([39m[39;1m3.17.0[39;22m[39m)[39m: [34mPending...[39m
[1A[0J  [34;1m•[39;22m [39mInstalling [39m[36mzipp[39m[39m ([39m[39;1m3.17.0[39;22m[39m)[39m: [34mInstalling...[39m
[1A[0J  [32;1m•[39;22m [39mInstalling [39m[36mzipp[39m[39m ([39m[32m3.17.0[39m[39m)[39m
[2A[0J  [32;1m•[39;22m [39mInstalling [39m[36mzipp[39m[39m ([39m[32m3.17.0[39m[39m)[39m
[1A[0J  [34;1m•[39;22m [39mInstalling [

In [None]:
<a id="ancora01"></a>

In [None]:
<a id="ancora02"></a>

In [None]:
<a id="ancora03"></a>

In [None]:
<a id="ancora04"></a>

In [None]:
<a id="ancora05"></a>

In [None]:
<a id="ancora06"></a>

In [None]:
<a id="ancora07"></a>

In [None]:
<a id="ancora08"></a>

In [None]:
<a id="ancora09"></a>

In [None]:
<a id="ancora10"></a>

In [None]:
<a id="ancora11"></a>

In [None]:
<a id="ancora12"></a>

In [None]:
<a id="ancora13"></a>

In [None]:
<a id="ancora14"></a>

In [None]:
<a id="ancora15"></a>

In [None]:
<a id="ancora16"></a>

In [None]:
<a id="ancora17"></a>

In [None]:
<a id="ancora18"></a>

In [None]:
<a id="ancora19"></a>

In [None]:
<a id="ancora20"></a>