In [1]:
import random
from pathlib import Path
from time import perf_counter

import numpy as np
import pandas as pd

In [2]:
rng = np.random.default_rng(seed=0)
random.seed(0)

df = (
    pd.DataFrame(
        data={
            "time_stamp": pd.date_range(start="2022-01-01", end="2023-12-31", freq="s"),
            # "time_stamp": pd.date_range(start="2022-01-01", end="2023-12-31", freq="d"),
        }
    )
    .reset_index()
    .rename(columns={"index": "id"})
)
df.index = df["time_stamp"]
df.drop(columns=["time_stamp"], inplace=True)
df["status"] = [random.choice(["a", "b", "c", None]) for _ in range(df.__len__())]
df["temperature"] = 20 + np.sin(np.arange(0, df.__len__()) / np.pi) * rng.random(
    size=df.__len__()
)
df["humidity"] = 50 + 3 * np.cos(np.arange(0, df.__len__()) / np.pi) * rng.random(
    size=df.__len__()
)
df["ax"] = (
    1 * np.sin(np.arange(0, df.__len__()) / 10 * np.pi) * rng.random(size=df.__len__())
)
df["ay"] = (
    2 * np.cos(np.arange(0, df.__len__()) / 10 * np.pi) * rng.random(size=df.__len__())
)
df["az"] = (
    3 * np.cos(np.arange(0, df.__len__()) / 10 * np.pi) * rng.random(size=df.__len__())
)
df["gx"] = (
    0.3
    * np.sin(np.arange(0, df.__len__()) / 100 * np.pi)
    * rng.random(size=df.__len__())
)
df["gy"] = (
    0.2
    * np.cos(np.arange(0, df.__len__()) / 100 * np.pi)
    * rng.random(size=df.__len__())
)
df["gz"] = (
    0.1
    * np.cos(np.arange(0, df.__len__()) / 100 * np.pi)
    * rng.random(size=df.__len__())
)

In [3]:
df

Unnamed: 0_level_0,id,status,temperature,humidity,ax,ay,az,gx,gy,gz
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-01 00:00:00,0,,20.000000,50.997373,0.000000e+00,1.918437,2.334443,0.000000e+00,0.039687,0.084940
2022-01-01 00:00:01,1,,20.084433,52.020426,1.053799e-01,0.440928,2.163537,6.223359e-03,0.170616,0.006961
2022-01-01 00:00:02,2,a,20.024358,50.148385,3.240486e-01,0.775749,1.590396,1.397122e-02,0.176159,0.003978
2022-01-01 00:00:03,3,c,20.013491,50.605642,8.123136e-02,0.950801,0.747157,9.224194e-03,0.008427,0.041242
2022-01-01 00:00:04,4,,20.777532,50.284606,4.784953e-01,0.316534,0.455103,2.465079e-02,0.185672,0.043297
...,...,...,...,...,...,...,...,...,...,...
2023-12-30 23:59:56,62985596,a,19.948096,48.239712,-3.311479e-01,0.159452,0.090173,-1.841764e-02,0.084984,0.059392
2023-12-30 23:59:57,62985597,,19.732270,48.962753,-2.413562e-01,0.982792,0.254667,-4.170444e-03,0.018205,0.000250
2023-12-30 23:59:58,62985598,,19.923909,49.281474,-3.463108e-01,0.285800,1.427105,-1.888245e-03,0.125264,0.018564
2023-12-30 23:59:59,62985599,a,19.353285,49.540687,-5.775350e-02,1.725683,0.981417,-8.383588e-03,0.040183,0.075259


In [4]:
df.info()

# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 62985601 entries, 2022-01-01 00:00:00 to 2023-12-31 00:00:00
# Data columns (total 10 columns):
#  #   Column       Dtype
# ---  ------       -----
#  0   id           int64
#  1   status       object
#  2   temperature  float64
#  3   humidity     float64
#  4   ax           float64
#  5   ay           float64
#  6   az           float64
#  7   gx           float64
#  8   gy           float64
#  9   gz           float64
# dtypes: float64(8), int64(1), object(1)
# memory usage: 5.2+ GB


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62985601 entries, 2022-01-01 00:00:00 to 2023-12-31 00:00:00
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   status       object 
 2   temperature  float64
 3   humidity     float64
 4   ax           float64
 5   ay           float64
 6   az           float64
 7   gx           float64
 8   gy           float64
 9   gz           float64
dtypes: float64(8), int64(1), object(1)
memory usage: 5.2+ GB


In [5]:
Path("../data").mkdir(parents=True, exist_ok=True)

In [6]:
t_start = perf_counter()
df.to_csv("../data/large_table.csv", index=False)
print(f"elapsed: {perf_counter() - t_start}")
# elapsed: 1090.711631446

elapsed: 1090.711631446


In [7]:
t_start = perf_counter()
df.to_parquet("../data/large_table.parquet", index=None)
print(f"elapsed: {perf_counter() - t_start}")
# elapsed: 46.29157750000013

elapsed: 46.29157750000013


In [4]:
t_start = perf_counter()
df.to_pickle("../data/large_table.pkl")
print(f"elapsed: {perf_counter() - t_start}")
# elapsed: 45.324502749999965

elapsed: 45.324502749999965
