In [14]:
import pandas as pd
import numpy as np

In [15]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with mixed column types for I/O benchmarks.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)``, so repeated calls with the same seed
        return identical frames. When omitted, the global ``np.random`` state
        is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``size``, ``age``, ``team``, ``win``, ``date`` and ``prob``,
        in that order, with whatever dtypes pandas infers from the raw draws.
    """
    # The np.random module and RandomState expose the same choice/randint/uniform
    # methods, so one code path serves both the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create Fake Dataset (the draw order is kept fixed so that seeding the
    # global state and passing `seed` produce the same rows).
    df = pd.DataFrame()
    df['size'] = rng.choice(['big', 'medium', 'small'], size)
    df['age'] = rng.randint(1, 50, size)  # upper bound is exclusive: 1..49
    df['team'] = rng.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = rng.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = rng.choice(dates, size)
    df['prob'] = rng.uniform(0, 1, size)
    return df

def set_dtypes(df):
    """Convert the benchmark frame's columns to compact dtypes.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and ``win`` is converted from the strings
    ``'yes'``/``'no'`` to booleans (any other string maps to NaN).

    The frame is modified in place and also returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work. Calling the function
    again on an already-converted frame leaves it unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Mapping an already-boolean column through the yes/no dict would replace
    # every value with NaN, so the conversion is skipped on a repeated call.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df

In [16]:
# Generate one million synthetic rows and shrink the columns to compact dtypes.
df = set_dtypes(get_dataset(1_000_000))

# CSV

In [17]:
%timeit df.to_csv('test.csv', index=False)
%timeit df = pd.read_csv('test.csv')

8.52 s ± 233 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
563 ms ± 34.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# NOTE(review): this describes the original in-memory `df`, not the data read
# back from test.csv -- the %timeit cell above leaves `df` untouched, which is
# why category/int16/float32 dtypes still appear in the summary. To inspect the
# CSV round trip, load the file into its own variable and call .info() on that.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float32       
dtypes: bool(1), category(2), datetime64[ns](1), float32(1), int16(1)
memory usage: 16.2 MB


# Pickle

In [19]:
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle')

10.8 ms ± 266 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
10.2 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# NOTE(review): this describes the original in-memory `df`, not a frame read
# back from test.pickle -- the %timeit cell above leaves `df` untouched. To
# check the dtypes after a pickle round trip, load the file into its own
# variable and call .info() on that.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float32       
dtypes: bool(1), category(2), datetime64[ns](1), float32(1), int16(1)
memory usage: 16.2 MB


# Parquet

In [25]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp38-cp38-win_amd64.whl (20.6 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-11.0.0


In [26]:
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet')

146 ms ± 5.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
34.4 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
# NOTE(review): this describes the original in-memory `df`, not a frame read
# back from test.parquet -- the %timeit cell above leaves `df` untouched. To
# check the dtypes after a Parquet round trip, load the file into its own
# variable and call .info() on that.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float32       
dtypes: bool(1), category(2), datetime64[ns](1), float32(1), int16(1)
memory usage: 16.2 MB


# Comparison

In [29]:
# NOTE(review): `ls` here is an IPython shell alias, not Python. The recorded
# listing starts with "Volume in drive C", i.e. on this Windows machine it ran
# `dir`, so the GNU-style -GFlash flags presumably had no useful effect (hence
# the repeated "Directory of" headers) -- confirm. Reading the three sizes with
# pathlib's Path(name).stat().st_size would be portable across platforms.
ls -GFlash test.csv test.pickle test.parquet

 Volume in drive C is OS
 Volume Serial Number is 4228-B55E

 Directory of C:\Users\liran\git\Python_pack_automotive\efficient_coding\read_write_dataframe


 Directory of C:\Users\liran\git\Python_pack_automotive\efficient_coding\read_write_dataframe


 Directory of C:\Users\liran\git\Python_pack_automotive\efficient_coding\read_write_dataframe


 Directory of C:\Users\liran\git\Python_pack_automotive\efficient_coding\read_write_dataframe

02/03/2023  02:28 PM        42,112,184 test.csv
02/03/2023  02:30 PM         7,393,840 test.parquet
02/03/2023  02:29 PM        17,001,850 test.pickle
               3 File(s)     66,507,874 bytes
               0 Dir(s)  107,487,334,400 bytes free
