# First Notebook

In [None]:
import pandas as pd
import numpy as np
import fastparquet


## Create fake dataset

In [None]:
def get_dataset(size, seed=None):
    """Build a random demo DataFrame with ``size`` rows.

    Columns, in order:
      * ``size`` - one of 'big', 'medium', 'small'
      * ``age``  - integer drawn from [1, 50)
      * ``team`` - one of 'red', 'blue', 'yellow', 'green'
      * ``win``  - 'yes' or 'no'
      * ``date`` - a date between 2020-01-01 and 2022-12-31
      * ``prob`` - float drawn uniformly from [0, 1)

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)`` so the same seed always yields the
        same frame. When omitted, values come from NumPy's global random
        state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Frame with ``size`` rows and the six columns listed above.
    """
    # The legacy RandomState exposes the same choice/randint/uniform methods as
    # the np.random module, so both paths share the code below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    # Dict entries are evaluated top to bottom, which keeps the draw order
    # (size, age, team, win, date, prob) fixed for a given random state.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.randint(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })

In [None]:
def set_dtypes(df):
    """Convert the demo frame's columns to compact dtypes.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and the 'yes'/'no' labels in ``win`` become
    ``True``/``False``.

    The frame is modified in place and the same object is returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work. Calling it again on an
    already-converted frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Only map while the column still holds the 'yes'/'no' labels: mapping an
    # already-boolean column through this dict would turn every value into NaN
    # (True/False are not keys), e.g. when a cell is re-run.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df

In [5]:
# Small sample first: build 10,000 rows and inspect the column dtypes
# before any conversion is applied.
df = get_dataset(size=10_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   size    10000 non-null  object        
 1   age     10000 non-null  int32         
 2   team    10000 non-null  object        
 3   win     10000 non-null  object        
 4   date    10000 non-null  datetime64[ns]
 5   prob    10000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(1), object(3)
memory usage: 429.8+ KB


In [None]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

## CSV

In [None]:
print('Reading and writing CSV')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_csv('test.csv',index=False)
%time df_csv = pd.read_csv('test.csv')

In [None]:
# Shell escape: long listing of the working directory with human-readable sizes.
# NOTE(review): the command goes through `wsl`, so it presumably only works on
# Windows with WSL installed — confirm before running elsewhere.
!wsl ls -lh

In [None]:
%%timeit
print('Reading and writing Pickle')
df = get_dataset(5_000_000)
df = set_dtypes(df)
df.to_pickle('test.pickle')
df_pickle = pd.read_pickle('test.pickle')


In [None]:
# Repeated timing of the pickle write alone, using the `df` already in the kernel.
%timeit df.to_pickle('test.pickle')

In [None]:
# Single-shot timing (%time) of reading 'test.pickle' back into a DataFrame.
%time df_pickle = pd.read_pickle('test.pickle')

In [None]:
# Run once if fastparquet is not installed: %conda install fastparquet

In [6]:

# Fresh 5M-row frame with compact dtypes for the benchmark cells that follow.
df = set_dtypes(get_dataset(5_000_000))

Reading and writing Parquet
Wall time: 933 ms
Wall time: 278 ms


In [12]:
# Parquet benchmark on the `df` built in an earlier cell: repeated timing of
# the write, then of the full read.
# NOTE(review): no `engine=` is passed, so pandas picks the Parquet backend
# itself — confirm that the imported fastparquet is what is being measured.
print('Reading and writing Parquet')
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet')

Reading and writing Parquet
751 ms ± 53.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
187 ms ± 7.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Read in specific columns

In [15]:
# Time a read that requests only the 'date' and 'win' columns of 'test.parquet'.
%timeit pd.read_parquet('test.parquet',columns=['date','win'])

106 ms ± 4.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# Fresh 5M-row frame with compact dtypes for the benchmark cells that follow.
df = set_dtypes(get_dataset(5_000_000))

Reading and writing Feather


In [13]:
# Feather benchmark on the `df` built in an earlier cell: repeated timing of
# the write, then of the full read.
print('Reading and writing Feather')
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather')

Reading and writing Feather
177 ms ± 7.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
112 ms ± 9.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# Time a read that requests only the 'date' and 'win' columns of 'test.feather'.
%timeit pd.read_feather('test.feather',columns=['date','win'])

UsageError: %%timeit is a cell magic, but the cell body is empty. Did you mean the line magic %timeit (single %)?
