### pandas knowledge sharing - Aug 2022

In [15]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
import pathlib

Examples to help with large datasets:
- Chunking - an approach for handling datasets larger than you can read
- Sampling - pre-requisite to data exploration & modelling
- Parquet files - good for 'big data' analytics (reduced storage and faster than csv)
- Normalise column names - columns converted to lowercase with spaces converted to _

### Chunking

In [16]:
# Create a very large file
csv_filename = pathlib.Path("data", "largedatafile.csv")

In [17]:
X, y = make_blobs(n_samples=10_000_000)
np.savetxt(csv_filename, X, delimiter=',')

In [18]:
!ls ./data -al

total 516308
drwxr-xr-x 5 root root      4096 Aug  5 12:08 .
drwxr-xr-x 6 root root      4096 Aug  5 14:08 ..
drwxr-xr-x 2 root root      4096 Aug  5 12:08 .ipynb_checkpoints
drwxr-xr-x 2 root root      4096 Aug  4 13:53 2020DemocratConventionSpeeches
drwxr-xr-x 2 root root      4096 Aug  4 13:46 2020RepublicanConventionSpeeches
-rw-r--r-- 1 root root  15370240 Aug  4 14:47 en_core_web_sm-3.4.0.tar
-rw-r--r-- 1 root root 513304627 Aug  5 14:08 largedatafile.csv


In [19]:
# read the file in chunks

In [20]:
%%timeit
chunks = pd.read_csv(csv_filename, names=['col1', 'col2'], chunksize=1_000_000)
for chunk in chunks:
  chunk['product'] = chunk.col1 * chunk.col2

4.44 s ± 41.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# compare to the traditional method to read the whole file

In [22]:
%%timeit
df = pd.read_csv(csv_filename, names=['col1', 'col2'])
df['product'] = df.col1 * df.col2

4.46 s ± 51.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Sampling

In [24]:
ROWS = 1000
df = pd.read_csv(csv_filename, 
                 names=['col1', 'col2'], 
                 skiprows=lambda i: i % ROWS != 0)
df.shape

(10000, 2)

### Parquet

In [25]:
# Create filenames
wide_csv_filename = pathlib.Path("data", "wide.csv")
wide_parquet_filename= pathlib.Path("data", "wide.parquet")

In [26]:
X, y = make_blobs(n_samples=1_000_000, n_features=20)
df = pd.DataFrame(X)
df.columns = [f'Col {x}' for x in range(len(df.columns))]
df.to_csv(wide_csv_filename)
df.to_parquet(wide_parquet_filename)

In [27]:
!ls ./data -al

total 1048808
drwxr-xr-x 5 root root      4096 Aug  5 14:10 .
drwxr-xr-x 6 root root      4096 Aug  5 14:09 ..
drwxr-xr-x 2 root root      4096 Aug  5 12:08 .ipynb_checkpoints
drwxr-xr-x 2 root root      4096 Aug  4 13:53 2020DemocratConventionSpeeches
drwxr-xr-x 2 root root      4096 Aug  4 13:46 2020RepublicanConventionSpeeches
-rw-r--r-- 1 root root  15370240 Aug  4 14:47 en_core_web_sm-3.4.0.tar
-rw-r--r-- 1 root root 513304627 Aug  5 14:08 largedatafile.csv
-rw-r--r-- 1 root root 379670062 Aug  5 14:10 wide.csv
-rw-r--r-- 1 root root 165609437 Aug  5 14:10 wide.parquet


#### Timing reading data

In [28]:
%%timeit
df = pd.read_csv(wide_csv_filename)

5.04 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%timeit
df = pd.read_parquet(wide_parquet_filename)

191 ms ± 30.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Timing read one column

In [31]:
%%timeit
df = pd.read_csv(wide_csv_filename, usecols=['Col 4'])

1.96 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%%timeit
df = pd.read_parquet(wide_parquet_filename, columns=['Col 4'])

13 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Normalise column names

In [33]:
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

In [34]:
df.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14',
       'col_15', 'col_16', 'col_17', 'col_18', 'col_19'],
      dtype='object')