In [13]:
import pandas as pd

# Zipped vehicles CSV, fetched over HTTP on every run.
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'

# Parse with the pyarrow engine and keep every column in an Arrow-backed dtype.
df = pd.read_csv(url, engine='pyarrow', dtype_backend='pyarrow')

# Individual columns that the cells below work with.
city_mpg, highway_mpg, oil = df['city08'], df['highway08'], df['barrels08']

In [5]:
# Narrow the column to 16-bit Arrow integers; every value fits, so the cast succeeds.
(city_mpg
 .astype('int16[pyarrow]')
)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int16[pyarrow]

In [7]:
# int8 only spans -128..127 and the column holds larger values (e.g. 132),
# so this cast is rejected. The failure is the point of the cell: catch it and
# show the message so the notebook still runs top to bottom.
# pyarrow's ArrowInvalid derives from ValueError, so no extra import is needed.
try:
    city_mpg.astype('int8[pyarrow]')
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ArrowInvalid: Integer value 132 not in range: -128 to 127

In [9]:
import numpy as np

# Smallest and largest values a signed 64-bit integer can hold.
np.iinfo(np.int64)

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

In [11]:
# Range of an unsigned 8-bit integer.
np.iinfo(np.uint8)

iinfo(min=0, max=255, dtype=uint8)

In [15]:
# Limits and decimal resolution of a 32-bit float.
np.finfo(np.float32)

finfo(resolution=1e-06, min=-3.4028235e+38, max=3.4028235e+38, dtype=float32)

In [17]:
# Bytes used by the column's values; the result matches 41144 rows x 8 bytes (64-bit ints).
city_mpg.nbytes

329152

In [19]:
# 'Int16' (capital I) is pandas' own nullable 16-bit integer type, not the Arrow one.
# The result works out to 3 bytes per row, consistent with 2 data bytes plus a
# 1-byte missing-value mask.
(city_mpg
 .astype('Int16')
 .nbytes
)

123432

In [23]:
# Arrow-backed string column holding the vehicle make.
make = df['make']

# Bytes used by the string data alone (no index).
make.nbytes

425635

In [25]:
# memory_usage also counts the index, which is why it is slightly larger than .nbytes.
# The keyword values shown are the method's defaults, written out for clarity.
make.memory_usage(index=True, deep=False)

425767

In [27]:
# deep=True asks pandas to inspect object contents as well; for this Arrow-backed
# column the figure is the same as the shallow measurement.
make.memory_usage(index=True, deep=True)

425767

In [29]:
# After converting with astype(str), the shallow figure works out to 8 bytes per row
# plus the index, i.e. it does not include the text of the strings themselves.
(make
 .astype(str)
 .memory_usage(index=True, deep=False)
)

329284

In [31]:
# deep=True additionally measures the string objects themselves, which reveals
# the real footprint of the astype(str) version of the column.
(make
 .astype(str)
 .memory_usage(index=True, deep=True)
)

2277247

In [39]:
# Categorical encoding keeps each distinct make once and stores a small integer
# code per row, so the deep footprint drops sharply for this repetitive column.
(make
 .astype('category')
 .memory_usage(index=True, deep=True)
)

88701

In [43]:
# Treat each distinct mileage value as a category, then flag the categories as
# ordered so they compare in ascending order (6 < 7 < ... < 150 in the output).
(city_mpg
 .astype('category')
 .cat.as_ordered()
)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64[pyarrow]): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

In [53]:
# Build the category list by hand: the distinct values of the column, sorted ascending.
# The resulting categories are NumPy int64 rather than int64[pyarrow] (see the
# Categories line in the output) - presumably because set() iterates plain Python ints.
values = pd.Series(sorted(set(city_mpg)))
# Reusable dtype object; ordered=True makes the categories compare in the order listed.
city_type = pd.CategoricalDtype(categories=values, ordered=True)
# Apply the explicit dtype to the column.
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

In [55]:
# Shortcut route to an ordered categorical: let pandas infer the categories,
# then mark them as ordered. The categories keep the column's Arrow integer dtype.
(city_mpg
 .astype('category')
 .cat.as_ordered()
)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64[pyarrow]): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

In [9]:
# Wrap the Series in a one-column DataFrame; the column is named after the Series (city08).
city_mpg.to_frame()

Unnamed: 0,city08
0,19
1,9
2,23
3,10
4,17
...,...
41139,19
41140,20
41141,18
41142,18


In [19]:
# Exercises
# Convert a numeric column to a smaller type
# oil.isna().any()
# NOTE(review): the result shown below is still double[pyarrow], so this call does
# not shrink the column. An explicit cast such as oil.astype('float32[pyarrow]')
# looks like what the exercise asks for - TODO confirm the intended answer.
oil.convert_dtypes(dtype_backend='pyarrow')

0        15.695714
1        29.964545
2        12.207778
3        29.964545
4        17.347895
           ...    
41139    14.982273
41140     14.33087
41141    15.695714
41142    15.695714
41143    18.311667
Name: barrels08, Length: 41144, dtype: double[pyarrow]