# Chapter 8: Conversion Methods

In [1]:
import pandas as pd
import numpy as np

# Vehicle dataset, stored as a zipped CSV on GitHub; read_csv loads it
# straight from the URL. HTTPS is used so the file is not requested over
# plaintext HTTP.
url = "https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip"
df = pd.read_csv(url)
# Series reused by the examples in the rest of the chapter.
city_mpg = df.city08
highway_mpg = df.highway08

  df = pd.read_csv(url)


## 8.1 Automatic Conversion

- Using the correct type can save a significant amount of memory.
- The default numeric type is 8 bytes wide (64 bits, i.e. int64 or float64).
- If we can use a narrower type, we cut memory usage, leaving more memory to process more data.

In [2]:
# convert_dtypes() switches the series to a nullable extension type:
# int64 becomes Int64 (capital "I"), as the dtype in the output shows
city_mpg.convert_dtypes()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int64

In [3]:
# use .astype to change from int64 to the narrower nullable Int16
city_mpg.astype('Int16')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int16

In [7]:
# numpy's iinfo reports the representable range of an integer dtype
np.iinfo(np.int64)

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

In [8]:
# unsigned 8-bit integers hold only 0 through 255
np.iinfo(np.uint8)

iinfo(min=0, max=255, dtype=uint8)

In [10]:
# finfo is the floating-point counterpart: limits and resolution of float16
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [11]:
# limits and resolution of the default 64-bit float
np.finfo(np.float64)

finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)

## 8.2 Memory Usage

- ``.nbytes`` only shows how much memory the data of the pandas object is taking
- The ``make`` of the autos has strings and is stored as an object. To get the amount of memory that includes the strings, we use the ``.memory_usage`` method with ``deep=True``
- ``.nbytes``: memory that the data is using, but not the ancillary parts of the series
- ``.memory_usage`` includes the index memory and, with ``deep=True``, the contribution from object types

In [12]:
# memory used by the data with the default int64 dtype (8 bytes per value)
city_mpg.nbytes

329152

In [13]:
# memory used after converting to Int16: 3 bytes per row
# (a 2-byte value plus a 1-byte missing-value mask)
city_mpg.astype('Int16').nbytes

123432

In [14]:
# make holds strings, so its dtype is object; .nbytes counts only the
# 8-byte references to the strings, not the strings themselves
make = df.make
make.nbytes

329152

In [16]:
# default memory_usage() adds the index on top of .nbytes (128 bytes here)
make.memory_usage()

329280

In [17]:
# deep=True also counts the memory of the Python string objects
make.memory_usage(deep=True)

2606395

In [18]:
# a categorical stores each distinct make once plus small integer codes,
# which cuts memory sharply compared with the object series
make.astype('category').memory_usage(deep=True)

95888

## 8.3 String and Category Types

In [19]:
# convert the numeric series to strings (pandas reports the dtype as object)
city_mpg.astype(str)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: object

In [20]:
# convert the numeric series to an (unordered) category; the distinct
# values become the categories
city_mpg.astype('category')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6, 7, 8, 9, ..., 137, 138, 140, 150]

## 8.4 Ordered Categories

In [21]:
# sorted distinct city MPG values, used as the category levels below
values = pd.Series(np.sort(city_mpg.unique()))

In [23]:
# ordered categorical dtype whose levels follow the order of `values`
city_type = pd.CategoricalDtype(categories=values, ordered=True)

In [26]:
# cast using the ordered dtype; the output lists the categories joined by "<"
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]