# Chapter 15: Categorical Manipulation

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

In [2]:
# Load the vehicles dataset (a zipped CSV) directly from GitHub over https
# rather than plain http.
url = "https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip"
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; this avoids the warning the default settings
# emit for this file (presumably the mixed-type DtypeWarning).
df = pd.read_csv(url, low_memory=False)
# Short names for the columns used in the rest of the chapter.
city_mpg = df.city08
highway_mpg = df.highway08
make = df.make

  df = pd.read_csv(url)


## 15.2 Frequency Counts

In [3]:
# Frequency of each make, most common first.
make.value_counts()

Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: make, Length: 136, dtype: int64

In [4]:
# Number of rows versus number of distinct makes.
make.shape, make.nunique()

((41144,), 136)

## 15.3 Benefits of Categories

- Categorical values use less memory
- Categorical computations can be faster for many operations

In [5]:
# Convert the string column to pandas' categorical dtype.
cat_make = make.astype('category')

In [6]:
# Bytes used by the original Series; deep=True also counts the Python
# string objects it holds rather than just the pointers to them.
make.memory_usage(deep=True)

2606395

In [7]:
# Same measurement for the categorical version, for comparison.
cat_make.memory_usage(deep=True)

95888

In [8]:
%%timeit
# Time upper-casing through .str on the categorical Series.
cat_make.str.upper()

460 µs ± 19.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit
# Time the same operation on the original (non-categorical) Series.
make.str.upper()

11.5 ms ± 320 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 15.4 Conversion to Ordinal Categories

- A benefit of ordinal categoricals is that we can specify an explicit order for the items (for example, lexical order)
- If the items have an order, we can use reducing operations like maximum and minimum

In [12]:
# Ordered categorical dtype whose categories are the distinct makes in
# Python's default string sort order. That sort is case-sensitive, so a
# lower-case name such as 'smart' ends up after every capitalised make.
make_type = pd.CategoricalDtype(sorted(make.unique()), ordered=True)

In [13]:
# Cast the makes to the ordered categorical dtype defined in make_type.
ordered_make = make.astype(make_type)

In [14]:
# Display the result; the '<' between categories marks the dtype as ordered.
ordered_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [15]:
# Reducing operations are available because the categories are ordered;
# .max() returns the category that comes last in that order.
ordered_make.max()

'smart'

In [16]:
# Sort the rows by the categories' declared order.
ordered_make.sort_values()

20288    AM General
20289    AM General
369      AM General
358      AM General
19314    AM General
            ...    
31289         smart
31290         smart
29605         smart
22974         smart
26882         smart
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

## 15.5 The .cat Accessor

In [17]:
# Display the unordered categorical (categories are comma-separated, no '<').
cat_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

In [18]:
# Lower-case every category label. rename_categories calls the function on
# each existing category; the rows keep their positions and only the labels
# change. The result is a new Series; cat_make itself is not reassigned.
cat_make.cat.rename_categories(str.lower)

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general', 'asc incorporated', 'acura', 'alfa romeo', ..., 'volvo', 'wallace environmental', 'yugo', 'smart']

In [20]:
# Re-declare the category order case-insensitively (key=str.lower), so that
# for example 'AM General' now sorts after 'Alfa Romeo'. The categories are
# read from ordered_make itself, so this cell does not rely on cat_make
# happening to hold exactly the same set of categories.
ordered_make.cat.reorder_categories(
    sorted(ordered_make.cat.categories, key=str.lower)
)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['Acura' < 'Alfa Romeo' < 'AM General' < 'American Motors Corporation' ... 'Volvo' < 'VPG' < 'Wallace Environmental' < 'Yugo']