In [14]:
import pandas as pd
# Zipped CSV of vehicle data hosted on GitHub; read_csv is handed the URL directly.
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# Parse with the pyarrow engine and keep the columns in pyarrow-backed dtypes.
df = pd.read_csv(url, engine='pyarrow', dtype_backend='pyarrow')


In [15]:
# Pull out the two columns the rest of the notebook works with.
make = df.make  # manufacturer names
city_mpg = df.city08  # the `city08` column, treated here as city MPG

In [16]:
# Display the raw make column (a string[pyarrow] Series).
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [17]:
# Row count per make, most frequent first.
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
Grumman Allied Industries              1
Environmental Rsch and Devp Corp       1
General Motors                         1
Goldacre                               1
Isis Imports Ltd                       1
Name: count, Length: 136, dtype: int64[pyarrow]

In [18]:
# Shape tuple (total number of entries) and number of distinct makes.
make.shape, make.nunique()

((41144,), 136)

In [19]:
# Unordered categorical version of the make column.
cat_make = make.astype('category')

In [20]:
# Bytes used by the pyarrow-backed string column.
make.memory_usage(deep=True)

425767

In [21]:
# Bytes used by the categorical column, for comparison with the string column.
cat_make.memory_usage(deep=True)

88701

In [22]:
%%timeit
# Timed: upper-casing through the .str accessor of the categorical column.
cat_make.str.upper()

334 μs ± 6.73 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [23]:
%%timeit
# Timed: the same upper-casing on the pyarrow-backed string column.
make.str.upper()

671 μs ± 87.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [24]:
# Cast to Python `str` for a third timing baseline.
# NOTE(review): presumably this yields a classic object-dtype column -- confirm.
old_make = make.astype(str)

In [25]:
%%timeit
# Timed: the same upper-casing on the `astype(str)` column.
old_make.str.upper()

3.79 ms ± 678 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
# One label per bin, listed from the lowest bin to the highest.
labels = ["Very Low", "Low", "Lower-Mid", "Mid", "Upper-Mid", "High", "Higher", "Very High", "Extreme", "Ultra"]
# Cut city MPG into 10 bins and count rows per bin; sort=False keeps the
# bins in label order instead of ordering by count.
pd.cut(city_mpg, 10, labels=labels).value_counts(sort=False)

city08
Very Low     30872
Low           9667
Lower-Mid      367
Mid             54
Upper-Mid       11
High            48
Higher          32
Very High       26
Extreme         55
Ultra           12
Name: count, dtype: int64

In [27]:
# Build an ordered categorical dtype whose categories are the distinct makes,
# ranked by Python's default string sort (so lowercase 'smart' ends up last).
make_type = pd.CategoricalDtype(
    categories=sorted(make.unique()), ordered=True
)

In [28]:
# Apply the ordered dtype to the make column and display the result.
ordered_make = make.astype(make_type)
ordered_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [29]:
# Sort the plain string column, for comparison with sorting the ordered categorical.
make.sort_values()

358      AM General
369      AM General
19314    AM General
19316    AM General
20288    AM General
            ...    
32557         smart
32757         smart
32758         smart
34309         smart
34310         smart
Name: make, Length: 41144, dtype: string[pyarrow]

In [30]:
# Alternative in a single chain: convert to categorical, then mark the
# existing categories as ordered.
(make
 .astype('category')
 .cat.as_ordered()
 )

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, string[pyarrow]): [AM General < ASC Incorporated < Acura < Alfa Romeo ... Volvo < Wallace Environmental < Yugo < smart]

In [31]:
# max() is defined here because the categorical is ordered; it returns the last category.
ordered_make.max()

'smart'

In [32]:
# cat_make.max()
# Raises an error: cat_make is an unordered categorical, so it has no defined maximum.

In [33]:
# Sorting an ordered categorical follows the category order.
ordered_make.sort_values()

20288    AM General
20289    AM General
369      AM General
358      AM General
19314    AM General
            ...    
31289         smart
31290         smart
29605         smart
22974         smart
26882         smart
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [34]:
%%timeit
# Timed: rename every category to lowercase by passing a list of new names,
# one per existing category.
cat_make.cat.rename_categories(
    [c.lower() for c in cat_make.cat.categories]
)

114 μs ± 4.14 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [35]:
%%timeit
# Timed: the same lowercase rename, expressed as an {old: new} mapping.
ordered_make.cat.rename_categories(
    {c: c.lower() for c in ordered_make.cat.categories}
)

67.7 μs ± 2.11 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [36]:
%%timeit
# Timed: the same lowercase rename, expressed as a callable applied to each category.
ordered_make.cat.rename_categories(
    lambda c: c.lower()
)

55.6 μs ± 156 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [37]:
# Re-rank the categories case-insensitively (e.g. 'AM General' now sorts after
# 'Alfa Romeo'). The new order is derived from ordered_make's own categories:
# reorder_categories needs exactly the same set of categories it already has,
# so borrowing the list from a different series would be fragile.
ordered_make.cat.reorder_categories(
    sorted(ordered_make.cat.categories, key=str.lower)
)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['Acura' < 'Alfa Romeo' < 'AM General' < 'American Motors Corporation' ... 'Volvo' < 'VPG' < 'Wallace Environmental' < 'Yugo']

In [38]:
# Counts over the first 100 rows only; categories that do not occur in that
# slice are still listed, with a count of 0.
ordered_make.head(100).value_counts()

make
Dodge                        17
Oldsmobile                    8
Ford                          8
Buick                         7
Chevrolet                     5
                             ..
Grumman Allied Industries     0
Goldacre                      0
Geo                           0
Genesis                       0
smart                         0
Name: count, Length: 136, dtype: int64

In [39]:
# Group the first 100 rows by their categorical make. observed=False keeps a
# group for every category, so makes absent from the slice come back as <NA>.
(cat_make
 .head(100)
 .groupby(cat_make.head(100),observed=False)
 .first())

make
AM General                           <NA>
ASC Incorporated                     <NA>
Acura                                <NA>
Alfa Romeo                     Alfa Romeo
American Motors Corporation          <NA>
                                  ...    
Volkswagen                     Volkswagen
Volvo                               Volvo
Wallace Environmental                <NA>
Yugo                                 <NA>
smart                                <NA>
Name: make, Length: 136, dtype: category
Categories (136, string[pyarrow]): [AM General, ASC Incorporated, Acura, Alfa Romeo, ..., Volvo, Wallace Environmental, Yugo, smart]

In [40]:
# Same grouping on the plain string column: only makes that actually occur in
# the first 100 rows produce a group.
(make
 .head(100)
 .groupby(make.head(100))
 .first())

make
Alfa Romeo          Alfa Romeo
Audi                      Audi
BMW                        BMW
Buick                    Buick
CX Automotive    CX Automotive
Cadillac              Cadillac
Chevrolet            Chevrolet
Chrysler              Chrysler
Dodge                    Dodge
Ferrari                Ferrari
Ford                      Ford
Hyundai                Hyundai
Infiniti              Infiniti
Lexus                    Lexus
Mazda                    Mazda
Mercury                Mercury
Nissan                  Nissan
Oldsmobile          Oldsmobile
Plymouth              Plymouth
Pontiac                Pontiac
Rolls-Royce        Rolls-Royce
Subaru                  Subaru
Toyota                  Toyota
Volkswagen          Volkswagen
Volvo                    Volvo
Name: make, dtype: string[pyarrow]

In [41]:
# A scalar position returns the plain value.
ordered_make.iloc[0]

'Alfa Romeo'

In [42]:
# A list of positions returns a Series, which keeps the categorical dtype.
ordered_make.iloc[[0]]

0    Alfa Romeo
Name: make, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [43]:
# Small standalone example: a Series of makes indexed by year.
s = pd.Series(["BMW", "Ford", "BMW", "Toyota"], index=[2000, 2001, 2002, 2003])

# Group the Series by its own values and take the first entry of each group.
(s.groupby(s, observed=True)
.first()
 )

BMW          BMW
Ford        Ford
Toyota    Toyota
dtype: object

In [44]:
def generalise_topn(ser, n=5, other='Other'):
    """Keep the ``n`` most frequent values of ``ser`` and relabel the rest.

    Parameters
    ----------
    ser : pd.Series
        Series to generalise; may be categorical or not.
    n : int, default 5
        How many of the most frequent values (by ``value_counts()``) to keep.
    other : scalar, default 'Other'
        Replacement label for every value outside the top ``n``.

    Returns
    -------
    pd.Series
        ``ser`` with values outside the top ``n`` replaced by ``other``.
    """
    top_values = ser.value_counts().index[:n]
    is_categorical = isinstance(ser.dtype, pd.CategoricalDtype)
    if is_categorical:
        # A categorical can only hold declared categories, so narrow the
        # categories to the kept values plus the replacement label first.
        new_categories = top_values.set_categories([*top_values, other])
        ser = ser.cat.set_categories(new_categories)
    keep_mask = ser.isin(top_values)
    return ser.where(keep_mask, other)

In [45]:
# The five most frequent makes: value_counts() sorts by count, so slicing its
# index gives the top entries.
make.value_counts().index[:5]

Index(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'], dtype='string[pyarrow]', name='make')

In [46]:
# Every make, ordered from most to least frequent.
make.value_counts().index

Index(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota', 'BMW', 'Mercedes-Benz',
       'Nissan', 'Volkswagen', 'Mitsubishi',
       ...
       'Karma', 'Koenigsegg', 'Aurora Cars Ltd', 'RUF Automobile',
       'JBA Motorcars, Inc.', 'Grumman Allied Industries',
       'Environmental Rsch and Devp Corp', 'General Motors', 'Goldacre',
       'Isis Imports Ltd'],
      dtype='string[pyarrow]', name='make', length=136)

In [47]:
# Keep the 20 most common makes; everything else gets the literal string
# label 'NA' (an ordinary category here, not a missing value).
cat_make.pipe(generalise_topn, n=20, other='NA')

0            NA
1            NA
2         Dodge
3         Dodge
4        Subaru
          ...  
41139    Subaru
41140    Subaru
41141    Subaru
41142    Subaru
41143    Subaru
Name: make, Length: 41144, dtype: category
Categories (21, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Volvo', 'Hyundai', 'Chrysler', 'NA']

In [48]:
def generalise_topn(ser, n=5, other='Other'):
    """Collapse all but the ``n`` most frequent values of ``ser`` into ``other``.

    ``ser`` may be a categorical or a non-categorical Series. The most
    frequent values are taken from ``ser.value_counts()``. Returns a Series
    in which every value outside that top ``n`` is replaced by ``other``.
    """
    keep = ser.value_counts().index[:n]
    if not isinstance(ser.dtype, pd.CategoricalDtype):
        return ser.where(ser.isin(keep), other)
    # Categorical input: restrict the categories to the kept values plus the
    # replacement label before substituting.
    widened = keep.set_categories(list(keep) + [other])
    recoded = ser.cat.set_categories(widened)
    return recoded.where(recoded.isin(keep), other)

In [49]:
def generalise_topn(ser, n=5, other='Other'):
    """Keep the ``n`` most frequent values of ``ser`` and relabel the rest.

    Parameters
    ----------
    ser : pd.Series
        Series to generalise; may be categorical or not.
    n : int, default 5
        How many of the most frequent values (by ``value_counts()``) to keep.
    other : scalar, default 'Other'
        Replacement label for every value outside the top ``n``.

    Returns
    -------
    pd.Series
        ``ser`` with values outside the top ``n`` replaced by ``other``.
    """
    # Slice by ``n`` (not a fixed 5) so the parameter is actually honoured.
    topn = ser.value_counts().index[:n]
    if isinstance(ser.dtype, pd.CategoricalDtype):
        # A categorical can only hold declared categories, so narrow the
        # categories to the kept values plus the replacement label first.
        ser = ser.cat.set_categories(
            topn.set_categories(list(topn) + [other])
        )
    return ser.where(ser.isin(topn), other)

# Keep the 5 most common makes; everything else gets the literal string
# label 'NA' (an ordinary category here, not a missing value).
cat_make.pipe(generalise_topn, n=5, other='NA')

0           NA
1           NA
2        Dodge
3        Dodge
4           NA
         ...  
41139       NA
41140       NA
41141       NA
41142       NA
41143       NA
Name: make, Length: 41144, dtype: category
Categories (6, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota', 'NA']

In [53]:
def generalise_mapping(ser, mapping, default):
    """Relabel values of ``ser`` by substring match, with a fallback label.

    Parameters
    ----------
    ser : pd.Series
        String-like Series (it must support the ``.str`` accessor).
    mapping : dict
        ``{pattern: new_label}``. Each pattern is handed to
        ``Series.str.contains`` with its default settings, so it is matched
        anywhere in the value. When several patterns match the same row, the
        entry that comes last in ``mapping`` wins.
    default : scalar
        Label for rows that match none of the patterns. This includes missing
        values, and every row when ``mapping`` is empty.

    Returns
    -------
    pd.Series
        Categorical Series of the new labels, on the same index as ``ser``.
    """
    # Start from "nothing matched" so an empty mapping still yields a valid
    # mask rather than leaving the accumulator undefined.
    seen = pd.Series(False, index=ser.index)
    res = ser.astype(str)
    for old, new in mapping.items():
        # na=False makes missing values count as "no match", which keeps the
        # mask strictly boolean and safe to invert.
        mask = ser.str.contains(old, na=False).astype(bool)
        # Build a new accumulator each time instead of OR-ing in place, so no
        # per-pattern mask is mutated through an alias.
        seen = seen | mask
        res = res.where(~mask, new)
    res = res.where(seen, default)
    return res.astype('category')





# Map a handful of makes to a region label; any make that matches none of the
# keys falls back to 'Other'.
generalise_mapping(cat_make, {'Ford': 'US', 'Tesla': 'US',
                              'Chevrolet': 'US', 'Dodge': 'US',
                              'Oldsmobile': 'US', 'Plymouth': 'US',
                              'BMW': 'German'}, 'Other')

0        Other
1        Other
2           US
3           US
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (3, object): ['German', 'Other', 'US']

In [51]:
# Same call as the cell above (identical arguments and output): makes are
# mapped to a region label, and unmatched makes fall back to 'Other'.
generalise_mapping(cat_make, {'Ford': 'US', 'Tesla': 'US',
                              'Chevrolet': 'US', 'Dodge': 'US',
                              'Oldsmobile': 'US', 'Plymouth': 'US',
                              'BMW': 'German'}, 'Other')

0        Other
1        Other
2           US
3           US
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (3, object): ['German', 'Other', 'US']

In [52]:
# Quick check of what the `|=` operator does on plain booleans.
x = True
y = False

# Augmented OR: equivalent to x = x | y, so x stays True here.
x |= y
x

True

In [None]:
# Convert a text column into a categorical column. How much memory did you save?

categorical_make = make.astype('category')

# Measure both representations in bytes so the saving is computed here rather
# than read off by eye from two separate outputs.
text_bytes = make.memory_usage(deep=True)
categorical_bytes = categorical_make.memory_usage(deep=True)

print(f'Text column byte usage: {text_bytes}')
print(f'Categorical column byte usage: {categorical_bytes}')
print(f'Categorical conversion saves: {text_bytes - categorical_bytes} bytes')

# Timings previously noted in this cell (what was being timed is not recorded
# here -- TODO confirm or remove):
# 5.96 ns ± 0.195 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)
# vs
# 6.13 ns ± 0.255 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)


In [81]:
# Convert a numeric column into a categorical column by binning it (pd.cut). How much memory did you save?

# NOTE: the `*_bites` variable names are left unchanged in case other cells
# read them; the values they hold are byte counts.
num_bites = city_mpg.memory_usage(deep=True)
cat_bites = pd.cut(city_mpg, 10).memory_usage(deep=True)

# memory_usage reports bytes, so the difference is memory saved, not speed.
print(f'Numeric column byte usage: {num_bites}')
print(f'Binned column byte usage: {cat_bites}')
print(f'pd.cut resulting in: {num_bites - cat_bites} bytes saved')

Numeric column bite usage: 329284
Binned column bite usage: 41736
pd.cut resulting in: 287548 bites faster


In [84]:
# Use the generalise_topn function to limit the number of categories in your column. How much memory did you save?

# NOTE: the `*_speed` variable names are left unchanged in case other cells
# read them; the values they hold are byte counts, not timings.
cat_make = make.astype('category')
cat_speed = cat_make.memory_usage(deep=True)

gen_speed = cat_make.pipe(generalise_topn, n=5, other='Other').memory_usage(deep=True)

# Label the saving with the operation actually used in this cell
# (generalise_topn), and report it as bytes saved.
print(f'Categorical column byte usage: {cat_speed}')
print(f'Generalised column byte usage: {gen_speed}')
print(f'generalise_topn resulting in: {cat_speed - gen_speed} bytes saved')

Categorical column bite usage: 88701
Generalised columns bite usage: 41774
pd.cut resulting in: 46927 bites faster
