In [6]:
import numpy as np
import pandas as pd

df = pd.read_csv("http://bit.ly/drinksbycountry")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.1+ KB


In [8]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


In [9]:
df.memory_usage()

Index                             80
country                         1544
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                       1544
dtype: int64

In [11]:
df.memory_usage(deep=True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

### What if we are able to store our strings as integers, as int are more space efficient
Example continent have limited 6 values
What if we can give int values to individual continent

In [13]:
sorted(df.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [14]:
df['continent'] = df.continent.astype('category')

In [15]:
df.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [16]:
df.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [18]:
df.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [19]:
df.memory_usage(deep=True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [22]:
df['country'] = df.country.astype('category')

In [23]:
df.memory_usage(deep=True)

Index                              80
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [24]:
df.country.cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

### More Memory for country from 12k to 18k
Why?
as there were 193 strings, there is additional space taken by lookup

### Benefits of category?
It not just saves space, but also speeds up computation. Example group by will be fast

In [36]:
df1 = pd.DataFrame({'ID': [100,101,102,103],'quality':['good','very good','good','excellenet']})

In [38]:
df1

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellenet


In [37]:
# sort by category

df1.sort_values('quality')

Unnamed: 0,ID,quality
3,103,excellenet
0,100,good
2,102,good
1,101,very good


In [40]:
#Above is sorted by english and not by relevance
df1['quality'] = df1.quality.astype('category', categories = ['good','very good','excellent'],ordered=True)

  


In [41]:
df1.sort_values('quality')

Unnamed: 0,ID,quality
3,103,
0,100,good
2,102,good
1,101,very good
