## How to make pandas DataFrames smaller and faster

In [1]:
import pandas as pd

In [2]:
# Source CSV (shortened URL, so loading needs network access): alcohol servings per country.
drinks_path = 'http://bit.ly/drinksbycountry'
# Read the whole file and let pandas infer the dtype of every column.
drinks = pd.read_csv(filepath_or_buffer=drinks_path)

In [3]:
# Preview the first five rows to see which columns were loaded.
drinks.head(n=5)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


### Memory-usage functions and space-efficient datatypes

In [4]:
# The '+' in "memory usage: 9.2+ KB" marks the figure as a lower bound: for object
# columns pandas counts only the 8-byte references, not the Python strings they
# point to, so the real footprint is larger.
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [5]:
# memory_usage='deep' makes pandas inspect the contents of object columns, so the
# reported figure is the actual memory usage (no '+' suffix any more).
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [6]:
# Deep memory usage in bytes, one entry per column plus one for the index.
drinks.memory_usage(index=True, deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [7]:
# Total deep footprint of the frame in bytes (index included).
bytes_by_column = drinks.memory_usage(deep=True)
bytes_by_column.sum()

31224

In [8]:
# Integers take far less memory than strings. continent holds only a handful of
# distinct labels, which makes it a good candidate for the category dtype
# (each label stored once, plus an integer code per row).
sorted(drinks['continent'].unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [9]:
# Before the conversion: continent is stored as plain Python strings (dtype object).
drinks['continent'].head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

### datatype category

In [10]:
 # IMPORTANT (pandas) -- tags: category, change, datatype, data, prepare, pandas
 # Convert continent from object to category to cut its memory footprint
 # (about 12,000 bytes before the conversion).
 # The converted column replaces the original one in the same frame.
 drinks['continent'] = drinks['continent'].astype('category')

In [11]:
# Confirm the conversion: continent should now be listed with dtype category.
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [12]:
# The values look the same as before, but the dtype is now category and the
# distinct labels are listed once under "Categories".
drinks['continent'].head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [13]:
# Under the hood each row stores a small integer code that is the position of its
# label in the categories list (e.g. 1 = Asia).
drinks['continent'].cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [14]:
# Memory needed for continent: about 12,000 bytes before the conversion,
# 744 bytes after it.
drinks.memory_usage(index=True, deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [15]:
# Counter-example: apply the same conversion to country, a column whose values are all unique.
drinks['country'] = drinks['country'].astype('category')

In [16]:
# No memory is saved for country -- its usage actually grows compared with the
# object version, because every value is distinct: the column must keep every
# label as a category and additionally an integer code per row.
drinks.memory_usage(index=True, deep=True)

Index                             128
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [17]:
# Bottom line: convert a column to category only when it has few distinct values relative to its length; when (almost) every value is unique, as with country, the conversion does not help.
drinks['country'].cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

### best practice: reduce space while reading

In [18]:
# Deep memory report of the full frame after both conversions, as a baseline for
# comparison with a frame that is reduced at read time.
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   country                       193 non-null    category
 1   beer_servings                 193 non-null    int64   
 2   spirit_servings               193 non-null    int64   
 3   wine_servings                 193 non-null    int64   
 4   total_litres_of_pure_alcohol  193 non-null    float64 
 5   continent                     193 non-null    category
dtypes: category(2), float64(1), int64(3)
memory usage: 24.6 KB


In [19]:
# tags: dtypes, cols, prepare, read, smaller, faster, bestpractice, pandas

# Best practice: shrink the frame while reading instead of afterwards --
# load only the columns that are needed (usecols) and parse continent
# directly as the space-efficient category dtype (dtype).
dtypes = dict(continent='category')
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv(drinks_path, dtype=dtypes, usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB
