In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype

In [23]:
# Source CSV of alcohol servings per country (short link, needs network access).
DRINKS_URL = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(DRINKS_URL)

In [3]:
# Peek at the first five rows to see which columns the dataset has.
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [4]:
# Summarize column dtypes, non-null counts and the (shallow) memory usage.
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


### A pandas Series with the `object` dtype can hold arbitrary Python objects; pandas itself only stores a reference to each object

### The `+` after the memory usage means "at least": to stay fast, `info()` by default counts only the references and does not measure the objects they point to

In [6]:
# Ask pandas to inspect the objects behind the object-dtype columns so that
# their size is included in the reported memory usage (no trailing '+').
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [12]:
# Memory footprint of each column (and the index), in bytes.
# deep=True also counts the Python objects referenced by object-dtype columns.
drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

# Integer columns are far more space-efficient than string (object) columns

In [24]:
# Convert the continent column (only a handful of distinct strings) to the
# category dtype.
drinks['continent'] = drinks['continent'].astype('category')

In [14]:
# Confirm that continent now has the category dtype.
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [15]:
# The values still display as strings; internally a categorical column stores
# small integer codes that point into its list of categories (see .cat.codes).
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [33]:
# Show the integer code stored for each row; a code is the position of the
# row's value in the categories list.
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [17]:
# Re-measure after the conversion: each distinct continent string is now stored
# once in a lookup table and every row holds only an integer code pointing to
# it, so the column takes far less space.
drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [25]:
# Try the same conversion on country, a column whose values are all distinct.
drinks['country'] = drinks['country'].astype('category')

In [28]:
# Re-measure now that country is categorical as well.
drinks.memory_usage(deep=True)

Index                             128
country                         17142
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [36]:
# List the distinct categories (the lookup table) of the country column.
drinks.country.cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

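# Why has the memory used by the `country` column increased?
<p>Because every country is a different string: there are 193 categories for 193 rows, so pandas has to store all 193 strings in the lookup table <em>and</em> an integer code for every row that points into it.</p>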

<p>Solution: use the category dtype only when an object column contains few distinct strings relative to its number of rows.</p>
 

In [37]:
# Small toy frame: four restaurants, each with a star rating spelled out as a word.
star_ratings = {
    'Restaurants': [1, 2, 3, 4],
    'Stars': ['Five', 'One', 'Four', 'Three'],
}
df = pd.DataFrame(star_ratings)
df

Unnamed: 0,Restaurants,Stars
0,1,Five
1,2,One
2,3,Four
3,4,Three


In [38]:
# Stars is still a plain string column here, so the sort is alphabetical
# (Five, Four, One, Three) rather than by rating.
df.sort_values('Stars')

Unnamed: 0,Restaurants,Stars
0,1,Five
2,3,Four
1,2,One
3,4,Three


In [52]:
# Define an ordered categorical dtype so that star ratings sort and compare by
# rank ('One' < 'Two' < ... < 'Five') instead of alphabetically. The list order
# below *is* the logical order; a category that does not occur in the data
# (here 'Two') may still be declared.
# CategoricalDtype is imported in the notebook's top import cell.
category_def = CategoricalDtype(
    categories=['One', 'Two', 'Three', 'Four', 'Five'],
    ordered=True,
)
df['Stars'] = df['Stars'].astype(category_def)

# Display the converted column to check the dtype and the category ordering.
df.Stars

0     Five
1      One
2     Four
3    Three
Name: Stars, dtype: category
Categories (5, object): ['One' < 'Two' < 'Three' < 'Four' < 'Five']

In [53]:
# With the ordered categorical dtype the sort follows the declared ranking
# (One < Two < Three < Four < Five), not the alphabet.
df.sort_values('Stars')

Unnamed: 0,Restaurants,Stars
1,2,One
3,4,Three
2,3,Four
0,1,Five


In [55]:
# Ordered categories can be compared against a category label, so this keeps
# the rows rated 'Four' or higher.
df.loc[df['Stars'] >= 'Four']

Unnamed: 0,Restaurants,Stars
0,1,Five
2,3,Four
