### inplace parameter

In [22]:
import pandas as pd

# Load the UFO sightings dataset from a remote CSV and preview the first five rows.
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [23]:
ufo.drop('City',axis=1)
# drop() defaults to inplace=False: it returns a new DataFrame without 'City' (the output shown
# for this cell) and leaves ufo itself unchanged

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00
...,...,...,...,...
18236,,TRIANGLE,IL,12/31/2000 23:00
18237,,DISK,IA,12/31/2000 23:00
18238,,,WI,12/31/2000 23:45
18239,RED,LIGHT,WI,12/31/2000 23:45


In [24]:
ufo.drop('City',axis=1,inplace=True)
# with inplace=True, drop() modifies ufo directly and returns None, so this cell displays nothing

### Making pandas dataframe smaller and faster

In [25]:
# Load the alcohol-consumption-by-country dataset from a remote CSV and preview the first five rows.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [26]:
drinks.info()
# The last line of the summary is the DataFrame's memory usage. Object columns only hold
# references to Python objects, and by default pandas does not follow those references when
# measuring, so the figure is a lower bound -- hence the '+' in "9.2+ KB".

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [27]:
drinks.info(memory_usage='deep')
# memory_usage='deep' also measures the objects the object columns refer to, so the size is reported without the '+'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [28]:
drinks.memory_usage(deep=True)
# returns a Series with the memory footprint, in bytes, of the index and of each column

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [29]:
drinks.memory_usage(deep=True).sum() # total size in bytes: 31224 bytes / 1024 is about 30.5 KB, matching info(memory_usage='deep')

31224

We can make the DataFrame smaller by storing repeated strings as integer codes, because integers are more space-efficient than Python string objects.

In [30]:
sorted(drinks.continent.unique())
# there are only six unique values in this Series, so each one can be represented by a small integer

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [31]:
# Store continent as a categorical column instead of plain strings.
drinks['continent'] = drinks['continent'].astype('category')

In [32]:
drinks.dtypes
# 'category' is a dedicated pandas dtype; its methods are reached through the .cat accessor,
# just as string methods are reached through .str
# pandas now stores continent as integer codes plus a lookup table of the category values

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [33]:
drinks.continent.cat.codes.head()
# these are the integer codes (int8) stored in place of the strings; each code is a position in
# the lookup table of categories (here 0 -> 'Africa', 1 -> 'Asia', 2 -> 'Europe')

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [34]:
drinks.memory_usage(deep=True)
# the continent column dropped from 12332 to 756 bytes now that it is stored as integer codes plus a lookup table; operations on the column can also be faster

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [35]:
# Convert country to category as well, then re-measure every column.
# Memory usage goes UP for country (12588 -> 17142 bytes): every country name is unique, so the
# lookup table has to hold all of the strings and the integer codes are stored on top of it.
drinks['country'] = drinks['country'].astype('category')

drinks.memory_usage(deep=True)

Index                             128
country                         17142
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [36]:
# Small example frame: quality ratings stored as plain strings.
df = pd.DataFrame(
    {
        'ID': [100, 101, 102, 103],
        'quality': ['good', 'very good', 'good', 'excellent'],
    }
)
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [37]:
# quality still holds plain strings here, so the sort is alphabetical rather than by rank
df.sort_values('quality')

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [38]:
from pandas.api.types import CategoricalDtype

# Declare an ordered categorical dtype ('good' < 'very good' < 'excellent') and convert the
# quality column to it, so sorting and comparisons follow this ranking.
quality_cat = CategoricalDtype(categories=['good', 'very good', 'excellent'], ordered=True)
df['quality'] = df['quality'].astype(quality_cat)


In [39]:
# inspect the converted column: its dtype is now category, ordered good < very good < excellent
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): ['good' < 'very good' < 'excellent']

In [40]:
df.sort_values('quality') 
# the sort now follows the logical order defined by the custom ordered category

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [41]:
# ordered categories support comparison operators: keep only the rows ranked above 'good'
df.loc[df.quality > 'good']

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent
