In [1]:
import pandas as pd
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [2]:
# 查看数据集的信息
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.1+ KB


In [3]:
# 查看数据集准确大小
drinks.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


In [4]:
# 查看各列的准确大小
drinks.memory_usage(deep = True)
# 可以发现object类型的列明显占据更大的存储空间
# 用什么方式转换一下，使其占据更小的空间呢？

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [5]:
drinks.memory_usage(deep = True).sum()

31176

In [8]:
# 以continent字段为例，剖析节省空间的方法
sorted(drinks.continent.unique())
# 每个样本都用大洲名称来存浪费空间，如果Africa - 0 Asia - 1 Europe - 2 ... 就节省空间了
# 那么我们如何转换呢？

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [9]:
# 把continent转换成category类型
drinks['continent'] = drinks.continent.astype('category')

In [10]:
drinks.continent.head()
# 发现continent的类型变成category

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [11]:
# 查看一下continent实际怎么存储的？
# 发现实际存储的是数字，因此可以节省空间
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [12]:
# 我们来看一下空间是否真的变小了
drinks.memory_usage(deep = True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [None]:
# 那么我们把country也变成category是否可以节省空间呢？
# 不行，因为country字段的取值太多了

In [13]:
df = pd.DataFrame({'ID':[100, 101, 102, 103], 'quality':['good', 'very good', 'good', 'excellent']})
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [15]:
# 根据quality排序
df.sort_values('quality')
# 我们发现quality的排序可能不是我们想要的，我们希望very good排在中间，这个字段是有序字段
# 怎么解决？用category类型

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [17]:
df['quality'] = df.quality.astype('category', categories = ['good', 'very good', 'excellent'], ordered = True)

TypeError: _astype() got an unexpected keyword argument 'categories'

In [18]:
# 发现quality变成category，而且是有序的 good < very good < excellent
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [19]:
df.sort_values('quality')

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [20]:
# 转变成category后，还有个好处，就是筛选更方便
# 比如：挑选出quality比good更好的所有样本
df.loc[df.quality > 'good', :]

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent
