> 主要是用来记录 pandas 有关用法，以举例的形式进行呈现。

In [17]:
import pandas as pd
import numpy as np
from datetime import datetime

# pandas

In [6]:
# 从字典对象创建DataFrame，并设置索引
dates = pd.date_range('today', periods=10)
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
        'dates': dates}
indexs = [list('abcdefghij')]

df = pd.DataFrame(data=data, index=indexs)
df

Unnamed: 0,animal,age,visits,priority,dates
a,cat,2.5,1,yes,2020-09-24 11:16:45.911077
b,cat,3.0,3,yes,2020-09-25 11:16:45.911077
c,snake,0.5,2,no,2020-09-26 11:16:45.911077
d,dog,,3,yes,2020-09-27 11:16:45.911077
e,dog,5.0,2,no,2020-09-28 11:16:45.911077
f,cat,2.0,3,no,2020-09-29 11:16:45.911077
g,snake,4.5,1,no,2020-09-30 11:16:45.911077
h,cat,,1,yes,2020-10-01 11:16:45.911077
i,dog,7.0,2,no,2020-10-02 11:16:45.911077
j,dog,3.0,1,no,2020-10-03 11:16:45.911077


In [7]:
# 显示df的基础信息，包括行的数量；列名；每一列值的数量、类型
df.info()
# df.describe()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10 entries, ('a',) to ('j',)
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   animal    10 non-null     object        
 1   age       8 non-null      float64       
 2   visits    10 non-null     int64         
 3   priority  10 non-null     object        
 4   dates     10 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 506.0+ bytes


In [11]:
# 第1，5，7行的 animal 和 age 列
df.loc[df.index[[0, 4, 6]], ['animal', 'age']]

Unnamed: 0,animal,age
a,cat,2.5
e,dog,5.0
g,snake,4.5


In [23]:
# 插入新行 k 为['dog', 5.5, 2, 'no']，并删除
df.loc['k', :] = ['dog', 5.5, 2, 'no', datetime.now()]
df = df.drop('k')
df

Unnamed: 0,animal,age,visits,priority,dates
a,cat,2.5,1.0,yes,2020-09-24 11:16:45.911077
b,cat,3.0,3.0,yes,2020-09-25 11:16:45.911077
c,snake,0.5,2.0,no,2020-09-26 11:16:45.911077
d,dog,,3.0,yes,2020-09-27 11:16:45.911077
e,dog,5.0,2.0,no,2020-09-28 11:16:45.911077
f,cat,2.0,3.0,no,2020-09-29 11:16:45.911077
g,snake,4.5,1.0,no,2020-09-30 11:16:45.911077
h,cat,,1.0,yes,2020-10-01 11:16:45.911077
i,dog,7.0,2.0,no,2020-10-02 11:16:45.911077
j,dog,3.0,1.0,no,2020-10-03 11:16:45.911077


In [25]:
# 取出 age 缺失 的行
df[df['age'].isnull()]

Unnamed: 0,animal,age,visits,priority,dates
d,dog,,3.0,yes,2020-09-27 11:16:45.911077
h,cat,,1.0,yes,2020-10-01 11:16:45.911077


In [28]:
# 计算 visit 列总和
df['visits'].sum()

19.0

In [29]:
# 计算 animal 类别个数
df['animal'].value_counts()

dog      4
cat      4
snake    2
Name: animal, dtype: int64

In [30]:
# 计算各类 animal 的平均年龄
df.groupby('animal')['age'].mean()

animal
cat      2.5
dog      5.0
snake    2.5
Name: age, dtype: float64

In [31]:
# 按照 visits 列升序，再按照 age 列降序
df.sort_values(by=['visits', 'age'], ascending=[True, False])

Unnamed: 0,animal,age,visits,priority,dates
g,snake,4.5,1.0,no,2020-09-30 11:16:45.911077
j,dog,3.0,1.0,no,2020-10-03 11:16:45.911077
a,cat,2.5,1.0,yes,2020-09-24 11:16:45.911077
h,cat,,1.0,yes,2020-10-01 11:16:45.911077
i,dog,7.0,2.0,no,2020-10-02 11:16:45.911077
e,dog,5.0,2.0,no,2020-09-28 11:16:45.911077
c,snake,0.5,2.0,no,2020-09-26 11:16:45.911077
b,cat,3.0,3.0,yes,2020-09-25 11:16:45.911077
f,cat,2.0,3.0,no,2020-09-29 11:16:45.911077
d,dog,,3.0,yes,2020-09-27 11:16:45.911077
