In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__ # show the installed pandas version

'0.25.3'

## Series

In [1]:
# A Series may mix value types (int, float, NaN, str); the dtype is then object.
ser = pd.Series(
    data=[1, 3, 5.1, np.nan, '张三'],
    index=[1, 2, 3, 4, 5],
)
ser

1      1
2      3
3    5.1
4    NaN
5     张三
dtype: object

In [4]:
# Get the value stored under index label 2, using []
ser[2]

3

In [5]:
# Overwrite the value stored under index label 2, using []
ser[2] = 'Jack'
ser

1       1
2    Jack
3     5.1
4     NaN
5      张三
dtype: object

In [7]:
# Select index labels 1, 3 and 5 by passing a list inside []
ser[[1, 3, 5]]

1      1
3    5.1
5     张三
dtype: object

In [8]:
print(ser.values)
type(ser.values) # type of the Series' underlying values (numpy.ndarray), not of the Series itself

[1 'Jack' 5.1 nan '张三']


numpy.ndarray

In [10]:
# Define a Series, then add 1 to every element
ser2 = pd.Series([18, 19, 17], index = range(1, 4))
ser2 + 1 # element-wise addition returns a new Series; ser2 itself is not modified

1    19
2    20
3    18
dtype: int64

In [11]:
# ser2 is unchanged by the addition in the previous cell
ser2

1    18
2    19
3    17
dtype: int64

In [14]:
# Rebind ser to an all-numeric Series (the recorded output shows dtype float64)
ser = pd.Series([1, 3, 5.1, 6, 9], index = list(range(1, 6)))
ser[ser % 2 == 1] # the boolean mask goes inside []; keeps the values 1, 3 and 9

1    1.0
2    3.0
5    9.0
dtype: float64

### 通过字典生成Series对象

In [16]:
# Build a Series from a dict: the keys (beijing, shanghai, guangzhou)
# become the index and the numbers (9240, 8960, 7400) become the values.
dic = {
    'beijing': 9240,
    'shanghai': 8960,
    'guangzhou': 7400,
}
ser3 = pd.Series(data=dic)
ser3

beijing      9240
shanghai     8960
guangzhou    7400
dtype: int64

In [17]:
# Select several entries by passing a list of index labels inside []
ser3[['beijing','guangzhou']]

beijing      9240
guangzhou    7400
dtype: int64

### Series转其他类型

In [20]:
# `in` checks the index labels of the Series ('beijing' is a label, so this prints True)
print('beijing' in ser3)
print(ser3.to_dict()) # convert to a dict
print(ser3.tolist()) # convert to a list
print(ser3.to_json()) # convert to a JSON string

True
{'beijing': 9240, 'shanghai': 8960, 'guangzhou': 7400}
[9240, 8960, 7400]
{"beijing":9240,"shanghai":8960,"guangzhou":7400}


In [21]:
ser3.to_frame() # convert to a one-column DataFrame (the column is labelled 0 in the recorded output)

Unnamed: 0,0
beijing,9240
shanghai,8960
guangzhou,7400


# DataFrame

### 用含日期时间索引与标签的 NumPy 数组生成 DataFrame

In [23]:
# Six consecutive daily timestamps starting at 2019-01-01; used as the row index of df below
datas = pd.date_range('20190101', periods=6)
datas

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# 6x4 frame of random values, indexed by the dates in `datas`, with columns A-D.
# No random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4), index = datas, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2019-01-01,0.732546,-0.611456,0.672733,0.325307
2019-01-02,-0.855926,-0.781192,-0.957641,0.384169
2019-01-03,-1.863531,0.039663,-0.486457,-2.928965
2019-01-04,2.583917,-0.621555,2.694782,0.577375
2019-01-05,0.281635,-0.064713,1.057266,-0.252323
2019-01-06,-0.668788,3.076215,-0.503385,-0.638602


### 用 Series 字典对象生成 DataFrame

In [26]:
# Build a DataFrame from a dict whose values have different types.
# The scalar entries (A, B, F) are repeated for all 4 rows, as the recorded output shows.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### DataFrame 的列有不同数据类型

In [27]:
# Per-column dtypes: every column of df2 has its own dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### IPython支持 tab 键自动补全列名与公共属性

## 查看数据

In [29]:
# Show the first rows of df (the recorded output lists the first five)
df.head()

Unnamed: 0,A,B,C,D
2019-01-01,0.732546,-0.611456,0.672733,0.325307
2019-01-02,-0.855926,-0.781192,-0.957641,0.384169
2019-01-03,-1.863531,0.039663,-0.486457,-2.928965
2019-01-04,2.583917,-0.621555,2.694782,0.577375
2019-01-05,0.281635,-0.064713,1.057266,-0.252323


In [35]:
# Show the last 3 rows of df
df.tail(3)

Unnamed: 0,A,B,C,D
2019-01-04,2.583917,-0.621555,2.694782,0.577375
2019-01-05,0.281635,-0.064713,1.057266,-0.252323
2019-01-06,-0.668788,3.076215,-0.503385,-0.638602


In [32]:
# Row index of df (a DatetimeIndex, per the recorded output)
df.index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [34]:
# Column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### DataFrame.to_numpy() 输出底层数据的 NumPy 对象
- NumPy 数组只有一种数据类型，DataFrame 每列的数据类型各不相同。
- DataFrame 的列由多种数据类型组成时，该操作耗费系统资源较大，这也是 Pandas 和 NumPy 的本质区别。
- 调用 DataFrame.to_numpy() 时，Pandas 会查找能够容纳 DataFrame 里所有数据类型的 NumPy 数据类型；最终结果可能是 object，这需要把 DataFrame 里的每个值都强制转换为 Python 对象。
- DataFrame.to_numpy() 的输出不包含行索引和列标签。

下面的 df 这个 DataFrame 里的值都是浮点数，DataFrame.to_numpy() 的操作会很快，而且不复制数据。

In [5]:
# Rebuild the all-float demo frame: 6 daily rows starting 2019-01-01, columns A-D,
# filled with random values (no seed is set, so the numbers change on every run).
# pandas and numpy are already imported in the notebook's first cell, so they are
# not imported again here.
df = pd.DataFrame(np.random.randn(6, 4), index = pd.date_range('20190101', periods=6), columns = list('ABCD'))
# to_numpy() returns only the values; row index and column labels are not part of the array.
df.to_numpy()

array([[-1.71499051,  0.94256846,  0.538325  , -0.66895854],
       [-0.54364367, -1.42667186,  0.7317733 ,  1.40133949],
       [ 3.61197637, -0.56630825,  0.81489105, -0.29826305],
       [-1.56784327, -1.08047619,  0.09456628,  0.15996781],
       [-0.26885637,  0.88649103, -0.16920089, -0.29695971],
       [ 0.86889926,  0.9945544 , -0.14170551, -1.92196584]])

df2 这个 DataFrame 包含了多种类型，DataFrame.to_numpy() 操作就会耗费较多资源。

In [6]:
# Rebuild the mixed-dtype frame: the scalar entries (A, B, F) are repeated for all 4 rows.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
# The columns have different dtypes, so the resulting array has dtype=object (see recorded output).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

describe() 可以快速查看数据的统计摘要：

In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.064257,-0.04164,0.311442,-0.270807
std,1.977056,1.111446,0.43932,1.083322
min,-1.714991,-1.426672,-0.169201,-1.921966
25%,-1.311793,-0.951934,-0.082638,-0.576285
50%,-0.40625,0.160091,0.316446,-0.297611
75%,0.58446,0.928549,0.683411,0.045736
max,3.611976,0.994554,0.814891,1.401339


转置数据：

In [8]:
# Transpose: the columns A-D become rows and the dates become the column labels
df.T

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
A,-1.714991,-0.543644,3.611976,-1.567843,-0.268856,0.868899
B,0.942568,-1.426672,-0.566308,-1.080476,0.886491,0.994554
C,0.538325,0.731773,0.814891,0.094566,-0.169201,-0.141706
D,-0.668959,1.401339,-0.298263,0.159968,-0.29696,-1.921966


按轴排序：

In [9]:
# Sort along axis 1 (the column labels) in descending order: D, C, B, A
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2019-01-01,-0.668959,0.538325,0.942568,-1.714991
2019-01-02,1.401339,0.731773,-1.426672,-0.543644
2019-01-03,-0.298263,0.814891,-0.566308,3.611976
2019-01-04,0.159968,0.094566,-1.080476,-1.567843
2019-01-05,-0.29696,-0.169201,0.886491,-0.268856
2019-01-06,-1.921966,-0.141706,0.994554,0.868899


按值排序：

In [11]:
# Sort the rows by the values in column B (smallest first in the recorded output)
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2019-01-02,-0.543644,-1.426672,0.731773,1.401339
2019-01-04,-1.567843,-1.080476,0.094566,0.159968
2019-01-03,3.611976,-0.566308,0.814891,-0.298263
2019-01-05,-0.268856,0.886491,-0.169201,-0.29696
2019-01-01,-1.714991,0.942568,0.538325,-0.668959
2019-01-06,0.868899,0.994554,-0.141706,-1.921966


# 选择

## 获取数据

选择单列，产生 Series，与 df.A 等效：

In [12]:
# A single column label inside [] selects that column as a Series
df['A']

2019-01-01   -1.714991
2019-01-02   -0.543644
2019-01-03    3.611976
2019-01-04   -1.567843
2019-01-05   -0.268856
2019-01-06    0.868899
Freq: D, Name: A, dtype: float64

In [13]:
# A slice inside [] selects rows by position: rows 0, 1 and 2 (the end is excluded)
df[0:3]

Unnamed: 0,A,B,C,D
2019-01-01,-1.714991,0.942568,0.538325,-0.668959
2019-01-02,-0.543644,-1.426672,0.731773,1.401339
2019-01-03,3.611976,-0.566308,0.814891,-0.298263


In [15]:
# A slice of date labels selects rows; both endpoints are included (3 rows in the recorded output)
df['20190102':'20190104']

Unnamed: 0,A,B,C,D
2019-01-02,-0.543644,-1.426672,0.731773,1.401339
2019-01-03,3.611976,-0.566308,0.814891,-0.298263
2019-01-04,-1.567843,-1.080476,0.094566,0.159968


In [22]:
# A list of labels inside [] selects several columns
df[['A','C']]

Unnamed: 0,A,C
2019-01-01,-1.714991,0.538325
2019-01-02,-0.543644,0.731773
2019-01-03,3.611976,0.814891
2019-01-04,-1.567843,0.094566
2019-01-05,-0.268856,-0.169201
2019-01-06,0.868899,-0.141706


In [26]:
# Positional row slice of length one: returns a one-row DataFrame (row 2 only)
df[2:3]

Unnamed: 0,A,B,C,D
2019-01-03,3.611976,-0.566308,0.814891,-0.298263


### .loc 按标签选择，标签即行列名字

In [28]:
# Select one row by its label; the result is a Series indexed by the column names
df.loc['20190101']

A   -1.714991
B    0.942568
C    0.538325
D   -0.668959
Name: 2019-01-01 00:00:00, dtype: float64

In [29]:
# All rows, columns A and C, selected by label
df.loc[:, ['A', 'C']]

Unnamed: 0,A,C
2019-01-01,-1.714991,0.538325
2019-01-02,-0.543644,0.731773
2019-01-03,3.611976,0.814891
2019-01-04,-1.567843,0.094566
2019-01-05,-0.268856,-0.169201
2019-01-06,0.868899,-0.141706


In [30]:
# Label slice on the rows (both endpoints included) combined with a list of column labels
df.loc['20190102':'20190104',['A', 'C']]

Unnamed: 0,A,C
2019-01-02,-0.543644,0.731773
2019-01-03,3.611976,0.814891
2019-01-04,-1.567843,0.094566


返回对象降维：

In [31]:
# A single row label plus a list of columns reduces the result to a Series
df.loc['20190101', ['A', 'B']]

A   -1.714991
B    0.942568
Name: 2019-01-01 00:00:00, dtype: float64

提取标量值：

In [32]:
# A single row label and a single column label give a scalar
df.loc['20190101', 'A']

-1.7149905142046848

快速访问标量，与上述方法等效：

In [33]:
# Scalar access by label with .at; returns the same value as df.loc['20190101', 'A']
df.at['20190101', 'A']

-1.7149905142046848

### .iloc 按位置选择，位置即从 0 开始的整数行号、列号

In [35]:
# Select the row at integer position 3 (the fourth row, 2019-01-04)
df.iloc[3]

A   -1.567843
B   -1.080476
C    0.094566
D    0.159968
Name: 2019-01-04 00:00:00, dtype: float64

In [36]:
# Integer slices: rows 3-4 and columns 0-1 (the end of each slice is excluded)
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2019-01-04,-1.567843,-1.080476
2019-01-05,-0.268856,0.886491


In [37]:
# Lists of integer positions: rows 1, 2, 4 and columns 0, 2 (A and C)
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2019-01-02,-0.543644,0.731773
2019-01-03,3.611976,0.814891
2019-01-05,-0.268856,-0.169201


In [38]:
# Rows at positions 1-2, all columns
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2019-01-02,-0.543644,-1.426672,0.731773,1.401339
2019-01-03,3.611976,-0.566308,0.814891,-0.298263


In [39]:
# All rows, columns at positions 1-2 (B and C)
df.iloc[:, 1:3]

Unnamed: 0,B,C
2019-01-01,0.942568,0.538325
2019-01-02,-1.426672,0.731773
2019-01-03,-0.566308,0.814891
2019-01-04,-1.080476,0.094566
2019-01-05,0.886491,-0.169201
2019-01-06,0.994554,-0.141706


In [40]:
# Scalar at row position 1, column position 1
df.iloc[1, 1]

-1.4266718551281794

In [41]:
# Scalar access by integer position with .iat; returns the same value as df.iloc[1, 1]
df.iat[1, 1]

-1.4266718551281794

# 布尔索引

用单列的值选择数据：

In [42]:
# Keep only the rows where column A is greater than 0
df[df.A > 0]

Unnamed: 0,A,B,C,D
2019-01-03,3.611976,-0.566308,0.814891,-0.298263
2019-01-06,0.868899,0.994554,-0.141706,-1.921966


选择 DataFrame 里满足条件的值：

In [43]:
# Element-wise mask: values that are not > 0 come back as missing (blank in the recorded output)
df[df > 0]

Unnamed: 0,A,B,C,D
2019-01-01,,0.942568,0.538325,
2019-01-02,,,0.731773,1.401339
2019-01-03,3.611976,,0.814891,
2019-01-04,,,0.094566,0.159968
2019-01-05,,0.886491,,
2019-01-06,0.868899,0.994554,,


In [46]:
# Work on a copy of df (this rebinds the name df2) and add a string column E to the copy;
# df itself does not receive column E.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2019-01-01,-1.714991,0.942568,0.538325,-0.668959,one
2019-01-02,-0.543644,-1.426672,0.731773,1.401339,one
2019-01-03,3.611976,-0.566308,0.814891,-0.298263,two
2019-01-04,-1.567843,-1.080476,0.094566,0.159968,three
2019-01-05,-0.268856,0.886491,-0.169201,-0.29696,four
2019-01-06,0.868899,0.994554,-0.141706,-1.921966,three


In [47]:
# isin() filter: keep the rows whose E value is 'one' or 'two'
df2[df2['E'].isin(['one', 'two'])]

Unnamed: 0,A,B,C,D,E
2019-01-01,-1.714991,0.942568,0.538325,-0.668959,one
2019-01-02,-0.543644,-1.426672,0.731773,1.401339,one
2019-01-03,3.611976,-0.566308,0.814891,-0.298263,two


# 赋值

In [49]:
# Six integers (1-6) on consecutive days starting 2019-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20190102', periods=6),
)
s1

2019-01-02    1
2019-01-03    2
2019-01-04    3
2019-01-05    4
2019-01-06    5
2019-01-07    6
Freq: D, dtype: int64

In [50]:
# Assigning a Series as a new column aligns it on the index: 2019-01-01 has no entry
# in s1 and becomes NaN, while s1's 2019-01-07 has no matching row in df.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2019-01-01,-1.714991,0.942568,0.538325,-0.668959,
2019-01-02,-0.543644,-1.426672,0.731773,1.401339,1.0
2019-01-03,3.611976,-0.566308,0.814891,-0.298263,2.0
2019-01-04,-1.567843,-1.080476,0.094566,0.159968,3.0
2019-01-05,-0.268856,0.886491,-0.169201,-0.29696,4.0
2019-01-06,0.868899,0.994554,-0.141706,-1.921966,5.0


In [51]:
# Overwrite every value in column D with 5, using an array that has one element per row of df
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2019-01-01,-1.714991,0.942568,0.538325,5,
2019-01-02,-0.543644,-1.426672,0.731773,5,1.0
2019-01-03,3.611976,-0.566308,0.814891,5,2.0
2019-01-04,-1.567843,-1.080476,0.094566,5,3.0
2019-01-05,-0.268856,0.886491,-0.169201,5,4.0
2019-01-06,0.868899,0.994554,-0.141706,5,5.0


In [53]:
# Take a copy of df under the name df3.
# NOTE(review): the TypeError recorded below this cell ("bad operand type for unary -: 'str'")
# cannot be raised by a plain copy; it is presumably stale output from an earlier
# version of this cell -- re-run the cell to refresh it.
df3 = df.copy()

TypeError: bad operand type for unary -: 'str'