In [1]:
import numpy as np
import pandas as pd

In [2]:
# One-dimensional labelled arrays (Series); all three share the labels 'a'..'e'.
p = pd.Series([0.25, 0.5, 1, 1.5, 2], index=list('abcde'))
b = pd.Series([5] * 5, index=list('abcde'))
e = pd.Series([6] * 5, index=list('abcde'))

In [3]:
# Two-dimensional table (DataFrame) built from a dict of Series:
# each key becomes a column label and the Series' shared labels form the row index.
r = pd.DataFrame({'值': p, '余项': b, '其他': e})
r
# print(r)  # alternative: plain-text output instead of the rich table display

Unnamed: 0,值,余项,其他
a,0.25,5,6
b,0.5,5,6
c,1.0,5,6
d,1.5,5,6
e,2.0,5,6


In [4]:
# Row labels (the index) of the DataFrame
r.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
# Column labels; they are held in an Index object as well
r.columns

Index(['值', '余项', '其他'], dtype='object')

In [6]:
# Indexing a DataFrame with [] selects columns.
# A single label returns that column as a Series (note the Name/dtype footer) ...
print(r['值'])
# ... while a one-element list of labels returns a one-column DataFrame.
print(r[['值']])

a    0.25
b    0.50
c    1.00
d    1.50
e    2.00
Name: 值, dtype: float64
      值
a  0.25
b  0.50
c  1.00
d  1.50
e  2.00


In [7]:
# A list of column labels (explicit index) selects several columns at once
# and returns them as a DataFrame.
r[['余项', '其他']]

Unnamed: 0,余项,其他
a,5,6
b,5,6
c,5,6
d,5,6
e,5,6


### 创建DataFrame对象

In [8]:
# Constructor form: pd.DataFrame(data=..., index=..., columns=...)
# From a single Series: the result has one column (labelled 'p' here)
# and keeps the Series' labels as its row index.
pd.DataFrame(data=p, columns=['p'])

Unnamed: 0,p
a,0.25
b,0.5
c,1.0
d,1.5
e,2.0


In [9]:
# From a list of dicts: each dict becomes one row and its keys the column labels.
pd.DataFrame([dict(a=n, b=2 * n, c=n ** 2) for n in range(1, 10)])

Unnamed: 0,a,b,c
0,1,2,1
1,2,4,4
2,3,6,9
3,4,8,16
4,5,10,25
5,6,12,36
6,7,14,49
7,8,16,64
8,9,18,81


In [10]:
# Missing values are represented as NaN.
# Construction from a dict of Series is shown in the earlier cell that builds `r`.
# From a 2-D NumPy array: rows get a default integer index and `columns`
# supplies the column labels.
# A locally seeded generator keeps the displayed numbers identical on every
# Restart & Run All without touching NumPy's global random state.
pd.DataFrame(np.random.default_rng(0).random((3, 2)), columns=['a', 'b'])

Unnamed: 0,a,b
0,0.147041,0.967765
1,0.675492,0.040561
2,0.999896,0.65957


In [11]:
# From a NumPy structured array: the field names become the column labels.
A = np.zeros(3, dtype=np.dtype([('A', np.int64), ('B', np.int64)]))
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0
1,0,0
2,0,0


---
### 数据取值与选择

In [12]:
# A DataFrame can be used like a dictionary of columns.
# NOTE(review): the label 'New Yourk' looks like a typo for 'New York'; it is
# left unchanged here because it is a runtime index label.
area = pd.Series({'California': 423967, 'Texas': 695662, 'New Yourk': 141297,
                  'Florida': 170312, 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New Yourk': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
# Both Series carry the same labels, which become the row index.
data = pd.DataFrame({'area': area, 'pop': pop})
print(data)
# Dictionary-style access (data['area']) and attribute-style access (data.area)
# show the same column; the recorded output reports True for that identity check.
# `data.pop` is NOT the 'pop' column (recorded output: False), so use data['pop'].
print(data['area'], data.area, data.area is data['area'],
      data.pop is data['pop'])  # name clashes with a Python method
# Assigning to a new key adds a column, computed element-wise from two others.
data['density'] = data['pop']/data['area']
print(data)

              area       pop
California  423967  38332521
Texas       695662  26448193
New Yourk   141297  19651127
Florida     170312  19552860
Illinois    149995  12882135
California    423967
Texas         695662
New Yourk     141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 California    423967
Texas         695662
New Yourk     141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 True False
              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New Yourk   141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [13]:
# A DataFrame can also be treated like a two-dimensional array.
# data.values exposes the data as a 2-D array, one inner row per DataFrame row.
print(data.values)

[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
 [6.95662000e+05 2.64481930e+07 3.80187404e+01]
 [1.41297000e+05 1.96511270e+07 1.39076746e+02]
 [1.70312000e+05 1.95528600e+07 1.14806121e+02]
 [1.49995000e+05 1.28821350e+07 8.58837628e+01]]


In [14]:
# data.T is the transpose: row labels and column labels swap places
print(data.T)

           California         Texas     New Yourk       Florida      Illinois
area     4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05  1.499950e+05
pop      3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07  1.288214e+07
density  9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02  8.588376e+01


In [15]:
# Fetch one row (first row of the underlying array) and one column (by label)
print(data.values[0], data['area'])

[4.23967000e+05 3.83325210e+07 9.04139261e+01] California    423967
Texas         695662
New Yourk     141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [16]:
# Implicit (positional) indexing with iloc: first three rows, first two columns
print(data.iloc[:3, :2])
# Explicit (label-based) indexing with loc: the end labels of both slices are included
print(data.loc[:'Florida', :'pop'])
# Hybrid indexing via .ix is no longer available since pandas 1.0
# print(data.ix[:3,:'pop'])
# Boolean mask for the rows combined with fancy indexing for the columns
print(data.loc[data.density > 100, ['pop', 'density']])
# The same indexers can be used for assignment: this overwrites the value at
# row 0, column 2 (California's density) in place, so `data` is changed from here on
data.iloc[0, 2] = 80
print(data)

              area       pop
California  423967  38332521
Texas       695662  26448193
New Yourk   141297  19651127
              area       pop
California  423967  38332521
Texas       695662  26448193
New Yourk   141297  19651127
Florida     170312  19552860
                pop     density
New Yourk  19651127  139.076746
Florida    19552860  114.806121
              area       pop     density
California  423967  38332521   80.000000
Texas       695662  26448193   38.018740
New Yourk   141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [17]:
# Other selection shortcuts with plain []: these three forms act on rows, not columns
print(data['Florida':'Illinois'])  # slice by row labels (end label included)
print(data[1:3])                   # slice by row position (end position excluded)
print(data[data.density > 100])    # boolean mask keeps only the matching rows

            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763
             area       pop     density
Texas      695662  26448193   38.018740
New Yourk  141297  19651127  139.076746
             area       pop     density
New Yourk  141297  19651127  139.076746
Florida    170312  19552860  114.806121
