In [2]:
import pandas as pd
import numpy as np

In [8]:
# Compare a frame built with copy=False against one built with the default
# settings, before and after writing into the source array in place.
arr = np.array([1, 2, 3])
df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False)
df1 = pd.DataFrame({"A": arr, "B": arr.copy()})
for frame in (df, df1):
    print(frame)

# In the recorded output only column "A" of the copy=False frame follows
# this write; df1 keeps its original values.
arr[0] = 0
for frame in (df, df1):
    print(frame)

   A  B
0  1  1
1  2  2
2  3  3
   A  B
0  1  1
1  2  2
2  3  3
   A  B
0  0  1
1  2  2
2  3  3
   A  B
0  1  1
1  2  2
2  3  3


In [10]:
# NOTE: creating DataFrames
# From a dict of equal-length lists: every key becomes a column.
d = {}
d['国家'] = ['中国', '美国', '日本']
d['人口'] = [14.33, 3.29, 1.26]
df = pd.DataFrame(data=d)
df

Unnamed: 0,国家,人口
0,中国,14.33
1,美国,3.29
2,日本,1.26


In [12]:
# Build the frame from d again, this time with explicit row labels.
df = pd.DataFrame(data=d, index=list('abc'))
df

Unnamed: 0,国家,人口
a,中国,14.33
b,美国,3.29
c,日本,1.26


In [14]:
# From a dict of Series: the row labels of both Series are combined, and
# 'one' has no entry for label 'd', so that cell stays empty.
d = dict(
    one=pd.Series([1., 2., 3.], index=list('abc')),
    two=pd.Series([1., 2., 3., 4.], index=list('abcd')),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [16]:
# Choose and order rows with `index` and columns with `columns`; the
# requested column 'three' is not a key of d, so it comes out empty.
pd.DataFrame(data=d, columns=['two', 'three'], index=['d', 'b', 'a'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [18]:
# From a dict of ndarrays or lists; printed first with the default row
# labels, then with explicit ones.
d = dict(one=[1., 2., 3., 4.], two=[4., 3., 2., 1.])
for labels in (None, ['a', 'b', 'c', 'd']):
    print(pd.DataFrame(d, index=labels))

   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


In [20]:
# From a structured (record) array
# Two zero-initialised records, each with the fields A (i4), B (f4), C (S10).
data = np.zeros(2, dtype=np.dtype([('A', 'i4'), ('B', 'f4'), ('C', 'S10')]))
print(pd.DataFrame(data))

# Overwrite both records with real values, then build the frame again.
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
print(pd.DataFrame(data))

   A    B    C
0  0  0.0  b''
1  0  0.0  b''
   A    B         C
0  1  2.0  b'Hello'
1  2  3.0  b'World'


In [21]:
# Supply explicit row labels.
pd.DataFrame(data=data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [22]:
# Supply the column names; the fields appear in the order listed here.
pd.DataFrame(data=data, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [24]:
# From a list of dicts
# Each dict is one row; the first row has no 'c' key.
data2 = [dict(a=1, b=2), dict(a=5, b=10, c=20)]

# Build the DataFrame and print it.
print(pd.DataFrame(data=data2))

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [25]:
# Supply explicit row labels.
pd.DataFrame(data=data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [26]:
# Keep only the listed columns.
pd.DataFrame(data=data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [27]:
# From a dict keyed by tuples
# Tuple keys on both nesting levels: the rendered frame has two header
# levels for the columns and two label levels for the rows.
pd.DataFrame(
    data={
        ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
        ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
        ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
        ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
        ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [28]:
# From a single Series: it becomes the frame's only column.
s1 = pd.Series(list('abcde'))
pd.DataFrame(data=s1)

Unnamed: 0,0
0,a
1,b
2,c
3,d
4,e


In [29]:
# Alternate constructors
# Build from a dict via the from_dict classmethod.
pd.DataFrame.from_dict({'A': [1, 2, 3], 'B': [4, 5, 6]})

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [34]:
# 'tight' orientation: the dict has the keys
# ['index', 'columns', 'data', 'index_names', 'column_names'].
# The two *_names entries are lists with one name per index level. A bare
# string only works when the name is a single character, because the string
# is then treated as a one-element sequence of names.
data = {
    'index': ['x', 'y'],
    'columns': ['a', 'b'],
    'data': [[1, 3], [2, 4]],
    'index_names': ['T'],
    'column_names': ['S'],
}
df = pd.DataFrame.from_dict(data, orient='tight')
df

S,a,b
T,Unnamed: 1_level_1,Unnamed: 2_level_1
x,1,3
y,2,4


In [32]:
# Build from a sequence of records (lists, tuples or ndarrays).
pd.DataFrame.from_records(data=[(1, 2., b'Hello'), (2, 3., b'World')])

Unnamed: 0,0,1,2
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [38]:
# A column whose values are dicts (the sketches below are left commented out)
# pd.json_normalize(df.columns)
# df.columns.apply(pd.Series)

In [39]:
# NOTE: DataFrame operations
# Rebuild the two-column frame from a dict of Series, then show its row labels.
d = {
    name: pd.Series(values, index=list(labels))
    for name, values, labels in [
        ('one', [1., 2., 3.], 'abc'),
        ('two', [1., 2., 3., 4.], 'abcd'),
    ]
}
df = pd.DataFrame(data=d)
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [41]:
# Column labels of the frame.
df.columns

Index(['one', 'two'], dtype='object')

In [42]:
# Selecting, adding and modifying columns
# Selecting a single column by label yields a Series.
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [44]:
# Add a column holding one constant value; the scalar is repeated for every row.
df['foo'] = 'bar'
df

Unnamed: 0,one,two,foo
a,1.0,1.0,bar
b,2.0,2.0,bar
c,3.0,3.0,bar
d,,4.0,bar


In [45]:
# New column taken from only the first two entries of 'one'. The slice is
# positional, so .iloc states that explicitly instead of relying on plain
# [] slicing of a label-indexed Series. Rows outside the slice stay empty.
df['one_trunc'] = df['one'].iloc[:2]
df

Unnamed: 0,one,two,foo,one_trunc
a,1.0,1.0,bar,1.0
b,2.0,2.0,bar,2.0
c,3.0,3.0,bar,
d,,4.0,bar,


In [47]:
# New column: element-wise product of two existing columns.
df['three'] = df['one'].mul(df['two'])
df

Unnamed: 0,one,two,foo,one_trunc,three
a,1.0,1.0,bar,1.0,1.0
b,2.0,2.0,bar,2.0,4.0
c,3.0,3.0,bar,,9.0
d,,4.0,bar,,


In [48]:
# New column holding the boolean result of a comparison.
df['flag'] = df['one'].gt(2)
df

Unnamed: 0,one,two,foo,one_trunc,three,flag
a,1.0,1.0,bar,1.0,1.0,False
b,2.0,2.0,bar,2.0,4.0,False
c,3.0,3.0,bar,,9.0,True
d,,4.0,bar,,,False


In [49]:
# Insert a column named 'bar' at column position 1, filled from df['one'].
df.insert(loc=1, column='bar', value=df['one'])
df

Unnamed: 0,one,bar,two,foo,one_trunc,three,flag
a,1.0,1.0,1.0,bar,1.0,1.0,False
b,2.0,2.0,2.0,bar,2.0,4.0,False
c,3.0,3.0,3.0,bar,,9.0,True
d,,,4.0,bar,,,False


In [50]:
# Remove columns
# `del` drops column 'two'; `pop` removes column 'three' as well, and its
# return value is kept in `three`.
del df['two']
three = df.pop('three')
df

Unnamed: 0,one,bar,foo,one_trunc,flag
a,1.0,1.0,bar,1.0,False
b,2.0,2.0,bar,2.0,False
c,3.0,3.0,bar,,True
d,,,bar,,False


In [52]:
# Create a column inside a method chain.
# assign() adds a column named rate, computed here as one / bar.
df.assign(rate=df['one'].div(df['bar'])).head()

Unnamed: 0,one,bar,foo,one_trunc,flag,rate
a,1.0,1.0,bar,1.0,False,1.0
b,2.0,2.0,bar,2.0,False,1.0
c,3.0,3.0,bar,,True,1.0
d,,,bar,,False,


In [53]:
# The value may also be a lambda; its argument is the whole DataFrame.
df.assign(
    rate1=lambda frame: frame['one'] / frame['bar'],
).head()

Unnamed: 0,one,bar,foo,one_trunc,flag,rate1
a,1.0,1.0,bar,1.0,False,1.0
b,2.0,2.0,bar,2.0,False,1.0
c,3.0,3.0,bar,,True,1.0
d,,,bar,,False,


In [54]:
# Several new columns can be given in one assign() call.
df.assign(
    rate2=lambda frame: frame['one'] / frame['bar'],
    rate3=lambda frame: frame['one'] + frame['bar'],
).head()

Unnamed: 0,one,bar,foo,one_trunc,flag,rate2,rate3
a,1.0,1.0,bar,1.0,False,1.0,2.0
b,2.0,2.0,bar,2.0,False,1.0,4.0
c,3.0,3.0,bar,,True,1.0,6.0
d,,,bar,,False,,


In [55]:
# Selecting data
# Only the last expression of a cell is rendered; the earlier ones are
# evaluated and their results discarded.
df['one']            # one column, by label
df.loc['a']          # one row, by index label
df.iloc[1:3]         # rows by position, stated explicitly instead of df[1:3]
df.iloc[3]           # one row, by position
df[df['one'] > 1]    # rows where the condition holds; ['one'] avoids attribute access

Unnamed: 0,one,bar,foo,one_trunc,flag
b,2.0,2.0,bar,2.0,False
c,3.0,3.0,bar,,True


In [56]:
# Transpose: rows become columns and columns become rows (same as df.T).
df.transpose()

Unnamed: 0,a,b,c,d
one,1.0,2.0,3.0,
bar,1.0,2.0,3.0,
foo,bar,bar,bar,bar
one_trunc,1.0,2.0,,
flag,False,False,True,False
