In [None]:
# 使用 pandas 进行数据处理
## 原因：有key，容易处理

In [3]:
import pandas as pd
import numpy as np

# data build

In [13]:
# Build a 3x3 frame of standard-normal samples with string row labels
# ('a'..'c') and string column labels ('1'..'3').
data = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('abc'),
    columns=list('123'),
)
data

Unnamed: 0,1,2,3
a,-0.941224,1.688849,1.85978
b,-0.854122,-1.337089,0.337921
c,0.591162,-0.076909,-0.744964


# Index

In [None]:
## An Index behaves like a tuple: it is immutable
ind = pd.Index([2, 3, 5, 7, 11]) 

In [14]:
# Iterating over a DataFrame walks its column labels, not its rows.
for column_label in data:
    print(column_label)

1
2
3


In [35]:
# items() yields (column label, column Series) pairs, one per column.
list(data.items())

[('1',
  a   -0.941224
  b   -0.854122
  c    0.591162
  Name: 1, dtype: float64),
 ('2',
  a    1.688849
  b   -1.337089
  c   -0.076909
  Name: 2, dtype: float64),
 ('3',
  a    1.859780
  b    0.337921
  c   -0.744964
  Name: 3, dtype: float64)]

# save

In [24]:
# Save as CSV
# NOTE(review): hard-coded absolute Windows path — presumably only valid on the
# author's machine; consider a configurable output directory.
data.to_csv('E:\\DATABASE\\test.csv')

# append, rec

In [39]:
### Add or modify data
data['4'] = [1.00, 2.0, 3.0] # assigning a scalar, e.g. data['4'] = 1.0, also works
data

Unnamed: 0,1,2,3,4
a,-0.941224,1.688849,1.85978,1.0
b,-0.854122,-1.337089,0.337921,2.0
c,0.591162,-0.076909,-0.744964,3.0


In [67]:
# Reading values
# NOTE: only the last expression of the cell is displayed, so the results of
# the earlier lines are recorded in the comments next to them.
data['1']['a'] # out: -0.9412243812313253  (select the column, then the row label)
data['1'] 
# out:
# a   -0.941224
# b   -0.854122
# c    0.591162
# Name: 1, dtype: float64
## To get just the value, without the extra Series information, use label-based .loc
data.loc['a','1'] # out: -0.9412243812313253
## Access by integer position
data.iloc[0,0] # out: -0.9412243812313253

-0.9412243812313253

# 计算

In [54]:
# Computation
# For unary operations (e.g. trigonometric functions) ufuncs preserve the index
# and column labels in their output; for binary operations (e.g. addition and
# multiplication) pandas automatically aligns the indexes of the operands
# before computing.
data['avr'] = data['1']/data['4']
data 

Unnamed: 0,1,2,3,4,avr
a,-0.941224,1.688849,1.85978,1.0,-0.941224
b,-0.854122,-1.337089,0.337921,2.0,-0.427061
c,0.591162,-0.076909,-0.744964,3.0,0.197054


In [57]:
# Transpose: row labels become column labels and vice versa.
data.T

Unnamed: 0,a,b,c
1,-0.941224,-0.854122,0.591162
2,1.688849,-1.337089,-0.076909
3,1.85978,0.337921,-0.744964
4,1.0,2.0,3.0
avr,-0.941224,-0.427061,0.197054


In [68]:
# Boolean mask: entries that fail the condition are shown as NaN.
data[data>0.5]

Unnamed: 0,1,2,3,4,avr
a,,1.688849,1.85978,1.0,
b,,,,2.0,
c,0.591162,,,3.0,


In [73]:
# Row filter: keep the rows where 'avr' is negative and column '2' is positive.
data[(data['avr']<0) & (data['2']>0)]

Unnamed: 0,1,2,3,4,avr
a,-0.941224,1.688849,1.85978,1.0,-0.941224


In [74]:
# A NumPy ufunc applied to a DataFrame works element-wise and keeps the labels.
np.sin(data * np.pi / 4) 

Unnamed: 0,1,2,3,4,avr
a,-0.673723,0.970288,0.993942,0.707107,-0.673723
b,-0.621633,-0.867497,0.262298,1.0,-0.329159
c,0.447795,-0.060367,-0.552277,0.707107,0.154149


In [76]:
# Index-aligned arithmetic: values that share the same key are combined.
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')

# To avoid NaN for labels that exist in only one operand, use the method form
# with fill_value, e.g. A.add(B, fill_value=fill), where `fill` could be the
# mean of A; that value stands in for the missing entries before the operation.
# (Kept as a comment: A, B and fill are not defined in this notebook, so
# executing the call would raise NameError.)

# The result index is the union of both indexes. The operands may list their
# labels in different orders; the result index is sorted automatically.
# Labels present on only one side produce NaN.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [None]:
# NaN 是一种特殊的浮点数，不是整数、字符串以及其他数据类型。
# 可以把 NaN 看作是一个数据类病毒——它会将与它接触过的数据同化。无论和 NaN 进行何种算术运算，最终结果都是 NaN

In [77]:
# Multi-dimensional tables: the row index has several levels.
# pop.index.names = ['state', 'year'] ## naming the levels is a good way to keep track of multiple index values in complex data
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]),
    columns=['data1', 'data2'],
)
df


Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.855088,0.126474
a,2,0.846651,0.270685
b,1,0.212049,0.774503
b,2,0.261329,0.960601


In [78]:
## Create a MultiIndex from the Cartesian product of the level values
pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) 

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [82]:
# Build hierarchical row and column indexes
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Simulated readings: normal noise rounded to one decimal; every other column
# (the 'HR' ones) is scaled by 10, then 37 is added to every value.
# NOTE(review): this rebinds `data`, replacing the DataFrame of the same name
# built earlier in the notebook.
data = np.random.randn(4, 6).round(1)
data[:, ::2] = data[:, ::2] * 10
data = data + 37

# Assemble the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,35.9,49.0,36.6,39.0,37.6
2013,2,22.0,35.8,37.0,36.5,18.0,37.4
2014,1,30.0,38.0,39.0,38.3,35.0,38.4
2014,2,48.0,37.3,37.0,36.2,53.0,36.4


In [83]:
# Use an IndexSlice object for multi-dimensional indexing; pandas provides it
# specifically for this kind of selection, for example:
idx = pd.IndexSlice 
health_data.loc[idx[:, 1], idx[:, 'HR']] 
# Another way to reshape hierarchical data is to turn index labels into
# columns, which the reset_index method does
# pop.reset_index(name='population') 

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,37.0,49.0,39.0
2014,1,30.0,39.0,35.0


In [91]:
# Aggregation
## Group on an index level to aggregate over subsets of the data.
## The `level=` argument of DataFrame.mean() was deprecated in pandas 1.3 and
## removed in 2.0, so the grouping is spelled out with groupby(level=...).
## Row levels are grouped directly. Column levels are grouped by transposing,
## grouping the (now row) level and transposing back, because
## groupby(axis=1) is itself deprecated in recent pandas.
data_mean = health_data.groupby(level='year').mean()
data_mean1 = health_data.T.groupby(level='type').mean().T
data_mean, data_mean1

(subject   Bob        Guido          Sue      
 type       HR   Temp    HR   Temp    HR  Temp
 year                                         
 2013     29.5  35.85  43.0  36.55  28.5  37.5
 2014     39.0  37.65  38.0  37.25  44.0  37.4,
 type               HR       Temp
 year visit                      
 2013 1      41.666667  36.700000
      2      25.666667  36.566667
 2014 1      34.666667  38.233333
      2      46.000000  36.633333)

In [96]:
# Quick constructor for demo frames
def make_df(cols, ind):
    """Build a small DataFrame whose cells spell out their own position.

    Each cell holds the column label followed by the row label, so
    ``make_df('AB', [0, 1])`` contains ``A0, A1`` in column ``A`` and
    ``B0, B1`` in column ``B``.

    Parameters
    ----------
    cols : iterable
        Column labels (a string is iterated character by character).
    ind : iterable
        Row labels; also used as the index of the frame.

    Returns
    -------
    pandas.DataFrame
    """
    cells = {}
    for label in cols:
        cells[label] = [f"{label!s}{row!s}" for row in ind]
    return pd.DataFrame(cells, ind)

# Example frame (not displayed: only a cell's last expression is echoed)
make_df('ABC', range(3))

df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])

# concat aligns on the labels of the other axis: shared labels are merged,
# labels found in only one input get their own row/column, padded with NaN.
for frame in (
    df3,
    df4,
    pd.concat([df3, df4], axis=0),
    pd.concat([df3, df4], axis=1),
):
    print(frame)

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
     A    B    C    D
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
0  NaN  NaN   C0   D0
1  NaN  NaN   C1   D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [99]:
## To forbid duplicate index labels, set verify_integrity=True; concat then raises on duplicates.
## When the index itself does not matter, pass ignore_index=True so it is discarded
## and the result gets a fresh integer index.
df4 = make_df('AB', [0, 1]) 
pd.concat([df3, df4], ignore_index=True) 

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


In [100]:
## ! Another way to handle duplicate index labels: pass keys= to label each
## source, so the result carries a multi-level index.
print(pd.concat([df3, df4], keys=['x', 'y'])) 

      A   B
x 0  A0  B0
  1  A1  B1
y 0  A0  B0
  1  A1  B1


In [108]:
# By default concat takes the union of the input columns (join='outer');
# join='inner' keeps only the columns common to all inputs.
df3 = make_df('AB', [0, 1])
df4 = make_df('BC', [0, 1])
for join_mode in ('inner', 'outer'):
    print(pd.concat([df3, df4], join=join_mode))

    B
0  B0
1  B1
0  B0
1  B1
     A   B    C
0   A0  B0  NaN
1   A1  B1  NaN
0  NaN  B0   C0
1  NaN  B1   C1


In [None]:
# Pandas 的 append() 不直接更新原有对象的值，而是为合并后的数据创建一个新对象。
# 因此，它不能被称之为一个非常高效的解决方案，因为每次合并都需要重新创建索引和数据缓存。
# 总之，如果你需要进行多个 append 操作，还是建议先创建一个 DataFrame 列表，
# 然后用 concat() 函数一次性解决所有合并任务。
# 注意：DataFrame.append() / Series.append() 自 pandas 1.4 起已弃用，并在 pandas 2.0 中被移除，请直接使用 pd.concat()。

In [6]:
import numpy as np
# Two independent (3, 2, 3) arrays of uniform random samples, drawn in order a, b.
a, b = (np.random.rand(3, 2, 3) for _ in range(2))
# Element-wise product (same shape as the operands).
pro = np.multiply(a, b)
a[0], b[0], pro[0]

(array([[0.55888651, 0.1926415 , 0.05577748],
        [0.7258802 , 0.10407722, 0.54523658]]),
 array([[0.83495295, 0.51410551, 0.61511304],
        [0.1499736 , 0.62341923, 0.67347409]]),
 array([[0.46664394, 0.09903806, 0.03430945],
        [0.10886287, 0.06488374, 0.36720271]]))