# Pandas
	以记录为数据单元，记录的序列为操作单元的数据处理平台
	支持索引，多键值索引下的表间运算（合并）（类似字典） 
	支持按行的向量广播计算，按属性的聚合运算
	索引值通常唯一（但pandas也允许重复索引，例如concat之后同一个索引可以对应多个值），属性值可重复，属性值可实现表间的连接计算

In [2]:
import pandas as pd
# A record is the unit of data; the Series `data` is the unit pandas operates on.
data = pd.Series([step * 0.1 for step in range(10)])
# Show the Series, its index, its raw values and the index type, in that order.
for view in (data, data.index, data.values, type(data.index)):
    print(view)

0    0.0
1    0.1
2    0.2
3    0.3
4    0.4
5    0.5
6    0.6
7    0.7
8    0.8
9    0.9
dtype: float64
RangeIndex(start=0, stop=10, step=1)
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
<class 'pandas.core.indexes.range.RangeIndex'>


In [3]:
data = pd.Series([1, 3, 2], index=['a', 'b', 'c'])
print(data)
print(data.sort_values())  # returns a sorted copy; data itself is not modified
print(data['b'])  # dict-style lookup by label (what loc does)
# Positional access goes through iloc. data[0] on a string-labelled Series
# relied on a positional fallback of [] that pandas has deprecated.
print(data.iloc[0], data.iloc[1], data.iloc[2])


a    1
b    3
c    2
dtype: int64
a    1
c    2
b    3
dtype: int64
3
1 3 2


In [4]:
data = pd.Series([1, 3, 2], index=[1, 5, 2])
# With an integer index, a scalar key inside [] is matched against the labels.
for label in (1, 2, 5):
    print(data[label])
print(data[1:3])  # a slice returns a sub-Series (selected by position here)
values = data.values
print(values)
print(type(values))  # the values of a pandas Series are a NumPy ndarray

1
2
3
5    3
2    2
dtype: int64
[1 3 2]
<class 'numpy.ndarray'>


In [5]:
# A condition on the Series yields a boolean mask that selects elements.
mask = data > 1
print(data[mask])
print(data[mask] * 2)  # arithmetic applies to the selected sub-Series

5    3
2    2
dtype: int64
5    6
2    4
dtype: int64


# loc, iloc, ix
	可以显式地使用loc和iloc来操作
	默认情况下，如果index是整数，data[i]（单个键）按标签查找，相当于loc；但切片data[i:j]仍然按位置，相当于iloc
### 最好显式使用
	loc 在index的标签上进行索引,范围包括start和end.
	iloc 在index的位置上进行索引,不包括end.
	ix 先在index的标签上索引，索引不到就在index的位置上索引(如果index非全整数),不包括end.（注意：ix自pandas 0.20起已弃用，并在pandas 1.0中被移除，请改用loc或iloc）

In [6]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data[1])  # integer index: a scalar key is a label, same as loc
print(data[1:3])  # ...but an integer slice is positional
print(data.loc[1])  # lookup by index label
print(data.iloc[1])  # lookup by position, like a Python sequence
print("*"*20)
data = pd.Series([1, 3, 5], index=['a', 'b', 'c'])
# data[1] used to fall back to position here because 1 is not a label. That
# fallback of [] is deprecated in pandas, so the position is requested explicitly.
print(data.iloc[1])
print(data[1:3])  # integer slices stay positional
# data.loc[1] would raise KeyError: 1 is not one of the labels 'a', 'b', 'c'.
print(data.iloc[1])  # lookup by position, like a Python sequence

a
3    b
5    c
dtype: object
a
b
********************
3
b    3
c    5
dtype: int64
3


In [7]:
print(1 in data)  # `in` on a Series tests the index labels, so this prints False here
print(1 in data.values)  # test against the values explicitly to find the element 1

False
True


In [8]:
# Iterating a Series walks its values, the same as iterating data.values.
for value in data:
    print(value, end=' ')
print()
for value in data.values:
    print(value, end=' ')
print()
# The labels are reached through data.index.
for label in data.index:
    print(label, end=' ')

1 3 5 
1 3 5 
a b c 

In [51]:
sdata = {'a': 1, 'c': 3, 'b': 2}  # a dict used as the lookup table
s1 = pd.Series(sdata)
states = ['a', 'b', 'd']
# Each label is looked up in the dict; a label that is missing ('d') becomes NaN.
s2 = pd.Series(sdata, index=states)
s3 = s1 + s2
s4 = s1 - s2
# For s3 and s4 the operands are aligned on their labels before the arithmetic.
for series in (s1, s2, s3, s4):
    print(series)
stacked = pd.concat([s1, s2])  # concat appends rows; unlike +, nothing is combined
print(stacked)
print(stacked['a'])  # one label now maps to two values


a    1
c    3
b    2
dtype: int64
a    1.0
b    2.0
d    NaN
dtype: float64
a    2.0
b    4.0
c    NaN
d    NaN
dtype: float64
a    0.0
b    0.0
c    NaN
d    NaN
dtype: float64
a    1.0
c    3.0
b    2.0
a    1.0
b    2.0
d    NaN
dtype: float64
a    1.0
a    1.0
dtype: float64


# Merge

In [57]:
# Two tables that share the 'employee' column.
s1 = pd.DataFrame({'employee': list('ABCD'), 'group': list('abcd')})
s2 = pd.DataFrame({'employee': ['B', 'A', 'C', 'D'], 'age': ['12', '14', '12', '10']})
# merge joins on the shared column; the defaults are spelled out here.
s3 = pd.merge(left=s1, right=s2, how='inner', on='employee')
print(s3)

  employee group age
0        A     a  14
1        B     b  12
2        C     c  12
3        D     d  10


# Apply + 函数

In [63]:
# Demonstrates groupby iteration, groupby.apply and Series.apply.
s = pd.DataFrame({'A':['a', 'b', 'c', 'a', 'c'], 'B':[1, 2, 3, 4, 5]})
print(s)
grouped = s.groupby('A')  # split the rows into groups by the value of column 'A'
# Iterating the groupby object yields (group key, sub-frame) pairs.
for name, group in grouped:
	print(name)
	print(group)

# Keep the first row of every group.
# NOTE(review): whether apply passes the grouping column 'A' to the function
# appears to depend on the pandas version - confirm against the recorded output.
d = grouped.apply(lambda x:x.head(1))
print(d)
# Series.apply runs the function on each element; the result becomes column 'C'.
s['C']=s['A'].apply(lambda x:x+'@@@@')
print(s)

   A  B
0  a  1
1  b  2
2  c  3
3  a  4
4  c  5
a
   A  B
0  a  1
3  a  4
b
   A  B
1  b  2
c
   A  B
2  c  3
4  c  5
     A  B
A        
a 0  a  1
b 1  b  2
c 2  c  3
   A  B      C
0  a  1  a@@@@
1  b  2  b@@@@
2  c  3  c@@@@
3  a  4  a@@@@
4  c  5  c@@@@


In [10]:
data1 = pd.Series([1, 2, 5], index=['a', 'b', 'c'])
data2 = pd.Series([3, 4], index=['c', 'd'])
# The columns are aligned on the union of both indexes; gaps are filled with NaN.
frame = pd.DataFrame({"data1": data1, "data2": data2})
print(frame)
print(frame['data1'])  # column access by key...
print(frame.data1)  # ...or as an attribute
frame['new_data'] = frame['data1'] - frame['data2']
print(frame)
print(frame.T)
# loc takes the row filter first and the columns to show second; it works on labels.
rows = frame.new_data == 2
columns = ['data1', 'data2', 'new_data']
print(frame.loc[rows, columns])
frame.iloc[0, 2] = 1111  # iloc accepts a (row, column) pair of positions
print(frame)

   data1  data2
a    1.0    NaN
b    2.0    NaN
c    5.0    3.0
d    NaN    4.0
a    1.0
b    2.0
c    5.0
d    NaN
Name: data1, dtype: float64
a    1.0
b    2.0
c    5.0
d    NaN
Name: data1, dtype: float64
   data1  data2  new_data
a    1.0    NaN       NaN
b    2.0    NaN       NaN
c    5.0    3.0       2.0
d    NaN    4.0       NaN
            a    b    c    d
data1     1.0  2.0  5.0  NaN
data2     NaN  NaN  3.0  4.0
new_data  NaN  NaN  2.0  NaN
   data1  data2  new_data
c    5.0    3.0       2.0
   data1  data2  new_data
a    1.0    NaN    1111.0
b    2.0    NaN       NaN
c    5.0    3.0       2.0
d    NaN    4.0       NaN


In [11]:
print(frame.index)  # first axis: the row labels
print(frame.columns)  # second axis: the column labels
for column_name in frame.columns:
    print(frame[column_name])  # print each column on its own

Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['data1', 'data2', 'new_data'], dtype='object')
a    1.0
b    2.0
c    5.0
d    NaN
Name: data1, dtype: float64
a    NaN
b    NaN
c    3.0
d    4.0
Name: data2, dtype: float64
a    1111.0
b       NaN
c       2.0
d       NaN
Name: new_data, dtype: float64


In [12]:
# Build a frame from a list of dicts: one dict per row, the keys become columns.
data = [dict(a=i, b=i) for i in range(3)]
print(data)
print(pd.DataFrame(data))

[{'a': 0, 'b': 0}, {'a': 1, 'b': 1}, {'a': 2, 'b': 2}]
   a  b
0  0  0
1  1  1
2  2  2


In [13]:
# Build a frame from a 2-D NumPy array, naming the columns and the rows.
# The values are random, so the displayed numbers differ on every run.
import numpy as np
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.986866,0.395466
b,0.037625,0.071548
c,0.935548,0.987789


In [14]:
# NumPy ufuncs can be applied to a frame directly.
frame = pd.DataFrame(np.random.rand(3, 4), columns=list('abcd'), index=list('ABC'))
print(frame)
# Broadcasting: element-wise functions and scalar arithmetic keep the labels.
print(np.sin(frame))
print(frame / 4)
first_row = frame.iloc[0]
df = frame - first_row  # the row is broadcast against every row of the frame
print(df)

          a         b         c         d
A  0.775817  0.953420  0.267237  0.430763
B  0.364259  0.941135  0.796725  0.666924
C  0.573409  0.898694  0.785494  0.660804
          a         b         c         d
A  0.700299  0.815400  0.264068  0.417564
B  0.356257  0.808227  0.715071  0.618572
C  0.542499  0.782514  0.707175  0.613752
          a         b         c         d
A  0.193954  0.238355  0.066809  0.107691
B  0.091065  0.235284  0.199181  0.166731
C  0.143352  0.224673  0.196374  0.165201
          a         b         c         d
A  0.000000  0.000000  0.000000  0.000000
B -0.411558 -0.012285  0.529488  0.236161
C -0.202407 -0.054726  0.518257  0.230042


#  支持+,-,*,/,//,%,**
运算前会按索引自动对齐，只在一方出现的位置用NaN补足

In [15]:
# Arithmetic aligns both frames on the union of their labels (an outer join);
# cells that exist in only one of the frames come out as NaN.
frame1 = pd.DataFrame(np.random.rand(3, 4), columns=list('abcd'), index=list('ABC'))
frame2 = pd.DataFrame(np.random.rand(2, 3), columns=list('bcd'), index=list('BC'))
frame3 = frame1 + frame2
frame3

Unnamed: 0,a,b,c,d
A,,,,
B,,1.455249,1.73834,1.517386
C,,0.851711,0.810822,1.124408


In [16]:
index1 = pd.Index([1, 2, 3, 4, 5])
index2 = pd.Index([2, 3, 5, 7, 11])
# Use the named set methods: in current pandas the &, | and ^ operators on an
# Index are element-wise logical operations, not set operations.
common = index1.intersection(index2)
either = index1.union(index2)
exclusive = index1.symmetric_difference(index2)
print(common)  # labels present in both
print(either)  # labels present in at least one
print(exclusive)  # labels present in exactly one

Int64Index([2, 3, 5], dtype='int64')
Int64Index([1, 2, 3, 4, 5, 7, 11], dtype='int64')
Int64Index([1, 4, 7, 11], dtype='int64')


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [22]:
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
data = pd.Series([1, np.nan, 'a', None])
print(data.isnull())  # both NaN and None count as missing
print(data[data.isnull()])
print(data[data.notnull()])
print(data.fillna(0))  # replace every missing value with 0
# Forward fill: each missing value takes the previous element. ffill() replaces
# the deprecated fillna(method='ffill') spelling.
print(data.ffill())

0    False
1     True
2    False
3     True
dtype: bool
1     NaN
3    None
dtype: object
0    1
2    a
dtype: object
0    1
1    0
2    a
3    0
dtype: object
0    1
1    1
2    a
3    a
dtype: object


# 层次-组合 索引（Hierarchical-Indexing）


In [42]:
# Hierarchical index: each value is addressed by a pair of keys.
index = pd.MultiIndex.from_tuples([('a', 1), ('b', 2), ('c', 1)])
data = [1, 2, 3]
s = pd.Series(data, index=index)
print(s)
print(s[('c', 1)])  # a full key pair selects a single value
print(s[:, 1])  # any first-level key, second-level key equal to 1


# A list of parallel arrays passed as the index also gives a two-level index.
index = [[1, 1, 2, 3], [1, 2, 1, 2]]
popu = [5, 6, 7, 8]
pop = pd.Series(popu, index=index)
print(pop)
print(pop[:, 2])  # keeps the entries whose second-level key is 2

a  1    1
b  2    2
c  1    3
dtype: int64
3
a    1
c    3
dtype: int64
1  1    5
   2    6
2  1    7
3  2    8
dtype: int64
1    6
3    8
dtype: int64


In [44]:
# Pivot the innermost index level into columns: the Series becomes a DataFrame.
s_ = s.unstack(level=-1)
print(s_)
print(type(s_))

     1    2
a  1.0  NaN
b  NaN  2.0
c  3.0  NaN
<class 'pandas.core.frame.DataFrame'>


In [47]:
# stack() folds the columns back into an inner index level. The explicit dropna()
# removes the NaN cells that unstack() created, so the round trip returns only the
# original entries whether or not the installed pandas drops them inside stack().
print(s_.stack().dropna())

a  1    1.0
b  2    2.0
c  1    3.0
dtype: float64

In [48]:
# A frame whose rows are addressed by a two-level index (outer letter, inner number).
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.457505,0.802522
a,2,0.595729,0.22305
b,1,0.535122,0.422217
b,2,0.595524,0.837674


# 数据处理