In [66]:
# -*- coding:utf-8 -*-

import numpy as np
import pandas as pd

# Let a single cell display multiple outputs (every expression statement, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

## pandas 数据结构介绍
> 书中讲述的较为简陋，具体的参考jupyter中[pandas test文档](http://localhost:8888/tree/1%20Data%20Manager/2%20Pandas%20Test)

### Series 
Series是一种类似于一维数组的对象，它由一组数据（各种Numpy数据类型）以及一组与之相关的数据标签（即索引）组成。
 - 1: series可以看做一个字典，索引为key，值为value;只不过不同的是，key值可以重复（索引可以重复）   
 - 2: Series在算术运算中，会自动对齐不同索引的数据（两个series相加， 会根据series的index进行匹配,如果没有匹配，返回NaN值）
 - 3: series 本身有name、 其index也有name 

### DataFrame 
DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数值、字符串、布尔值等）。DataFrame既有行索引也有列索引，它可以被看做由Series组成的字典（共用同一个索引）

- 1、如果df已经定义，可以使用pd.DataFrame重定义
    (其实就是利用已有的df，对columns进行重新排序;前提还是那些columns 只不过顺序不同)  
- 2、将series赋值给df的某个列时，还是根据index进行匹配；（所以，新增列或列值改变，还是使用list等其他数据结构）    
- 3、列的删除使用del（改变原始数据、慎用），更多的还是使用drop函数  
- 4、df.values会返回一个二维的ndarray的类型，由于不同列的类型可能不同，保持数据类型向上兼容机制（保留范围最大的）

In [7]:
# Series basics: construct, inspect, select by label, add with index alignment.
ser1 = pd.Series([4, 7, -5, 3], index=list('bcad'))
ser1.values
ser1.index

# A list of labels selects a sub-Series in the requested order.
ser1[['a', 'c']]

ser2 = pd.Series([4, 1, 3, 2], index=list('acdf'))

ser1
ser2

# Addition aligns on index labels; labels found in only one operand become NaN.
ser1 + ser2

array([ 4,  7, -5,  3], dtype=int64)

Index(['b', 'c', 'a', 'd'], dtype='object')

a   -5
c    7
dtype: int64

b    4
c    7
a   -5
d    3
dtype: int64

a    4
c    1
d    3
f    2
dtype: int64

a   -1.0
b    NaN
c    8.0
d    6.0
f    NaN
dtype: float64

In [12]:
# DataFrame basics: build from a dict of equal-length lists.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

frame

# Seeded random integers so the demo frame is reproducible.
np.random.seed(123456)
dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(
    np.random.randint(0, 100, size=(6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

# Re-wrapping an existing frame with `columns=` reorders the columns by label;
# each column keeps its own data.
pd.DataFrame(df, columns=list('DBAC'))

# `del` removes the column from df in place.
del df['A']
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


Unnamed: 0,A,B,C,D
2017-01-01,65,49,56,43
2017-01-02,43,91,32,87
2017-01-03,36,8,74,10
2017-01-04,12,75,20,47
2017-01-05,50,86,34,14
2017-01-06,70,42,66,47


Unnamed: 0,D,B,A,C
2017-01-01,43,49,65,56
2017-01-02,87,91,43,32
2017-01-03,10,8,36,74
2017-01-04,47,75,12,20
2017-01-05,14,86,50,34
2017-01-06,47,42,70,66


Unnamed: 0,B,C,D
2017-01-01,49,56,43
2017-01-02,91,32,87
2017-01-03,8,74,10
2017-01-04,75,20,47
2017-01-05,86,34,14
2017-01-06,42,66,47


### 索引对象  
pandas的索引对象负责管理轴标签和其他元数据

- 索引是不可更改的，任何对索引的操作都是产生一个新的索引
- 索引可以包含重复的标签
- 索引可以看做为一个独立对象，在此基础上既可以进行增删改查等
- 索引常见的方法如下：![Index_Function](./index_func.jpg "Index 的常见函数")


In [15]:
# Working with Index objects
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
index

# An Index can be built on its own and then handed to a Series.
labels = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

Index(['a', 'b', 'c'], dtype='object')

0    1.5
1   -2.5
2    0.0
dtype: float64

## 基本功能 
### 重新索引 
reindex函数不会替换原来的索引值，只是按照给定的新索引对数据进行重排，用于**调整数据行、列顺序 或者 对行进行插值**；使用场景较为有限   
> 如果新索引中包含原来不存在的标签，会将该行置为NaN；也可以通过参数method指定填充方式      

![reindex 函数参数](./reindex.jpg) 

In [16]:
# Reindexing rearranges the data to follow the new labels; 'e' has no source value.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj2 = obj.reindex(list('abcde'))

obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

### 丢弃指定轴上的项
主要使用drop方法，drop方法返回的是一个在指定轴上删除了指定值的新对象;默认按行删除；
```python
drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
```

In [18]:
# drop returns a new object; the original Series is left untouched.
obj = pd.Series(np.arange(5.), index=list('abcde'))
new_obj = obj.drop('c')
new_obj
obj.drop(labels=['d', 'c'])

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

a    0.0
b    1.0
e    4.0
dtype: float64

### 索引、选取和过滤
Series索引（obj[...]）的工作方式类似于NumPy数组的索引，只不过Series的索引值不只是整数； 
> 主要是注意行列索引的选取、切片、布尔型过滤

In [24]:
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)

# Slice rows
data[:2]
# Select columns by a list of labels
data[['three', 'one']]
# Boolean mask with assignment: every value below 5 is overwritten with 100
data[data < 5] = 100
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
NewYork,14,12


Unnamed: 0,one,two,three,four
Ohio,100,100,100,100
Colorado,100,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


### 用loc和iloc进行选取
> 使用轴标签（loc）或整数索引（iloc），从DataFrame选择行和列的子集。可以直接根据列名选取数据，但是在新版本的pandas不建议这么使用；

![df数据选取](./datasets/df_chose_data.png)

In [27]:
# loc selects by row/column labels
data
data.loc['Colorado', ['two', 'three']]

# iloc selects by integer position
data.iloc[2, [3, 0, 1]]

# Both accept a single label/position as well as slices
data.loc[:'Utah', 'two']
data.iloc[:, :3][data['three'] > 5]

Unnamed: 0,one,two,three,four
Ohio,100,100,100,100
Colorado,100,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


two      5
three    6
Name: Colorado, dtype: int32

four    11
one      8
two      9
Name: Utah, dtype: int32

Ohio        100
Colorado      5
Utah          9
Name: two, dtype: int32

Unnamed: 0,one,two,three
Ohio,100,100,100
Colorado,100,5,6
Utah,8,9,10
NewYork,12,13,14


### 整数索引
对于整数索引，均可以使用loc和iloc进行获取数据，但是两者的含义不同

In [28]:
# With an integer index, label slicing (loc) includes the end label,
# while positional slicing (iloc) excludes the end position.
ser = pd.Series(np.arange(3, dtype=float))
ser[:1]
ser.loc[:1]
ser.iloc[:1]

0    0.0
dtype: float64

0    0.0
1    1.0
dtype: float64

0    0.0
dtype: float64

### 算术运算和数据对齐 
pandas最重要的一个功能是，它可以对不同索引的对象进行算术运算。在将对象相加时，如果存在不同的索引对，则结果的索引就是该索引对的并集

In [29]:
# The result index is the union of both indexes; unmatched labels give NaN.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [30]:
# Alignment happens on both rows and columns; only cells present in both frames get a value.
df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### 在算术方法中填充值   
方法|说明
--|--
add,radd|加法
sub,rsub|减法
div,rdiv|除法
floordiv,rfloordiv|地板除（整除）
mul,rmul|乘法
pow,rpow|幂运算

In [33]:
df1 = pd.DataFrame(np.arange(12.).reshape(3, 4), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape(4, 5), columns=list('abcde'))

# Punch one hole into df2 so the effect of fill_value is visible.
df2.loc[1, 'b'] = np.nan
# Plain addition: cells missing from either operand come out as NaN.
df1 + df2
# With fill_value=0 a cell missing from one side is treated as 0 before adding.
df1.add(df2, fill_value=0)
# Reflected division: 1 / df1
df1.rdiv(1)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


### DataFrame和Series之间的运算
DataFrame与Series之间运算时会进行广播：按照axis指定的方向，将Series扩充到DataFrame的各行或各列

In [35]:
# NumPy broadcasting: the first row is subtracted from every row.
arr = np.arange(12.).reshape(3, 4)
arr - arr[0]

frame = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

# Same idea in pandas: the first row (a Series indexed by column label)
# is subtracted from every row of the frame.
series = frame.iloc[0]
frame - series

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


### 函数应用和映射 
NumPy的ufuncs（元素级数组方法）也可用于操作pandas对象

In [38]:
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

# NumPy ufuncs work element-wise on pandas objects and keep the labels.
np.abs(frame)


def f(x):
    """Return the spread (max minus min) of a Series."""
    return x.max() - x.min()


frame.apply(f)                  # one spread per column
frame.apply(f, axis='columns')  # one spread per row

# Format every element as a two-decimal string.
# The formatter is passed inline instead of being bound to the name `format`,
# which would shadow the builtin for the rest of the kernel session.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1, so use
# whichever one the installed version provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
elementwise(lambda x: '%.2f' % x)

Unnamed: 0,b,d,e
Utah,0.709661,1.669052,1.037882
Ohio,1.705775,0.919854,0.042379
Texas,1.247642,0.00992,0.290213
Oregon,0.495767,0.362949,1.548106


b    2.953418
d    2.588906
e    1.590484
dtype: float64

Utah      2.378713
Ohio      1.663397
Texas     1.257562
Oregon    1.185157
dtype: float64

Unnamed: 0,b,d,e
Utah,-0.71,1.67,1.04
Ohio,-1.71,-0.92,-0.04
Texas,1.25,-0.01,0.29
Oregon,0.5,0.36,1.55


### 排序和排名
- sort_index 根据索引进行排序
```python
sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None)
```
- sort_values 根据值进行排序
```python
sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
```
- rank 通过“为各组分配一个平均排名”的方式破坏平级关系
```python
rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False)
```

In [45]:
# Sorting a Series by value: missing values end up at the end.
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

# Sorting a frame by several columns: 'a' first, ties resolved by 'b'.
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by=['a', 'b'])

# Ranking: tied values share the average of the ranks they span.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank(method='average')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

### 带有重复标签的轴索引
标签内容不是递增整数，是可以重复的内容；该种情况比较少，就算有也要处理为索引唯一
- 使用is_unique函数判断是否唯一
- 返回值根据数量分为series或者一个标量值
- 此时再结合索引下标进行二次数据选取

In [47]:
# An index may hold repeated labels; is_unique reports whether it does not.
obj = pd.Series(range(5), index=list('aabbc'))
obj.index.is_unique

# Selecting a repeated label returns every matching row.
df = pd.DataFrame(np.random.randn(4, 3), index=list('aabb'))
df.loc['b']

False

Unnamed: 0,0,1,2
b,0.017587,-0.016692,-0.575247
b,0.254161,-1.143704,0.215897


## 汇总和计算描述统计 
主要根据axis的指向进行行列统计计算；另外需要注意，多层次索引的汇总统计是根据level参数指定的
- NA值会自动被排除，通过skipna选项可以禁用该功能
- idxmin和idxmax属于间接统计，返回的是最小值、最大值所在的索引

![DataFrame的统计函数](./datasets/df_func.png)

In [48]:
# Small frame with missing values, built column by column.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)

# Row-wise sum; NaN entries are skipped.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

### 相关系数和协方差
> 该部分数据需要联网下载，但是无法翻墙，使用其他数据进行类似统计 

- pct_change 变化率
```python
pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)
```

- corr、cov 相关系数与协方差 (列与列之间、df之间、df与ser之间-corrwith)

In [67]:
# Load the volume data; the first (unnamed) CSV column holds the dates.
df = (
    pd.read_csv('./examples/volume.csv', sep=',')
    .rename(columns={'Unnamed: 0': 'date'})
)
# pd.to_datetime replaces the unit-less `astype(np.datetime64)` cast,
# which newer pandas releases reject.
df['date'] = pd.to_datetime(df['date'])
# Use the parsed dates as the row index. Assigning the result instead of
# mutating in place keeps each step explicit.
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-02-01,2185600.0,4193200.0,14457600.0,6903600.0,5942400.0,89193600.0,2954400.0,154580000.0,2916400.0
1990-02-02,3103200.0,4248800.0,15302400.0,6064400.0,4732800.0,71395200.0,2424000.0,164400000.0,4250000.0
1990-02-05,1792800.0,3653200.0,9134400.0,5299200.0,3950400.0,59731200.0,2225400.0,130950000.0,5880800.0
1990-02-06,2205600.0,2640000.0,14389200.0,10808000.0,3761600.0,81964800.0,3270000.0,134070000.0,4750800.0
1990-02-07,3592800.0,11180800.0,18704400.0,12057600.0,5458400.0,134150400.0,4332600.0,186710000.0,4124800.0


In [70]:
# Relative change of every column from one row to the next.
res = df.pct_change(periods=1)
res.tail(5)

Unnamed: 0_level_0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-10-10,-0.341357,-0.175402,-0.143578,-0.15459,-0.425317,-0.207167,-0.243268,-0.203137,-0.132273
2011-10-11,0.851029,0.37038,-0.180845,-0.078533,0.07386,-0.071483,0.714431,-0.005015,-0.126791
2011-10-12,1.007081,0.027617,0.342517,0.001445,0.228925,0.351917,0.34441,0.210388,0.245038
2011-10-13,-0.612522,-0.316514,-0.253101,-0.175902,-0.170025,-0.165104,-0.500783,-0.171621,-0.12135
2011-10-14,-0.102525,0.347354,-0.021477,0.220267,-0.141664,0.162566,0.268509,-0.072038,-0.085449


In [74]:
# Correlation and covariance between two single columns
res['AA'].corr(res['GE'])
res['AA'].cov(res['GE'])

0.23864803610688046

0.06594345366010024

In [76]:
# Full correlation and covariance matrices across all columns of the frame
res.corr(method='pearson')
res.cov()

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
AA,1.0,0.111456,0.238648,0.213539,0.221829,0.175389,0.197263,0.199378,0.231126
AAPL,0.111456,1.0,0.129822,0.150959,0.099118,0.1049,0.144051,0.133293,0.132127
GE,0.238648,0.129822,1.0,0.287316,0.272412,0.207238,0.275259,0.257867,0.344388
IBM,0.213539,0.150959,0.287316,1.0,0.228255,0.24803,0.245078,0.288018,0.273444
JNJ,0.221829,0.099118,0.272412,0.228255,1.0,0.191703,0.239998,0.218471,0.29182
MSFT,0.175389,0.1049,0.207238,0.24803,0.191703,1.0,0.197872,0.239275,0.171448
PEP,0.197263,0.144051,0.275259,0.245078,0.239998,0.197872,1.0,0.228018,0.250901
SPX,0.199378,0.133293,0.257867,0.288018,0.218471,0.239275,0.228018,1.0,0.247291
XOM,0.231126,0.132127,0.344388,0.273444,0.29182,0.171448,0.250901,0.247291,1.0


Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
AA,0.338392,0.066576,0.065943,0.058154,0.07466,0.058326,0.060864,0.049286,0.061279
AAPL,0.066576,1.054405,0.063322,0.072569,0.058887,0.061579,0.078456,0.058163,0.061837
GE,0.065943,0.063322,0.225636,0.063893,0.074867,0.056276,0.069351,0.052052,0.07456
IBM,0.058154,0.072569,0.063893,0.219169,0.061826,0.066381,0.060856,0.057299,0.058346
JNJ,0.07466,0.058887,0.074867,0.061826,0.334751,0.063408,0.07365,0.053714,0.076953
MSFT,0.058326,0.061579,0.056276,0.066381,0.063408,0.326816,0.059999,0.058128,0.044672
PEP,0.060864,0.078456,0.069351,0.060856,0.07365,0.059999,0.281326,0.051394,0.060654
SPX,0.049286,0.058163,0.052052,0.057299,0.053714,0.058128,0.051394,0.18058,0.047895
XOM,0.061279,0.061837,0.07456,0.058346,0.076953,0.044672,0.060654,0.047895,0.207732


In [77]:
# Correlate every column of the frame with one Series via corrwith;
# a manual loop over the columns would also work but is less efficient.
res.corrwith(res['AA'])

AA      1.000000
AAPL    0.111456
GE      0.238648
IBM     0.213539
JNJ     0.221829
MSFT    0.175389
PEP     0.197263
SPX     0.199378
XOM     0.231126
dtype: float64

### 唯一值、值计数以及成员资格
> 该部分书中给出的函数较少，需要多回顾pandas的api

- isin:series对象函数
- match：计算一个数组中各个值到另一个不同值数组之间的整数索引
- unique：唯一值
- value_counts：不同值的次数统计(最好按照顶级方法使用)

In [81]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
# Distinct values in order of first appearance
obj.unique()
# Occurrence count per value, most frequent first
obj.value_counts()
# Counting a raw array without sorting by frequency. The top-level
# pd.value_counts is deprecated in recent pandas, so wrap the array in a
# Series and use the method instead.
pd.Series(obj.values).value_counts(sort=False)
# Keep only the elements whose value is 'b' or 'c'
obj[obj.isin(['b', 'c'])]

array(['c', 'a', 'd', 'b'], dtype=object)

a    3
c    3
b    2
d    1
dtype: int64

b    2
d    1
c    3
a    3
dtype: int64

0    c
5    b
6    b
7    c
8    c
dtype: object

In [82]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})

# Count the occurrences of each value per column; values that never occur in
# a column come back as NaN and are replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts is
# deprecated in recent pandas.
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
