In [14]:
# -*-coding:utf8-*-

import numpy as np
import pandas as pd
np.random.seed(12345) # 示例中使用到随机数，设定种子 确保每次的值一致
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', 10) # 设定df全部显示的行数
pd.set_option('display.float_format', lambda x: '%.4f' % x) # 不使用科学计数法

# 一个cell做多个输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 处理缺失数据
pandas 使用np.nan表示缺失数据, 也可直接使用None关键字进行替换,但两者表示的含义不同

方法|说明
--|--
dropna| 根据各标签的值中是否存在缺失数据对轴标签进行过滤，可通过阀值调节对缺失值的容忍度
fillna|用指定值或插值方法（ffill或bfill）进行填充缺失数据
isnull|返回一个含有布尔值的对象，这些布尔值表示那些值是缺失值/NA,该对象的类型与源类型一致
notnull|isnull的否定

In [5]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan,'avocado'])
string_data[string_data.isnull()]
string_data[0] = None
string_data[string_data.isnull()]

2    NaN
dtype: object

0    None
2     NaN
dtype: object

### 滤除缺失数据
可以通过pandas.isnull 或者布尔索引，但dropna方法更常用;使用细节还需查看API
```python
help(pd.DataFrame.dropna) # series的用法与df相似
dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False)    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        按行或者按列删除，默认为行
    how : {'any', 'all'}, default 'any'    
        * 'any' : 含有一个就删除
        * 'all' : 全是NA才删除
    thresh : int, optional
        整数，删除指定数目NA的行或列
    subset : array-like, optional
        针对指定的列进行缺失值删除
    inplace : bool, default False
        是否替换原始数据
```

In [9]:
# 针对series 
from numpy import nan as NA
data = pd.Series([1,NA,3,NA,7])
data[data.notnull()] # 索引过滤 还需在重新赋值
data.dropna(inplace=True) # 方法过滤 指定参数
data

0   1.0000
2   3.0000
4   7.0000
dtype: float64

0   1.0000
2   3.0000
4   7.0000
dtype: float64

In [13]:
# 针对DataFrame，dropna会删除含NA值的所有行或列，默认axis=0 丢弃所有的行
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data
data.dropna()
data.dropna(axis=1, how='all') # 1 按列全部为NA的才删除

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
## 官网API示例
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
"toy": [np.nan, 'Batmobile', 'Bullwhip'],
"born": [pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT]})

df

# 1 默认按行 how=any删除
df.dropna()

# 2 指定按列删除
df.dropna(axis='columns')  # axis=1

# 3 how=all
df.dropna(how='all')

# 4 指定NA数目的删除
df.dropna(thresh=2)

# 5 指定区域删除
df.dropna(subset=['name', 'born'])

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


### 填充缺失值
主要使用 fillna 函数，具体参考其API用法    
**常用参数**：
- value : scalar定值, dict定列, Series, or DataFrame 指定填充    
- method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None 
- limit: 限制填充行列数
```python
help(pd.DataFrame.fillna) 
```

In [21]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data

# 1 指定值填充
data.fillna(0)

# 2 对不同的列指定值填充
data.fillna({1: 1111, 2: 2222})

# 3 指定方法填充
data.fillna(method='ffill') # 向后填充

# 4 特定值填充（更常用）
data.fillna(data.mean())

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,1111.0,2222.0
2,,1111.0,2222.0
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


## 数据转换
### 移除重复数据
使用内置函数drop_duplicated()
```python
drop_duplicates(self, subset=None, keep='first', inplace=False)   
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    inplace : boolean, default False
        Whether to drop duplicates in place or to return a copy
```


In [24]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
data

data.duplicated() # 给出重复值判断
data.drop_duplicates() # 删除重复值
data['v1'] = range(7)
data.drop_duplicates(['k1'])# 根据某一列删除重复值
data.drop_duplicates(['k1', 'k2'], keep='last')  # keep保留最后一个


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


### 利用函数或者映射进行数据转换
使用map方法今天数据替换（全部按列进行），后面还有applymap函数 以及和reduce函数的结合

In [26]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork','bacon','Pastrami', 'corned beef','Bacon','pastrami', 'honey ham', 'novalox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,novalox,6.0


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,novalox,6.0,


### 替换值
使用replace函数，尽管pd.DataFrame下有该函数，但是一般在pd.Series.str.replace使用更灵活    

该函数还支持指定列和正则替换（regex参数需开启）

In [28]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

data.replace({-999: np.nan, -1000: 0}) # 多值替换尽量使用字典参数

0   1.0000
1      nan
2   2.0000
3      nan
4   0.0000
5   3.0000
dtype: float64

In [30]:
df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
            'B': [5, 6, 7, 8, 9],
            'C': ['a', 'b', 'c', 'd', 'e']})

df

# 1 指定列的指定值 替换为一个值 
df.replace({'A': 0, 'B': 5}, 100)

# 2 指定列的多值替换
df.replace({'A': {0: 100, 4: 400}})

Unnamed: 0,A,B,C
0,0,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


Unnamed: 0,A,B,C
0,100,100,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


Unnamed: 0,A,B,C
0,100,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,400,9,e


In [31]:
df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
            'B': ['abc', 'bar', 'xyz']})
df

# 1 to_replace的正则替换 需开启regex参数
df.replace(to_replace=r'^ba.$', value='new', regex=True)

# 2 指定列的正则替换
df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)

Unnamed: 0,A,B
0,bat,abc
1,foo,bar
2,bait,xyz


Unnamed: 0,A,B
0,new,abc
1,foo,new
2,bait,xyz


Unnamed: 0,A,B
0,new,abc
1,foo,bar
2,bait,xyz


### 重命名轴索引
使用rename方法或者直接暴力命名；rename方法可以指定行列和单一改变，暴力命名必须给出全部轴索引名称


In [34]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

# 使用map暴力转换 
transform = lambda x: x[:4].upper()
data.index = data.index.map(transform)
data

# 使用rename转换
data.rename(index=str.title, columns=str.upper)

data.rename(index={'OHIO': 'INDIANA'},
    columns={'three': 'peekaboo'}) # 指定行列

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 离散化和面元划分
主要使用的是pd.cut函数,该函数需要bins的划分区间，可以通过right参数该表右边闭端的开放（默认是左开右闭）
```python
cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')
```  

此外还有pd.qcut函数，根据分位数来进行面元划分；定义整数为等分位数划分，也可根据自定义分位数区间进行划分
```python
qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
```


In [39]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
cats.codes # 类别排序属性 从0开始
cats.categories # 类别值  IntervalIndex有序索引类型  

# 类别命名 ，长度最好与bins要一致
group_names = ['Youth', 'YoungAdult', 'MiddleAged','Senior']
pd.cut(ages, bins, labels=group_names)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [40]:
# 若传入的bins为数值，会将series均分为指定的组数
data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # precision=2 限定小数位数只有2位

[(0.73, 0.96], (0.25, 0.49], (0.0074, 0.25], (0.0074, 0.25], (0.49, 0.73], ..., (0.49, 0.73], (0.73, 0.96], (0.73, 0.96], (0.73, 0.96], (0.49, 0.73]]
Length: 20
Categories (4, interval[float64]): [(0.0074, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.96]]

### 监测和过滤异常值
书中该部分讲述的还是数据的条件过滤，异常值的条件都是自定义的，以后从其他方面补充异常值的检测和过滤

In [46]:
data = pd.DataFrame(np.random.randn(1000, 4))

data.describe()

data[(np.abs(data) > 3).any(1)]  # any(1) 行或列中包含一个即可

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.0173,0.0573,-0.0015,-0.002
std,0.9707,1.0023,1.0063,0.9699
min,-2.6764,-3.3985,-3.0226,-3.1677
25%,-0.6655,-0.6145,-0.6656,-0.6534
50%,-0.0586,0.0674,0.0171,-0.0011
75%,0.6428,0.6716,0.6197,0.6585
max,2.9326,3.0691,3.1995,3.0412


Unnamed: 0,0,1,2,3
48,-2.2737,-0.366,-3.0226,0.11
327,0.8886,-3.3985,0.5615,-0.9955
339,0.0899,-3.0989,-0.2817,-0.9189
523,0.6444,3.0691,1.0858,1.3552
558,0.1064,1.0883,3.1995,-3.1677
745,-0.8381,-3.0496,-1.5162,0.5652
786,-1.2352,0.724,3.0809,0.839
800,1.4067,-1.4174,-0.9875,3.0412


### 排列和随机采样
1、利用numpy.random.permutation函数可以轻松实现对Series或DataFrame的列的排列工作（permuting，随机重排序）（默认按行）   
2、使用sample方法进行采样(默认按行)

In [50]:
# 随机排列
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

sampler = np.random.permutation(5)
sampler

df.take(sampler)

# 采样
df.sample(3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


array([4, 3, 2, 0, 1])

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7


Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


### 计算指标/哑变量
将分类变量（categorical variable）转换为“哑变量”或“指标矩阵”,使用虚拟度量转换[get_dummies方法](https://www.baidu.com/link?url=e9wYNcGv7pItxaO2vz-wf-_322aADTEo3-q4PF-JEn4VF5FidhDbd_y4zvt1BFxmECc9B2iRIwht-JJTnilQa61BdNQ8B54WmJhKXqIxdvq&wd=&eqid=ab7c31cb0003f075000000025c7f3b7a)  

```python
get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
```

In [55]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})

pd.get_dummies(df['key'])

dummies = pd.get_dummies(df['key'], prefix='key')

dummies

df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [56]:
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,1,0,0
4,1,0,0,0,0
5,0,0,1,0,0
6,1,0,0,0,0
7,0,0,0,0,1
8,0,0,0,1,0
9,0,0,1,0,0


## 字符串操作
### 字符串对象方法
内置方法基本上可满足简需求，复杂的自定义后使用apply/applymap函数适用于整个数据

In [57]:
val = 'a,b,  guido'
val.split(',')

pieces = [x.strip() for x in val.split(',')]
pieces

first, second, third = pieces
first + '::' + second + '::' + third
'::'.join(pieces)

'guido' in val
val.index(',')
val.find(':')

val.replace(',', '::')
val.replace(',', '')


['a', 'b', '  guido']

['a', 'b', 'guido']

'a::b::guido'

'a::b::guido'

True

1

-1

'a::b::  guido'

'ab  guido'

### 正则表达式
使用re模块，该部分是一个大内容，需要另外的详细补充；具体参考有道云笔记Python文件夹下的常用模块部分；  

> findall返回的是字符串中所有的匹配项，而search则只返回第一个匹配项。match更加严格，它只匹配字符串的首部 

In [59]:
import re

# 1 直接使用
text = "foo bar\t baz \tqux"
re.split('\s+', text)

# 2 构建正则对象（更常用）
regex = re.compile('\s+')
regex.split(text)

# 3 查找所有 findall/match/search
regex.findall(text)


['foo', 'bar', 'baz', 'qux']

['foo', 'bar', 'baz', 'qux']

[' ', '\t ', ' \t']

In [62]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

# 1 findall查找所有
regex.findall(text)

# match严格匹配
m = regex.search(text)
text[m.start():m.end()]

print(regex.match(text))

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

'dave@google.com'

None


### pandas的矢量化字符串函数
使用pd.Series.str.func系列函数进行字符串操作;     
常用字符串函数：cat/count/extarct/endswith/startswith/get/isXXX/join/len/match/pad/repeat/slice/split/strip等，使用时查询其API根据参数进行使用

In [67]:
data = {'Dave': 'dave@google.com', 'Steve':'steve@gmail.com','Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

data.str.contains('gmail')

parttern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

data.str.match(pattern, flags=re.IGNORECASE)

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

Dave     [dave@google.com]
Steve    [steve@gmail.com]
Rob        [rob@gmail.com]
Wes                    NaN
dtype: object

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object