In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [16]:
# Example 1: filling missing values, first for a whole Series and then group by group.
print('样例1：分组进行缺省值的填充')
# Six random values; no seed is set, so they differ on every run.
s = Series(np.random.randn(6))
s

样例1：分组进行缺省值的填充


0    0.618274
1    1.043760
2   -0.294704
3    0.138775
4    0.502728
5    0.351959
dtype: float64

In [5]:
s[1:6:2]
# Slices use colon syntax start:end:step; the start is included and the end is excluded

1    0.569780
3    0.058218
5   -1.073652
dtype: float64

In [6]:
s[::2] = np.nan
# From the start to the end, take every second element (positions 0, 2, 4) and set it to NaN
s

0         NaN
1    0.569780
2         NaN
3    0.058218
4         NaN
5   -1.073652
dtype: float64

In [7]:
s.fillna(s.mean())
# fillna is a convenient way to fill missing values; here they are replaced by the mean of the non-missing entries

0   -0.148551
1    0.569780
2   -0.148551
3    0.058218
4   -0.148551
5   -1.073652
dtype: float64

In [8]:
# Eight US states; by construction the first four count as eastern and the last four as western.
state = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
# Region label for each state, aligned with `state` by position.
group_key = [region for region in ('East', 'West') for _ in range(4)]
# One random standard-normal value per state, indexed by state name.
data = Series(np.random.randn(len(state)), index=state)
data

Ohio         -0.533911
New York      0.870873
Vermont      -2.476698
Florida      -0.637095
Oregon       -0.142212
Nevada        0.838121
California   -0.607705
Idaho         0.204598
dtype: float64

In [9]:
# Mark three states as missing so that both regions contain NaN values.
data[['Vermont','Nevada','Idaho']] = np.nan
data

Ohio         -0.533911
New York      0.870873
Vermont            NaN
Florida      -0.637095
Oregon       -0.142212
Nevada             NaN
California   -0.607705
Idaho              NaN
dtype: float64

In [10]:
grouped = data.groupby(group_key)
# As noted in the previous section: unless axis=1 is passed, grouping is done along the index by default
# The labels in group_key are matched to the rows in order, and rows sharing a label form one group
grouped.size()

East    4
West    4
dtype: int64

In [11]:
grouped.mean()
# These group means are computed with the NaN entries excluded

East   -0.100044
West   -0.374959
dtype: float64

In [13]:
# Replace the NaN values inside one group with that group's mean.
fill_mean = lambda g:g.fillna(g.mean())
grouped.apply(fill_mean)
# Each group is filled with its own mean rather than one global mean
# After grouping, calling apply with the fill function is all that is needed

Ohio         -0.533911
New York      0.870873
Vermont      -0.100044
Florida      -0.637095
Oregon       -0.142212
Nevada       -0.374959
California   -0.607705
Idaho        -0.374959
dtype: float64

In [15]:
# Fixed fill value for each group label.
fill_value = {'East':0.5 , 'West': -1}
# g.name is the label of the group being processed; it is used to look up that group's fill value.
fill_func = lambda g:g.fillna(fill_value[g.name])
grouped.apply(fill_func)
# Same split-apply pattern as filling with the group mean, but with a preset value per group

Ohio         -0.533911
New York      0.870873
Vermont       0.500000
Florida      -0.637095
Oregon       -0.142212
Nevada       -1.000000
California   -0.607705
Idaho        -1.000000
dtype: float64

In [17]:
# Separator and title for example 2: random sampling and permutation.
print('###############################################')
print('样例二：随机采样和排列')

###############################################
样例二：随机采样和排列


In [20]:
# Suit codes: H = Hearts, S = Spades, C = Clubs, D = Diamonds
suits = list('HSCD')
# Point value of every card, suit by suit: ranks 1-10 count at face value and
# the three court cards (ranks 11-13) count as 10.
card_value = [min(rank, 10) for _ in range(4) for rank in range(1, 14)]
# Rank labels within one suit, in the same order as the values above.
base_names = ['A', *range(2, 11), 'J', 'K', 'Q']

In [21]:
# Build the card labels suit by suit: rank label followed by the suit code (e.g. 'AH', '10S').
cards = []
for suit in suits:
    cards += [str(rank) + suit for rank in base_names]
# The deck maps each card label to its point value.
deck = Series(data=card_value, index=cards)
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [22]:
# Randomly draw n cards (five by default) from a deck
def draw(deck, n=5, rng=None):
    """Return ``n`` entries of ``deck`` picked at random without replacement.

    Parameters
    ----------
    deck : pandas.Series
        The cards to draw from (any object supporting ``len()`` and
        ``.take()`` with integer positions works).
    n : int, default 5
        Number of cards to draw. If ``n`` is larger than ``len(deck)`` the
        whole deck is returned in random order.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness, for reproducible draws. When omitted, NumPy's
        global random state is used, exactly as before this parameter existed.

    Returns
    -------
    The result of ``deck.take(...)`` on ``n`` distinct random positions.

    Raises
    ------
    ValueError
        If ``n`` is negative. A negative ``n`` used to be interpreted as a
        negative slice bound and silently returned ``len(deck) - |n|`` cards.
    """
    if n < 0:
        raise ValueError('n must be non-negative, got {!r}'.format(n))
    permutation = np.random.permutation if rng is None else rng.permutation
    # Shuffle all positions, then keep the first n of them.
    return deck.take(permutation(len(deck))[:n])
# Deal one random hand from the full deck; n is left at its default of 5.
draw(deck)

2S      2
10H    10
9C      9
8C      8
3S      3
dtype: int64

In [23]:
# One suit label per card, 13 per suit, listed in the same suit order as the deck was built.
groups = ['H']*13+['S']*13+['C']*13+['D']*13
grouped_cards = deck.groupby(groups)

In [24]:
# Number of cards in each suit group.
grouped_cards.size()

C    13
D    13
H    13
S    13
dtype: int64

In [25]:
# Draw two random cards from every suit; the extra argument 2 is forwarded to draw as n.
grouped_cards.apply(draw,2)

C  KC    10
   JC    10
D  2D     2
   JD    10
H  QH    10
   5H     5
S  5S     5
   KS    10
dtype: int64

In [26]:
print('---------------------------')
# The book's approach:
get_suit = lambda card:card[-1]
# Same groupby principle: the key is derived from each index label (its last character, i.e. the suit code),
# which is much neater than building the list of suit labels by hand
deck.groupby(get_suit).apply(draw,2)

---------------------------


C  2C      2
   4C      4
D  7D      7
   2D      2
H  4H      4
   10H    10
S  QS     10
   9S      9
dtype: int64

In [27]:
deck.groupby(get_suit,group_keys=False).apply(draw,2)
# group_keys=False leaves the suit key out of the result index, so the result is indexed
# by card label only instead of by (suit, card) pairs

KC    10
9C     9
6D     6
4D     4
7H     7
8H     8
JS    10
8S     8
dtype: int64

In [28]:
# Separator and title for example 3: group-wise weighted average and correlation.
print('#######################################')
print('样例3：分组加权平均数和相关关系')

#######################################
样例3：分组加权平均数和相关关系


In [29]:
# Toy data set: two categories of four rows each, with random 'data' values and random 'weight' values.
df = DataFrame({'category':['a','a','a','a','b','b','b','b'],
                'data':np.random.randn(8),
                'weight':np.random.rand(8)})
df

Unnamed: 0,category,data,weight
0,a,0.542408,0.803703
1,a,-1.533706,0.860694
2,a,1.696185,0.867137
3,a,-0.496968,0.519866
4,b,-0.07358,0.239036
5,b,-0.138802,0.94205
6,b,0.665936,0.708077
7,b,0.989383,0.000881


In [30]:
grouped1 = df.groupby('category')
# split: group the rows by the 'category' column
grouped1.size()

category
a    4
b    4
dtype: int64

In [31]:
get_wavg = lambda g:np.average(g['data'],weights=g['weight'])
# apply: the function above computes the weighted average of 'data' (weighted by 'weight') for one group
grouped1.apply(get_wavg)
# combine: the per-group results are assembled into a single Series

category
a    0.107607
b    0.171456
dtype: float64

In [32]:
# Load the closing prices; the first CSV column becomes the index and is parsed as dates.
close_px = pd.read_csv('../examples/stock_px.csv',parse_dates=True,index_col=0)
close_px

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.00,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33
...,...,...,...,...,...,...,...,...,...
2011-10-10,10.09,388.81,16.14,186.62,64.43,26.94,61.87,1194.89,76.28
2011-10-11,10.30,400.29,16.14,185.00,63.96,27.00,60.95,1195.54,76.27
2011-10-12,10.05,402.19,16.40,186.12,64.33,26.96,62.70,1207.25,77.16
2011-10-13,10.10,408.43,16.22,186.82,64.23,27.18,62.36,1203.66,76.37


In [33]:
rets = close_px.pct_change().dropna()
# pct_change is the relative change of each element compared with the previous one,
# which in this problem is simply the daily return
rets

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-02,0.012048,0.017812,0.000000,0.005956,0.023419,0.000000,0.008278,0.006478,0.019608
1990-02-05,0.005952,0.022500,0.000000,0.025459,-0.006865,0.000000,-0.006568,0.002810,0.001603
1990-02-06,-0.011834,-0.007335,0.003484,0.013857,-0.004608,0.000000,0.016529,-0.006599,-0.003200
1990-02-07,0.005988,-0.043103,0.010417,0.021071,0.013889,0.000000,0.003252,0.012407,0.016051
1990-02-08,0.000000,-0.007722,0.003436,-0.003904,0.018265,0.000000,0.008104,-0.002367,0.003160
...,...,...,...,...,...,...,...,...,...
2011-10-10,0.039135,0.051406,0.041290,0.023192,0.020592,0.026286,0.013930,0.034125,0.036977
2011-10-11,0.020813,0.029526,0.000000,-0.008681,-0.007295,0.002227,-0.014870,0.000544,-0.000131
2011-10-12,-0.024272,0.004747,0.016109,0.006054,0.005785,-0.001481,0.028712,0.009795,0.011669
2011-10-13,0.004975,0.015515,-0.010976,0.003761,-0.001554,0.008160,-0.005423,-0.002974,-0.010238


In [34]:
# Correlation of every column of a frame with its 'SPX' column.
spx_corr = lambda x:x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x:x.year)

# Worth remembering: groupby accepts a lambda as its key
# Each index value is mapped through the function and rows are grouped by the mapped value -- very handy

by_year.size()

1990    230
1991    253
1992    254
1993    253
1994    252
1995    252
1996    254
1997    253
1998    252
1999    252
2000    252
2001    248
2002    252
2003    252
2004    252
2005    252
2006    251
2007    251
2008    253
2009    252
2010    252
2011    199
dtype: int64

In [35]:
by_year.apply(spx_corr)
# For each year, the correlation of every column with SPX

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078
1995,0.291532,0.161829,0.519126,0.41639,0.315733,0.45366,0.413144,1.0,0.368752
1996,0.292344,0.191482,0.750724,0.388497,0.569232,0.564015,0.421477,1.0,0.538736
1997,0.564427,0.211435,0.827512,0.646823,0.703538,0.606171,0.509344,1.0,0.695653
1998,0.533802,0.379883,0.815243,0.623982,0.591988,0.698773,0.494213,1.0,0.369264
1999,0.099033,0.425584,0.710928,0.486167,0.517061,0.631315,0.336593,1.0,0.315383


In [36]:
# For each year, the correlation between the AAPL and MSFT columns.
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))

1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.571435
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [37]:
# Separator before example 4.
# NOTE(review): the recorded output of this cell also shows a title line
# (group-wise linear regression) that the cell no longer prints, and the cells
# that follow cover pivot tables and cross-tabulation -- confirm the intended
# title and re-run the cell.
print('##########################################')

##########################################
样例4：面向分组的线性回归


In [38]:
# Load the restaurant tipping data set.
tips = pd.read_csv('../examples/tips.csv')
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [39]:
# Tip expressed as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.50,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.139780
4,24.59,3.61,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,No,Sat,Dinner,2,0.098204


In [44]:
# Author's note: pivot tables were an unfamiliar concept at this point.
# A pivot table aggregates the data (by default with the mean) for each
# combination of the grouping keys, here smoker and day.

# The numeric columns are listed explicitly: relying on the default would also
# try to average the non-numeric 'time' column, which pandas >= 2.0 no longer
# drops silently and rejects with a TypeError.
tips.pivot_table(values=['size','tip','tip_pct','total_bill'],index=['smoker','day'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,Fri,2.25,2.8125,0.15165,18.42
No,Sat,2.555556,3.102889,0.158048,19.661778
No,Sun,2.929825,3.167895,0.160113,20.506667
No,Thur,2.488889,2.673778,0.160298,17.113111
Yes,Fri,2.066667,2.714,0.174783,16.813333
Yes,Sat,2.47619,2.875476,0.147906,21.276667
Yes,Sun,2.578947,3.516842,0.18725,24.12
Yes,Thur,2.352941,3.03,0.163863,19.190588


In [45]:
tips.pivot_table(['tip_pct','size'],index=['day'],columns=['smoker'])
# index and columns choose the keys laid out along the rows and the columns; the first argument selects which data columns to aggregate

Unnamed: 0_level_0,size,size,tip_pct,tip_pct
smoker,No,Yes,No,Yes
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Fri,2.25,2.066667,0.15165,0.174783
Sat,2.555556,2.47619,0.158048,0.147906
Sun,2.929825,2.578947,0.160113,0.18725
Thur,2.488889,2.352941,0.160298,0.163863


In [46]:
# margins=True adds an 'All' row and 'All' columns holding the aggregates across the groups
tips.pivot_table(['tip_pct','size'],index=['day'],columns=['smoker'],margins=True)

Unnamed: 0_level_0,size,size,size,tip_pct,tip_pct,tip_pct
smoker,No,Yes,All,No,Yes,All
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,2.25,2.066667,2.105263,0.15165,0.174783,0.169913
Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Thur,2.488889,2.352941,2.451613,0.160298,0.163863,0.161276
All,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [47]:
# To aggregate with a different function, pass it via aggfunc (here len, which counts the rows in each cell)
tips.pivot_table(['tip_pct','size'],index=['day'],columns=['smoker'],aggfunc=len,margins=True)

Unnamed: 0_level_0,size,size,size,tip_pct,tip_pct,tip_pct
smoker,No,Yes,All,No,Yes,All
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,4,15,19,4.0,15.0,19.0
Sat,45,42,87,45.0,42.0,87.0
Sun,57,19,76,57.0,19.0,76.0
Thur,45,17,62,45.0,17.0,62.0
All,151,93,244,151.0,93.0,244.0


In [51]:
# Ten samples: the first six are female and the last four male; the first three
# are right-handed and the remaining seven left-handed.
data1 = DataFrame({
    'Sample': range(10),
    'Gender': ['Female' if i < 6 else 'Male' for i in range(10)],
    'Handedness': ['Right_handed' if i < 3 else 'Left_handed' for i in range(10)],
})
data1

Unnamed: 0,Sample,Gender,Handedness
0,0,Female,Right_handed
1,1,Female,Right_handed
2,2,Female,Right_handed
3,3,Female,Left_handed
4,4,Female,Left_handed
5,5,Female,Left_handed
6,6,Male,Left_handed
7,7,Male,Left_handed
8,8,Male,Left_handed
9,9,Male,Left_handed


In [52]:
pd.crosstab(data1['Gender'],data1['Handedness'],margins=True)
# The cross-tabulation is fairly easy to understand: it counts the rows for each Gender/Handedness combination

Handedness,Left_handed,Right_handed,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,3,3,6
Male,4,0,4
All,7,3,10


In [54]:
pd.crosstab([tips['time'],tips['day']],tips['smoker'],margins=True)
# The first argument supplies the row (index) keys, the second the column keys

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
