# 分组变换和分析

In [1]:
import random 
random.seed(0)
import string
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
N = 1000
def rands(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices) for _ in range(n)])
tickers = np.array([rands(5) for _ in range(N)])

In [3]:
tickers[:5]

array(['MYNBI', 'QPMZJ', 'PLSGQ', 'EJEYD', 'TZIRW'], 
      dtype='<U5')

In [4]:
M = 500
df = DataFrame({'Momentum': np.random.randn(M) / 200 + 0.03,
                'Value': np.random.randn(M) / 200 + 0.08,
                'ShortInterest': np.random.randn(M) / 200 - 0.02},
                index=tickers[:M])

In [5]:
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers,
                    name='industry')

In [6]:
by_industry = df.groupby(industries)

In [7]:
by_industry.mean()

Unnamed: 0_level_0,Momentum,ShortInterest,Value
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FINANCIAL,0.029694,-0.020027,0.079644
TECH,0.029799,-0.020607,0.080358


In [8]:
by_industry.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Momentum,ShortInterest,Value
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FINANCIAL,count,234.0,234.0,234.0
FINANCIAL,mean,0.029694,-0.020027,0.079644
FINANCIAL,std,0.004941,0.004896,0.005232
FINANCIAL,min,0.017471,-0.032172,0.063125
FINANCIAL,25%,0.02607,-0.023297,0.07604
FINANCIAL,50%,0.029737,-0.020471,0.079791
FINANCIAL,75%,0.032699,-0.017053,0.082714
FINANCIAL,max,0.043193,-0.002768,0.096528
TECH,count,266.0,266.0,266.0
TECH,mean,0.029799,-0.020607,0.080358


In [9]:
np.random.randint(0, 10, 10)

array([9, 6, 8, 2, 3, 1, 3, 9, 5, 4])

In [10]:
industries

MYNBI    FINANCIAL
QPMZJ    FINANCIAL
PLSGQ         TECH
EJEYD    FINANCIAL
TZIRW    FINANCIAL
ZTEJD         TECH
XCVKP         TECH
RDLNK         TECH
TUGRP    FINANCIAL
OQIBZ    FINANCIAL
RACXM    FINANCIAL
WZVUA         TECH
TPKHX    FINANCIAL
KWCGS         TECH
HHZEZ         TECH
ROCCK    FINANCIAL
QPDJR    FINANCIAL
JWDRK    FINANCIAL
RGZTR         TECH
SJOCT    FINANCIAL
ZMKSH    FINANCIAL
JFGFB    FINANCIAL
TVIPC    FINANCIAL
CVYEE         TECH
BCWRV         TECH
MWQIQ    FINANCIAL
ZHGVS         TECH
NSIOP         TECH
VUWZL    FINANCIAL
CKTDP    FINANCIAL
           ...    
XWPSF         TECH
ZSYAT         TECH
LGJNC         TECH
HDFZT    FINANCIAL
OUEPK    FINANCIAL
OWEPZ         TECH
IPKWQ    FINANCIAL
XZHYB    FINANCIAL
LQIMR    FINANCIAL
UTDJE    FINANCIAL
QEECK    FINANCIAL
GBAPP         TECH
CSQBE    FINANCIAL
UOSES         TECH
ICJLE         TECH
OLBKD    FINANCIAL
DTVDJ         TECH
VJSLF         TECH
YAVYP         TECH
MMUGC    FINANCIAL
NIPNV    FINANCIAL
PMENZ       

In [11]:
def zscore(group):
    return (group - group.mean()) / group.std()

In [12]:
df_stand = by_industry.apply(zscore)

In [13]:
df_stand

Unnamed: 0,Momentum,ShortInterest,Value
MYNBI,0.515442,-0.029325,-0.994165
QPMZJ,-0.513274,-0.738045,-0.796180
PLSGQ,-0.737467,1.101616,0.188692
EJEYD,2.180658,0.019972,1.357586
TZIRW,-1.603186,-1.024600,0.451378
ZTEJD,0.108872,0.027014,0.267969
XCVKP,1.500445,0.140730,-0.772672
RDLNK,2.018450,-0.053477,-1.228884
TUGRP,-1.004737,-1.114027,0.139618
OQIBZ,-1.893138,0.186817,-1.281643


In [14]:
df_stand.groupby(industries).agg(['mean', 'std'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,mean,std,mean,std,mean,std
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,-2.432053e-15,1.0,-1.272131e-15,1.0,-5.023522e-15,1.0
TECH,-1.16096e-15,1.0,-1.997149e-15,1.0,1.549304e-15,1.0


In [15]:
ind_rank = by_industry.rank(ascending=False)

In [16]:
by_industry

<pandas.core.groupby.DataFrameGroupBy object at 0x10dfbfda0>

In [17]:
ind_rank

Unnamed: 0,Momentum,ShortInterest,Value
MYNBI,69.0,114.0,197.0
QPMZJ,156.0,180.0,187.0
PLSGQ,205.0,32.0,113.0
EJEYD,4.0,110.0,19.0
TZIRW,226.0,200.0,72.0
ZTEJD,127.0,134.0,106.0
XCVKP,13.0,120.0,214.0
RDLNK,5.0,140.0,240.0
TUGRP,193.0,209.0,103.0
OQIBZ,231.0,96.0,214.0


In [18]:
ind_rank.groupby(industries).agg(['min', 'max'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,min,max,min,max,min,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,1.0,234.0,1.0,234.0,1.0,234.0
TECH,1.0,266.0,1.0,266.0,1.0,266.0


In [19]:
df_stand

Unnamed: 0,Momentum,ShortInterest,Value
MYNBI,0.515442,-0.029325,-0.994165
QPMZJ,-0.513274,-0.738045,-0.796180
PLSGQ,-0.737467,1.101616,0.188692
EJEYD,2.180658,0.019972,1.357586
TZIRW,-1.603186,-1.024600,0.451378
ZTEJD,0.108872,0.027014,0.267969
XCVKP,1.500445,0.140730,-0.772672
RDLNK,2.018450,-0.053477,-1.228884
TUGRP,-1.004737,-1.114027,0.139618
OQIBZ,-1.893138,0.186817,-1.281643


In [20]:
df_stand.rank(ascending=False)

Unnamed: 0,Momentum,ShortInterest,Value
MYNBI,156.0,252.0,428.0
QPMZJ,342.0,380.0,402.0
PLSGQ,380.0,60.0,208.0
EJEYD,8.0,245.0,38.0
TZIRW,479.0,427.0,161.0
ZTEJD,229.0,243.0,197.0
XCVKP,34.0,218.0,397.0
RDLNK,10.0,256.0,449.0
TUGRP,414.0,438.0,223.0
OQIBZ,489.0,211.0,455.0


In [21]:
help(df_stand.rank)

Help on method rank in module pandas.core.generic:

rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) method of pandas.core.frame.DataFrame instance
    Compute numerical data ranks (1 through n) along axis. Equal values are
    assigned a rank that is the average of the ranks of those values
    
    Parameters
    ----------
    axis: {0 or 'index', 1 or 'columns'}, default 0
        index to direct ranking
    method : {'average', 'min', 'max', 'first', 'dense'}
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    numeric_only : boolean, default None
        Include only float, int, boolean data. Valid only for DataFrame or
        Panel objects
    na_option : {'keep', 'top', 'bottom'}
        * keep: leave NA values where they are
   

In [22]:
by_industry.apply(lambda x: zscore(x.rank()))

Unnamed: 0,Momentum,ShortInterest,Value
MYNBI,0.716458,0.051703,-1.174400
QPMZJ,-0.568734,-0.923270,-1.026676
PLSGQ,-0.929395,1.319351,0.266470
EJEYD,1.676659,0.110792,1.455074
TZIRW,-1.602797,-1.218717,0.672141
ZTEJD,0.084490,-0.006499,0.357460
XCVKP,1.566323,0.175480,-1.046382
RDLNK,1.670311,-0.084490,-1.384343
TUGRP,-1.115310,-1.351667,0.214199
OQIBZ,-1.676659,0.317605,-1.425529


## 分组因子暴露

In [23]:
from numpy.random import randn
fac1, fac2, fac3 = np.random.randn(3, 1000)

ticker_subset = tickers.take(np.random.permutation(N)[:1000])

port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + randn(1000),
              index=ticker_subset)

factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

In [24]:
factors.corrwith(port)

f1    0.391251
f2   -0.683493
f3    0.151038
dtype: float64

In [25]:
help(factors.corrwith)

Help on method corrwith in module pandas.core.frame:

corrwith(other, axis=0, drop=False) method of pandas.core.frame.DataFrame instance
    Compute pairwise correlation between rows or columns of two DataFrame
    objects.
    
    Parameters
    ----------
    other : DataFrame
    axis : {0 or 'index', 1 or 'columns'}, default 0
        0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
    drop : boolean, default False
        Drop missing indices from result, default returns union of all
    
    Returns
    -------
    correls : Series



In [26]:
pd.ols(y=port, x=factors)

  exec(code_obj, self.user_global_ns, self.user_ns)



-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <f1> + <f2> + <f3> + <intercept>

Number of Observations:         1000
Number of Degrees of Freedom:   4

R-squared:         0.6503
Adj R-squared:     0.6492

Rmse:              1.0141

F-stat (3, 996):   617.3491, p-value:     0.0000

Degrees of Freedom: model 3, resid 996

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
            f1     0.6937     0.0328      21.15     0.0000     0.6294     0.7580
            f2    -1.1881     0.0324     -36.65     0.0000    -1.2516    -1.1245
            f3     0.2979     0.0328       9.09     0.0000     0.2337     0.3622
     intercept     0.0367     0.0321       1.14     0.2525    -0.0261     0.0996
---------------------------------End of Summary--------------

In [27]:
pd.ols(y=port, x=factors).beta

  exec(code_obj, self.user_global_ns, self.user_ns)


f1           0.693699
f2          -1.188084
f3           0.297913
intercept    0.036726
dtype: float64

In [28]:
def beta_exposure(chunk, factors=None):
    return pd.ols(y=chunk, x=factors).beta

In [29]:
by_ind = port.groupby(industries)

In [30]:
exposure = by_ind.apply(beta_exposure, factors=factors)

  return func(g, *args, **kwargs)


In [31]:
exposure

industry            
FINANCIAL  f1           0.641070
           f2          -1.180111
           f3           0.305814
           intercept    0.017842
TECH       f1           0.744861
           f2          -1.197747
           f3           0.288145
           intercept    0.050241
dtype: float64

In [32]:
exposure.unstack()

Unnamed: 0_level_0,f1,f2,f3,intercept
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FINANCIAL,0.64107,-1.180111,0.305814,0.017842
TECH,0.744861,-1.197747,0.288145,0.050241


## 十分位和四分位分析

In [44]:
import tushare as ts
data = ts.get_k_data('399300', index=True, start='2006-01-03', end='2012-07-27')

In [45]:
data.index = pd.DatetimeIndex(data['date'])
del data['date']
data

Unnamed: 0_level_0,open,close,high,low,volume,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-04,926.56,941.43,941.43,926.41,21126900.0,sz399300
2006-01-05,943.85,959.13,959.62,942.31,27311700.0,sz399300
2006-01-06,961.91,970.03,975.63,956.04,30286400.0,sz399300
2006-01-09,971.09,975.25,976.27,966.05,24329600.0,sz399300
2006-01-10,975.28,978.15,978.28,964.43,22735300.0,sz399300
2006-01-11,977.64,973.48,983.39,963.39,26524100.0,sz399300
2006-01-12,972.39,983.72,984.08,968.54,21896700.0,sz399300
2006-01-13,985.04,978.81,988.11,973.68,22281900.0,sz399300
2006-01-16,977.89,961.44,977.89,960.88,20305200.0,sz399300
2006-01-17,959.40,964.29,965.98,952.23,16037300.0,sz399300


In [70]:
px = data['close']
returns = px.pct_change()
def to_index(rets):
    index = (1 + rets).cumprod()
    first_loc = max(index.notnull().argmax() - 1, 0)
    index.values[first_loc] = 1
    return index

In [71]:
def trend_signal(rets, lookback, lag):
    signal = rets.rolling(lookback, min_periods=lookback - 5).sum()
    return signal.shift(lag)

In [72]:
signal = trend_signal(returns, 100, 3)
trade_friday = signal.resample('W-FRI').resample('B').mean().ffill()

.resample() is now a deferred operation
You called resample(...) on this deferred object which materialized it into a series
by implicitly taking the mean.  Use .resample(...).mean() instead
  from ipykernel import kernelapp as app


In [73]:
trade_rets = trade_friday.shift(1) * returns

In [74]:
to_index(trade_rets).plot()

ValueError: Cannot add integral value to Timestamp without freq.

此处报错原因未知