In [1]:
import csf

In [2]:
def get_raw_factor(factors, index_code, start_date, end_date, freq='M'):
    """
    :param factors: str or list, 因子代码"M009006"或因子代码列表["M009006", "M009007"]
    :param index_code: str, 指数代码，如"000300"
    :param start_date: str, 开始日期，如"2008-04-30"
    :param end_date: str，结束日期，如"2015-12-31"
    :param freq: str，换仓周期，周"W"、月"M"、季"Q"，每个周期的最后一个交易日
    :param filter: dict, 股票筛选
    :return: pd.DataFrame，因子值
    """
    temp = csf.get_stock_factor(factors=factors, index=index_code,
                                start_date=start_date, end_date=end_date, freq=freq)
    df = pd.pivot_table(temp, values='value', index=['date', 'code'], columns=['cd'])
    return df

In [3]:
def cut_group(data_, num_group, col_name=None, ascending=False):
    """
    Cut the given data to defined groups asc or desc.

    :param data: pd.Series of pd.DataFrame
    :param col_name: str
        the column used to rank
    :param num_group: int
        cut the data to num_group groups
    :param ascending: Bool
        True: ascending, False: Descending

    :return

    """
    data = data_.copy()
#     if isinstance(data, pd.DataFrame):
#         data = data.loc[:, col_name]

    data_len = len(data)
    avg_element = data_len // num_group
    remains = data_len % num_group
    each_group = [avg_element] * num_group
    if remains:
        for idx in range(0, remains):
            each_group[idx] += 1
    each_group = np.array(each_group)
    each_group = each_group.cumsum()
    try:
        idx = data.loc[:, col_name].rank(method='first', na_option='bottom', ascending=ascending)
    except:
        print(idx)
        print(data.index[0])
        print(col_name)
        print('error occurred in cut_group')
    groups = pd.Series(index=idx.index)
    start = 0
    for grp, end in enumerate(each_group):
        mask = (idx > start) & (idx <= end)
        groups[mask] = ''.join(['Q', str(grp + 1)])
        start = end
    groups = groups.tolist()
    data[col_name + 'g'] = groups 
    return data



In [4]:
df = get_raw_factor(['M009006','M009007'], index_code='000300', start_date='2016-01-01', end_date='2016-07-01')


In [5]:
df = df.dropna()

In [6]:
dfg=df.groupby(level=0).apply(lambda frame: cut_group(frame, 5, 'M009006'))

In [7]:
group_mean=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g')['M009006'].mean())

In [8]:
group_mean.ix[:,-1]

date
2007-04-30     7.953750
2007-05-31   -16.032679
2007-06-29   -33.302500
2007-07-31     1.914483
2007-08-31     0.177759
2007-09-28   -30.468214
2007-10-31   -44.386071
2007-11-30   -47.491228
2007-12-28    14.968333
2008-01-31   -51.245965
2008-02-29   -36.566786
2008-03-31   -57.341607
2008-04-30   -28.444561
2008-05-30   -45.761636
2008-06-30   -68.050926
2008-07-31   -15.677544
2008-08-29   -57.935517
2008-09-26   -30.182807
2008-10-31   -68.128103
2008-11-28    -7.530000
2008-12-31   -34.684828
2009-01-23   -10.555172
2009-02-27    -7.859483
2009-03-31     4.840345
2009-04-30   -24.319310
2009-05-27   -18.180862
2009-06-30   -23.802281
2009-07-31    -8.982881
2009-08-31   -68.395690
2009-09-30    -8.805254
2009-10-30    -8.627414
2009-11-30    -7.293793
2009-12-31   -43.243333
2010-01-29   -56.946552
2010-02-26   -30.826667
2010-03-31   -38.058621
2010-04-30   -72.094915
2010-05-31   -50.185517
2010-06-30   -57.791754
2010-07-30    12.437544
2010-08-31   -35.169655
2010-09-30 

In [9]:
cut_group(df.ix['2007-04-30'], 5, 'M009006')

cd,M009006,M009007,M009006g
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000002,21.23,62.9149,Q5
000009,51.82,80.1228,Q2
000012,20.16,64.3646,Q5
000021,47.97,61.5163,Q3
000022,18.21,62.9863,Q5
000024,45.86,62.2100,Q3
000027,31.48,58.2184,Q4
000029,59.48,82.8193,Q2
000031,68.18,85.3876,Q1
000036,47.87,58.9861,Q3


In [10]:
df.head()

Unnamed: 0_level_0,cd,M009006,M009007
date,code,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-04-30,2,21.23,62.9149
2007-04-30,9,51.82,80.1228
2007-04-30,12,20.16,64.3646
2007-04-30,21,47.97,61.5163
2007-04-30,22,18.21,62.9863


In [11]:
len(df.index.get_level_values(1).unique())

593

In [12]:
type(group_mean['Q1'])

pandas.core.series.Series

In [13]:
group_mean=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: df.corr()))

In [14]:
group_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,cd,M009006,M009007
date,M009006g,cd,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-04-30,Q1,M009006,1.000000,-0.141978
2007-04-30,Q1,M009007,-0.141978,1.000000
2007-04-30,Q2,M009006,1.000000,0.050454
2007-04-30,Q2,M009007,0.050454,1.000000
2007-04-30,Q3,M009006,1.000000,0.111943
2007-04-30,Q3,M009007,0.111943,1.000000
2007-04-30,Q4,M009006,1.000000,-0.074850
2007-04-30,Q4,M009007,-0.074850,1.000000
2007-04-30,Q5,M009006,1.000000,0.467204
2007-04-30,Q5,M009007,0.467204,1.000000


In [15]:
def __get_secus_and_caps(df):
    from collections import namedtuple
    SecuCap = namedtuple('SecuCap', ['secu', 'cap'])
    flat_df = df.dropna().reset_index()
    return SecuCap(flat_df.code, flat_df.M009006)

In [38]:
ans=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: __get_secus_and_caps(df)))

In [71]:
type(ans.ix[0,0][1])

pandas.core.series.Series

In [18]:
dfg1=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g')).

In [None]:
dfg.index.g

In [77]:
ans1 = dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: [pd.Series(df.index.get_level_values(1).values)]))

In [80]:
type(ans1.ix[0,0])

pandas.core.series.Series

In [62]:
ans1.applymap(lambda x: x.values() = x.values()/x.sum())

M009006g,Q1,Q2,Q3,Q4,Q5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-04-30,"[000031, 000069, 000089, 000099, 000422, 00052...","[000009, 000029, 000059, 000066, 000401, 00052...","[000021, 000024, 000036, 000068, 000503, 00054...","[000027, 000061, 000400, 000528, 000550, 00072...","[000002, 000012, 000022, 000039, 000063, 00008..."
2007-05-31,"[000001, 000009, 000024, 000031, 000039, 00006...","[000029, 000063, 000066, 000401, 000422, 00052...","[000027, 000503, 000539, 000562, 000601, 00061...","[000002, 000022, 000059, 000088, 000089, 00009...","[000012, 000021, 000036, 000060, 000550, 00058..."
2007-06-29,"[000001, 000027, 000157, 000538, 000550, 00056...","[000002, 000022, 000063, 000410, 000422, 00052...","[000021, 000031, 000039, 000059, 000069, 00008...","[000012, 000024, 000060, 000503, 000559, 00062...","[000066, 000068, 000089, 000100, 000425, 00048..."
2007-07-31,"[000001, 000002, 000031, 000046, 000060, 00006...","[000012, 000024, 000066, 000503, 000559, 00056...","[000009, 000027, 000029, 000402, 000422, 00052...","[000021, 000036, 000059, 000063, 000157, 00040...","[000068, 000088, 000089, 000410, 000538, 00065..."
2007-08-31,"[000059, 000488, 000528, 000559, 000636, 00071...","[000002, 000089, 000401, 000562, 000617, 00062...","[000021, 000066, 000068, 000157, 000400, 00042...","[000009, 000012, 000029, 000031, 000036, 00006...","[000001, 000024, 000027, 000039, 000046, 00040..."
2007-09-28,"[000039, 000046, 000060, 000068, 000401, 00048...","[000001, 000009, 000012, 000422, 000425, 00050...","[000021, 000024, 000027, 000623, 000680, 00071...","[000029, 000059, 000063, 000069, 000089, 00015...","[000002, 000031, 000036, 000066, 000088, 00040..."
2007-10-31,"[000001, 000002, 000024, 000061, 000063, 00008...","[000009, 000021, 000031, 000059, 000060, 00006...","[000036, 000046, 000488, 000617, 000680, 00069...","[000012, 000029, 000401, 000410, 000422, 00053...","[000027, 000039, 000066, 000503, 000539, 00055..."
2007-11-30,"[000036, 000488, 000503, 000581, 000625, 00062...","[000059, 000061, 000063, 000066, 000528, 00053...","[000012, 000029, 000039, 000069, 000089, 00041...","[000157, 000401, 000425, 000527, 000539, 00054...","[000001, 000002, 000009, 000021, 000024, 00003..."
2007-12-28,"[000021, 000027, 000059, 000401, 000422, 00055...","[000039, 000410, 000488, 000541, 000581, 00065...","[000029, 000060, 000066, 000088, 000157, 00050...","[000001, 000031, 000061, 000528, 000539, 00056...","[000002, 000012, 000036, 000069, 000402, 00042..."
2008-01-31,"[000031, 000061, 000063, 000069, 000338, 00040...","[000001, 000024, 000029, 000046, 000059, 00015...","[000002, 000009, 000012, 000088, 000089, 00040...","[000036, 000539, 000543, 000559, 000562, 00062...","[000027, 000039, 000572, 000625, 000758, 00079..."


In [46]:
temp=ans1.ix['2007-04-30',:].groupby('M009006g')

In [57]:
temp.apply(lambda df: df.index.values)

M009006g
Q1    [000031, 000069, 000089, 000099, 000422, 00052...
Q2    [000009, 000029, 000059, 000066, 000401, 00052...
Q3    [000021, 000024, 000036, 000068, 000503, 00054...
Q4    [000027, 000061, 000400, 000528, 000550, 00072...
Q5    [000002, 000012, 000022, 000039, 000063, 00008...
dtype: object

In [34]:
a=ans.ix[0,0]

In [37]:
type(a[0])

pandas.core.series.Series

In [65]:
example = dfg.ix['2008-05-30',:].head(20)

In [66]:
csf.get_stock_factor_list()

Unnamed: 0,code,parent,level,szh
0,M001,,1,估值
1,M002,,1,盈利能力
2,M003,,1,成长能力
3,M004,,1,资本结构
4,M005,,1,运营
5,M006,,1,流动性
6,M008,,1,动量型
7,M007,,1,趋势型
8,M011,,1,波动型
9,M010,,1,成交量型


In [73]:
dict_a = dict(zip(['a','b','c'],[1,2,3]))
dict_b = dict(zip(['d','b','c'],[4,5,6]))

In [75]:
dict_a - dict_b

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [None]:
df = get_raw_factor(['M002006L','M001007'], index_code='000300', start_date='2016-01-01', end_date='2016-07-01')
