In [82]:
import csf

In [83]:
def get_raw_factor(factors, index_code, start_date, end_date, freq='M'):
    """
    :param factors: str or list, 因子代码"M009006"或因子代码列表["M009006", "M009007"]
    :param index_code: str, 指数代码，如"000300"
    :param start_date: str, 开始日期，如"2008-04-30"
    :param end_date: str，结束日期，如"2015-12-31"
    :param freq: str，换仓周期，周"W"、月"M"、季"Q"，每个周期的最后一个交易日
    :param filter: dict, 股票筛选
    :return: pd.DataFrame，因子值
    """
    temp = csf.get_stock_factor(factors=factors, index=index_code,
                                start_date=start_date, end_date=end_date, freq=freq)
    df = pd.pivot_table(temp, values='value', index=['date', 'code'], columns=['cd'])
    return df

In [84]:
def cut_group(data_, num_group, col_name=None, ascending=False):
    """
    Cut the given data to defined groups asc or desc.

    :param data: pd.Series of pd.DataFrame
    :param col_name: str
        the column used to rank
    :param num_group: int
        cut the data to num_group groups
    :param ascending: Bool
        True: ascending, False: Descending

    :return

    """
    data = data_.copy()
#     if isinstance(data, pd.DataFrame):
#         data = data.loc[:, col_name]

    data_len = len(data)
    avg_element = data_len // num_group
    remains = data_len % num_group
    each_group = [avg_element] * num_group
    if remains:
        for idx in range(0, remains):
            each_group[idx] += 1
    each_group = np.array(each_group)
    each_group = each_group.cumsum()
    try:
        idx = data.loc[:, col_name].rank(method='first', na_option='bottom', ascending=ascending)
    except:
        print(idx)
        print(data.index[0])
        print(col_name)
        print('error occurred in cut_group')
    groups = pd.Series(index=idx.index)
    start = 0
    for grp, end in enumerate(each_group):
        mask = (idx > start) & (idx <= end)
        groups[mask] = ''.join(['Q', str(grp + 1)])
        start = end
    groups = groups.tolist()
    data[col_name + 'g'] = groups 
    return data



In [85]:
df = get_raw_factor(['M009006','M009007'], index_code='000300', start_date='2016-01-01', end_date='2016-07-01')


In [86]:
df = df.dropna()

In [87]:
dfg=df.groupby(level=0).apply(lambda frame: cut_group(frame, 5, 'M009006'))

In [88]:
group_mean=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g')['M009006'].mean())

In [89]:
group_mean.ix[:,-1]

date
2007-04-30     7.953750
2007-05-31   -16.032679
2007-06-29   -33.302500
2007-07-31     1.914483
2007-08-31     0.177759
2007-09-28   -30.468214
2007-10-31   -44.386071
2007-11-30   -47.491228
2007-12-28    14.968333
2008-01-31   -51.245965
2008-02-29   -36.566786
2008-03-31   -57.341607
2008-04-30   -28.444561
2008-05-30   -45.761636
2008-06-30   -68.050926
2008-07-31   -15.677544
2008-08-29   -57.935517
2008-09-26   -30.182807
2008-10-31   -68.128103
2008-11-28    -7.530000
2008-12-31   -34.684828
2009-01-23   -10.555172
2009-02-27    -7.859483
2009-03-31     4.840345
2009-04-30   -24.319310
2009-05-27   -18.180862
2009-06-30   -23.802281
2009-07-31    -8.982881
2009-08-31   -68.395690
2009-09-30    -8.805254
2009-10-30    -8.627414
2009-11-30    -7.293793
2009-12-31   -43.243333
2010-01-29   -56.946552
2010-02-26   -30.826667
2010-03-31   -38.058621
2010-04-30   -72.094915
2010-05-31   -50.185517
2010-06-30   -57.791754
2010-07-30    12.437544
2010-08-31   -35.169655
2010-09-30 

In [90]:
cut_group(df.ix['2007-04-30'], 5, 'M009006')

cd,M009006,M009007,M009006g
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000002,21.23,62.9149,Q5
000009,51.82,80.1228,Q2
000012,20.16,64.3646,Q5
000021,47.97,61.5163,Q3
000022,18.21,62.9863,Q5
000024,45.86,62.2100,Q3
000027,31.48,58.2184,Q4
000029,59.48,82.8193,Q2
000031,68.18,85.3876,Q1
000036,47.87,58.9861,Q3


In [91]:
df.head()

Unnamed: 0_level_0,cd,M009006,M009007
date,code,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-04-30,2,21.23,62.9149
2007-04-30,9,51.82,80.1228
2007-04-30,12,20.16,64.3646
2007-04-30,21,47.97,61.5163
2007-04-30,22,18.21,62.9863


In [92]:
len(df.index.get_level_values(1).unique())

593

In [93]:
type(group_mean['Q1'])

pandas.core.series.Series

In [94]:
group_mean=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: df.corr()))

In [95]:
group_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,cd,M009006,M009007
date,M009006g,cd,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-04-30,Q1,M009006,1.000000,-0.141978
2007-04-30,Q1,M009007,-0.141978,1.000000
2007-04-30,Q2,M009006,1.000000,0.050454
2007-04-30,Q2,M009007,0.050454,1.000000
2007-04-30,Q3,M009006,1.000000,0.111943
2007-04-30,Q3,M009007,0.111943,1.000000
2007-04-30,Q4,M009006,1.000000,-0.074850
2007-04-30,Q4,M009007,-0.074850,1.000000
2007-04-30,Q5,M009006,1.000000,0.467204
2007-04-30,Q5,M009007,0.467204,1.000000


In [96]:
def __get_secus_and_caps(df):
    from collections import namedtuple
    SecuCap = namedtuple('SecuCap', ['secu', 'cap'])
    flat_df = df.dropna().reset_index()
    return SecuCap(flat_df.code, flat_df.M009006)

In [97]:
ans=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: __get_secus_and_caps(df)))

In [106]:
dfg.ix['2007-05-31',:] ['M009006'] - dfg.ix['2007-04-30',:] ['M009006']

code
000001       NaN
000002    -17.19
000009     -5.25
000012    -45.36
000021    -60.68
000022    -17.60
000024      5.40
000027    -19.10
000029    -39.69
000031    -37.55
000036    -61.27
000039     25.37
000059    -56.98
000060       NaN
000061       NaN
000063     -4.98
000066    -36.51
000068    -10.29
000069    -23.24
000088     -3.87
000089    -61.26
000099    -73.39
000100       NaN
000157     52.61
000400    -28.50
000401    -24.47
000402     36.69
000410       NaN
000422    -60.43
000425       NaN
000488       NaN
000503    -31.27
000520    -49.37
000527      5.62
000528    -11.89
000538    -22.06
000539    -68.76
000541     -7.02
000550    -44.98
000559    -66.61
000562    -53.29
000581    -83.12
000601       NaN
000617    -51.79
000623    -87.14
000625     -6.38
000629       NaN
000630    -33.52
000636    -45.47
000651    -36.27
000680    -52.89
000682    -51.09
000698    -22.93
000708    -41.60
000709    -38.63
000717    -70.74
000725       NaN
000729    -15.24
000751   

In [98]:
type(ans.ix[0,0][1])

pandas.core.series.Series

In [99]:
dfg1=dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g')).

SyntaxError: invalid syntax (<ipython-input-99-6ab9c17cdcfc>, line 1)

In [None]:
dfg.index.g

In [None]:
ans1 = dfg.groupby(level=0).apply(lambda frame: frame.groupby('M009006g').apply(lambda df: [pd.Series(df.index.get_level_values(1).values)]))

In [None]:
type(ans1.ix[0,0])

In [None]:
ans1.applymap(lambda x: x.values() = x.values()/x.sum())

In [None]:
temp=ans1.ix['2007-04-30',:].groupby('M009006g')

In [None]:
temp.apply(lambda df: df.index.values)

In [None]:
a=ans.ix[0,0]

In [None]:
type(a[0])

In [None]:
example = dfg.ix['2008-05-30',:].head(20)

In [None]:
csf.get_stock_factor_list()

In [None]:
dict_a = dict(zip(['a','b','c'],[1,2,3]))
dict_b = dict(zip(['d','b','c'],[4,5,6]))

In [None]:
dict_a - dict_b

In [None]:
df = get_raw_factor(['M002006L','M001007'], index_code='000300', start_date='2016-01-01', end_date='2016-07-01')


In [107]:
df1=dfg.ix['2007-05-31',:] 
df2 = dfg.ix['2007-04-30',:] 

In [111]:
df1

cd,M009006,M009007,M009006g
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000001,39.50,65.7558,Q1
000002,4.04,72.2222,Q4
000009,46.57,66.7520,Q1
000012,-25.20,48.4722,Q5
000021,-12.71,52.4398,Q5
000022,0.61,49.0316,Q4
000024,51.26,79.4116,Q1
000027,12.38,59.7586,Q3
000029,19.79,72.4244,Q2
000031,30.63,67.7482,Q1


In [114]:
pd.concat??

In [113]:
pd.concat([df1,df2], axis=1, )

cd,M009006,M009007,M009006g,M009006.1,M009007.1,M009006g.1
000001,39.50,65.7558,Q1,,,
000002,4.04,72.2222,Q4,21.23,62.9149,Q5
000009,46.57,66.7520,Q1,51.82,80.1228,Q2
000012,-25.20,48.4722,Q5,20.16,64.3646,Q5
000021,-12.71,52.4398,Q5,47.97,61.5163,Q3
000022,0.61,49.0316,Q4,18.21,62.9863,Q5
000024,51.26,79.4116,Q1,45.86,62.2100,Q3
000027,12.38,59.7586,Q3,31.48,58.2184,Q4
000029,19.79,72.4244,Q2,59.48,82.8193,Q2
000031,30.63,67.7482,Q1,68.18,85.3876,Q1
