Two-regime (bull and bear market) portfolio data processing

original data: regime 1: bull market, regime 2: bear market

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
# Load the raw regime series and attach a readable label for each regime.
# Raw coding in the file: regime 1 = bull market, regime 2 = bear market.
df0 = pd.read_csv('BB_state.csv').assign(
    state_name=lambda raw: raw['state'].map({1: 'bull', 2: 'bear'})
)
df0

Unnamed: 0,Date,state,state_name
0,19630701,2,bear
1,19630801,1,bull
2,19630901,2,bear
3,19631001,1,bull
4,19631101,2,bear
...,...,...,...
665,20181201,2,bear
666,20190101,1,bull
667,20190201,1,bull
668,20190301,1,bull


In [3]:
# Re-index the regimes as 0/1 so that the exponential rate \lambda is increasing
# in the state index (1/20 for state 0, 1 for state 1) and the coding matches
# the original synthetic data: regime 0 = bull, regime 1 = bear.
#
# The codes are derived from `state_name` instead of remapping `state` onto
# itself: `replace({1: 0, 2: 1})` is not idempotent, so running the cell a
# second time would turn every remaining 1 (bear) into 0 (bull).
df0['state'] = df0['state_name'].map({'bull': 0, 'bear': 1})

# Any row whose raw state was neither 1 nor 2 has no `state_name` and would end
# up as NaN here; fail loudly rather than carry it forward.
assert df0['state'].isin([0, 1]).all(), "unexpected regime code in BB_state.csv"
df0

Unnamed: 0,Date,state,state_name
0,19630701,1,bear
1,19630801,0,bull
2,19630901,1,bear
3,19631001,0,bull
4,19631101,1,bear
...,...,...,...
665,20181201,1,bear
666,20190101,0,bull
667,20190201,0,bull
668,20190301,0,bull


In [4]:
# Study window: given months 2004-01 .. 2007-12 followed by upcoming months
# 2008-01 .. 2009-12 (48 + 24).
# `Date` is compared as a YYYYMMDD integer: lower bound inclusive, upper bound exclusive.
in_window = df0['Date'].ge(20040101) & df0['Date'].lt(20100101)
df_used = df0.loc[in_window].reset_index(drop=True)
df_used

Unnamed: 0,Date,state,state_name
0,20040101,0,bull
1,20040201,0,bull
2,20040301,1,bear
3,20040401,1,bear
4,20040501,0,bull
...,...,...,...
67,20090801,0,bull
68,20090901,0,bull
69,20091001,1,bear
70,20091101,0,bull


In [5]:
# Pull the 0/1 regime codes out of the frame as a plain NumPy vector.
regime_column = df_used['state']
states = regime_column.to_numpy()
states

array([0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0])

In [6]:
# Sanity check on the sample length (48 given + 24 upcoming months).
states.shape

(72,)

In [7]:
# Fix the random seed so the sampled values are reproducible.
rng = np.random.default_rng(seed=41)

# Exponential scale (= 1 / rate) for every observation, chosen by regime so that
# the rate \lambda increases with the state index:
#   regime 0 -> rate 1/20 (scale 20), any other regime -> rate 1 (scale 1).
scales = np.where(states == 0, 20.0, 1.0)

# One exponential draw per observation, generated in a single vectorised call
# instead of building a frozen scipy distribution for each element in a loop.
# NOTE(review): scipy's `expon.rvs` likewise draws a standard exponential from
# `rng` and multiplies it by `scale`, so this is expected to reproduce the
# per-element loop's values for the same seed -- TODO confirm against a saved run.
xi = rng.standard_exponential(size=len(states)) * scales

In [8]:
# Inspect the sampled values.
# NOTE(review): the saved output of this cell does not agree with the `xi`
# column displayed once it is attached to `df_used` (the leading entries
# differ), so the stored outputs look stale -- re-run the notebook top to
# bottom to confirm.
xi

array([9.64510987e+01, 7.01716134e+00, 4.76077589e-01, 6.43226840e-01,
       2.17524349e+00, 5.22300384e+01, 8.42623709e-01, 9.11521764e+00,
       5.53651616e+01, 3.14385592e+01, 2.07986962e+00, 6.10115362e+00,
       1.27325721e+00, 6.74453533e+01, 1.86217939e+00, 4.73683334e-02,
       1.08726592e+01, 2.36426701e-01, 5.39145957e+00, 6.23013410e-01,
       6.27914644e+01, 1.73982719e-01, 6.31436751e+00, 7.40979617e-02,
       6.40749793e+01, 1.27485336e+01, 5.70419123e+00, 6.96355492e+00,
       2.52204420e+00, 1.63766098e+00, 3.89728043e+01, 2.57632985e+01,
       1.45568033e+01, 7.96667039e+00, 6.46730306e+01, 7.31228342e+00,
       1.02837190e+01, 2.97399140e-01, 1.36412321e+00, 5.54025877e+01,
       2.01588568e+01, 1.64347422e+00, 1.44014483e+00, 5.23381973e+01,
       5.17729851e+00, 5.33620953e+01, 3.76083545e-01, 1.33582302e+00,
       1.11531686e+00, 5.81076715e-01, 7.47891052e-02, 1.10714763e+01,
       7.99362634e+01, 5.07701627e-01, 1.85315692e+00, 7.32406288e+00,
      

In [9]:
# xi was allocated with one slot per entry of `states`, so the shapes should match.
xi.shape

(72,)

In [10]:
# Attach the sampled values to the window frame, aligned row by row with `state`.
df_used = df_used.assign(xi=xi)
df_used

Unnamed: 0,Date,state,state_name,xi
0,20040101,0,bull,20.888867
1,20040201,0,bull,21.635663
2,20040301,1,bear,1.358685
3,20040401,1,bear,0.112948
4,20040501,0,bull,71.558513
...,...,...,...,...
67,20090801,0,bull,4.271365
68,20090901,0,bull,12.735650
69,20091001,1,bear,0.180205
70,20091101,0,bull,15.635695


In [11]:
# `states` (0/1 regime codes) and `xi` (sampled values) are the two arrays needed from this notebook.

In [None]:
# Sanity check on the simulated demand: state 0 is bull and state 1 is bear,
# and the mean demand in state 0 should be clearly larger than in state 1.
df_used['xi'].groupby(df_used['state']).mean()

state
0    19.032489
1     0.823471
Name: xi, dtype: float64

In [9]:
# Training split: every observation dated on or before 2007-12-31 (the "given" months).
is_train = df_used['Date'].le(20071231)
train_df = df_used.loc[is_train].reset_index(drop=True)
train_df

Unnamed: 0,Date,state,state_name,xi
0,20040101,0,bull,20.888867
1,20040201,0,bull,21.635663
2,20040301,1,bear,1.358685
3,20040401,1,bear,0.112948
4,20040501,0,bull,71.558513
5,20040601,0,bull,128.047844
6,20040701,1,bear,1.735861
7,20040801,0,bull,8.099634
8,20040901,0,bull,8.282643
9,20041001,0,bull,31.774256


In [10]:
# Test split: every observation dated after 2007-12-31 (the "upcoming" months).
is_test = df_used['Date'].gt(20071231)
test_df = df_used.loc[is_test].reset_index(drop=True)
test_df

Unnamed: 0,Date,state,state_name,xi
0,20080101,1,bear,0.354211
1,20080201,1,bear,0.174267
2,20080301,1,bear,0.654483
3,20080401,0,bull,7.386915
4,20080501,0,bull,2.762548
5,20080601,1,bear,0.131824
6,20080701,1,bear,3.164098
7,20080801,0,bull,12.444615
8,20080901,1,bear,1.728779
9,20081001,1,bear,0.138131
