In [1]:
import numpy as np
import pandas as pd
import os 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import HistGradientBoostingClassifier
import matplotlib.pyplot as plt

In [2]:
class trasaction_model : 
    def __init__(self, df, model, research_set, start_year, end_year, lag_day, stocks_in_profolio):
        self.df = df
        self.model = model
        self.research_set = research_set
        self.start_year = start_year
        self.end_year = end_year
        self.lag_day = lag_day
        self.stocks_in_profolio = stocks_in_profolio
    @property
    def predicted_data(self ) :
        
        trained_data = pd.DataFrame()
        for y in range( self.start_year, self.end_year-2) :
            train_start = pd.to_datetime(f'{y}-01-01')
            train_end = pd.to_datetime(f'{y+2}-12-31')
            trasaction_start = pd.to_datetime(f'{y+3}-01-01')
            trasaction_end = pd.to_datetime(f'{y+3}-12-31')
            x_train = self.df[(self.df['date']>train_start ) & (self.df['date'] < train_end )].iloc[:,5:37]
            x_transaction = self.df[(self.df['date']>trasaction_start) & (self.df['date'] < trasaction_end )].iloc[:,5:37]
            y_train = self.df[(self.df['date']>train_start ) & (self.df['date'] < train_end )].iloc[:,-1]
            
            model = self.model #n_estimators:樹的數量；max_depth:樹的最大深度
            model.fit(x_train, y_train) # 訓練模型
            y_pred = model.predict_proba(x_transaction) # 預測
            win_prob = pd.DataFrame(y_pred[:,1], columns=['win_prob'])
            research_data = self.df[(self.df['date']>trasaction_start) & (self.df['date'] < trasaction_end )].reset_index()
            combined = pd.concat([research_data, win_prob], axis=1, ignore_index=True)
            combined.columns = ['time'] + self.df.columns.to_list() + ['win_prob']
            trained_data = pd.concat([combined, trained_data], axis = 0)
        return trained_data
    
    @property
    def average_return(self) :


        self.predicted_data.reset_index(drop = True, inplace=True)

        self.predicted_data.sort_values(by = ['time','TICKER'])
        #trained_data.set_index('time', inplace = True)

        self.predicted_data[f't-{self.lag_day}_position'] = 0
        long = self.predicted_data.groupby('date')['win_prob'].apply(lambda x: x.nlargest(self.stocks_in_profolio).index)
        self.predicted_data.loc[long.explode(), f't-{self.lag_day}_position'] = 1
        short = self.predicted_data.groupby('date')['win_prob'].apply(lambda x: x.nsmallest(self.stocks_in_profolio).index)
        self.predicted_data.loc[short.explode(), f't-{self.lag_day}_position'] = -1

        self.predicted_data['t_position'] = self.predicted_data.groupby('TICKER')[f't-{self.lag_day}_position'].transform(lambda x : x.shift(-self.lag_day))
        self.predicted_data['individual_ret'] = self.predicted_data['ret1']*self.predicted_data['t_position']/10
        profolio_ret = self.predicted_data.groupby('date')['individual_ret'].sum()
        total_ret = profolio_ret.sum()
        average_ret = total_ret/len(self.stocks_in_profolio)
        return average_ret
        

In [2]:
def ret_calculate(df, period_list, price_col, tickers_col) :
    df = df.copy().sort_index()
    for period in  period_list : 
        df[f'ret{period}'] = df.groupby(tickers_col)[price_col].transform(lambda x : (x/x.shift(period))-1).round(6)
    return df

In [3]:
path = r"C:\Users\USER\Desktop\TMBA\data"
data_path = os.path.join(path, r'S&P 500 Historical Components & Changes(08-17-2024).csv')
crsp_path = os.path.join(path, r'hxdyufxq2lneg0ly.csv')
crsp = pd.read_csv(crsp_path)
crsp['time'] = pd.to_datetime(crsp['date'])
crsp = crsp.set_index('time').sort_index()
df = pd.read_csv(data_path)
df['time'] = pd.to_datetime(df['date'])
df.set_index('time', inplace= True)
df['tickers'] = df['tickers'].str.split(',')
df

  crsp = pd.read_csv(crsp_path)


Unnamed: 0_level_0,date,tickers
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1996-01-02,1996-01-02,"[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1996-01-03,1996-01-03,"[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1996-01-04,1996-01-04,"[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1996-01-10,1996-01-10,"[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
1996-01-11,1996-01-11,"[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
...,...,...
2024-03-25,2024-03-25,"[A, AAL, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADB..."
2024-04-03,2024-04-03,"[A, AAL, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADB..."
2024-05-08,2024-05-08,"[A, AAL, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADB..."
2024-06-24,2024-06-24,"[A, AAL, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADB..."


In [4]:
crsp['true_PERMNO'] = crsp.groupby('TICKER')['PERMNO'].transform('last')#同個TICKER可能會有不同permno，以最近的為準
crsp = crsp[(crsp['true_PERMNO'] == crsp['PERMNO'])]#找出存活最久的同TICKER的PERMNO
crsp['TICKER'] = crsp.groupby('PERMNO')['TICKER'].transform('last')#同個PERMNO統一成一個TICKER，以最近的為準

columns = ['date', 'TICKER','PRC', 'VOL','RET']
period_list = [i for i in range(1,21)] + [i for i in range(40,260,20)]
crsp_df = crsp[columns]
crsp_df.drop_duplicates(['date','TICKER'],inplace=True)
crsp_df = crsp_df[crsp_df['RET'] != 'C']
crsp_df['time'] = pd.to_datetime(crsp_df['date'])
crsp_df = crsp_df[crsp_df['VOL'] > 0]
crsp_df = crsp_df.set_index('time').sort_index()
crsp_df = ret_calculate(crsp_df, period_list, ['PRC'], ['TICKER'])
#crsp_df = ret_calculate(crsp_df, [1], 'PRC', 'TICKER')
crsp_df['RET'] = crsp_df['RET'].astype('float64')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crsp_df.drop_duplicates(['date','TICKER'],inplace=True)


In [5]:

df1 = df.explode('tickers')
df1['start'] = df1.groupby('tickers')['date'].transform('first')
df1['end'] = df1.groupby('tickers')['date'].transform('last')
df1

Unnamed: 0_level_0,date,tickers,start,end
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-01-02,1996-01-02,AAL,1996-01-02,2024-07-08
1996-01-02,1996-01-02,AAMRQ,1996-01-02,2003-03-10
1996-01-02,1996-01-02,AAPL,1996-01-02,2024-07-08
1996-01-02,1996-01-02,ABI,1996-01-02,2008-11-20
1996-01-02,1996-01-02,ABS,1996-01-02,2006-05-31
...,...,...,...,...
2024-07-08,2024-07-08,XYL,2011-11-01,2024-07-08
2024-07-08,2024-07-08,YUM,1997-10-07,2024-07-08
2024-07-08,2024-07-08,ZBH,2001-08-07,2024-07-08
2024-07-08,2024-07-08,ZBRA,2019-12-23,2024-07-08


In [6]:
crsp['DLSTCD'].info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 5434665 entries, 1996-01-02 to 2023-12-29
Series name: DLSTCD
Non-Null Count  Dtype  
--------------  -----  
1192 non-null   float64
dtypes: float64(1)
memory usage: 82.9 MB


In [7]:
d = crsp[crsp['TICKER'] =='AIG']
np.shape(d)

(7048, 64)

In [8]:
a = df1['tickers'].unique()
np.shape(a)

(1168,)

In [6]:
tickers = df1.drop_duplicates(['tickers', 'start', 'end'])
tickers

Unnamed: 0_level_0,date,tickers,start,end
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-01-02,1996-01-02,AAL,1996-01-02,2024-07-08
1996-01-02,1996-01-02,AAMRQ,1996-01-02,2003-03-10
1996-01-02,1996-01-02,AAPL,1996-01-02,2024-07-08
1996-01-02,1996-01-02,ABI,1996-01-02,2008-11-20
1996-01-02,1996-01-02,ABS,1996-01-02,2006-05-31
...,...,...,...,...
2024-05-08,2024-05-08,VST,2024-05-08,2024-07-08
2024-06-24,2024-06-24,CRWD,2024-06-24,2024-07-08
2024-06-24,2024-06-24,GDDY,2024-06-24,2024-07-08
2024-06-24,2024-06-24,KKR,2024-06-24,2024-07-08


crspa = crsp.copy()
crspa['true_PERMNO'] = crspa.groupby('TICKER')['PERMNO'].transform('last')#同個TICKER可能會有不同permno，以最近的為準
crspa = crspa[(crspa['true_PERMNO'] == crspa['PERMNO'])]#找出存活最久的同TICKER的PERMNO
crspa['TICKER'] = crspa.groupby('PERMNO')['TICKER'].transform('last')#同個PERMNO統一成一個TICKER，以最近的為準
#crspa = crspa[(crspa['true_PERMNO'] == crspa['PERMNO'])]#找出存活最久的同TICKER的PERMNO

n = crspa[crspa['TICKER']=='PHA']
c= crspa[crspa['COMNAM']=='MONSANTO CO']

In [7]:
#輸出S&P500過去的成分股列表
my_list = tickers['tickers'].values.tolist()
with open("output.txt", "w") as file:
    # 逐行寫入
    for item in my_list:
        file.write(item + "\n")

In [8]:
ticker_list = tickers.values.tolist()
sp500 = []
for stock in ticker_list :
    a = crsp_df[crsp_df['TICKER'] == stock[1]].sort_index().loc[stock[2] : stock[3]].values.tolist()
    sp500 += a
sp_500 = pd.DataFrame(sp500, columns=crsp_df.columns)
sp_500

Unnamed: 0,date,TICKER,PRC,VOL,RET,ret1,ret2,ret3,ret4,ret5,...,ret60,ret80,ret100,ret120,ret140,ret160,ret180,ret200,ret220,ret240
0,2013-12-10,AAL,24.88,18390299.0,0.011382,,,,,,...,,,,,,,,,,
1,2013-12-11,AAL,25.99,38395689.0,0.044614,0.044614,,,,,...,,,,,,,,,,
2,2013-12-12,AAL,25.45,19632931.0,-0.020777,-0.020777,0.022910,,,,...,,,,,,,,,,
3,2013-12-13,AAL,26.23,12305108.0,0.030648,0.030648,0.009234,0.054260,,,...,,,,,,,,,,
4,2013-12-16,AAL,26.61,13068161.0,0.014487,0.014487,0.045580,0.023855,0.069534,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030140,2023-12-22,UBER,61.71,14715609.0,0.004068,0.004068,0.016974,-0.006600,-0.000324,-0.002425,...,0.337451,0.326811,0.314097,0.413422,0.526719,0.634702,0.979153,0.909344,0.675536,1.200785
3030141,2023-12-26,UBER,61.98,16077192.0,0.004375,0.004375,0.008461,0.021424,-0.002254,0.004050,...,0.347684,0.312302,0.350033,0.471859,0.539876,0.596189,0.952741,0.992285,0.726943,1.186243
3030142,2023-12-27,UBER,63.28,20805951.0,0.020974,0.020975,0.025442,0.029613,0.042848,0.018674,...,0.385289,0.345238,0.400000,0.474715,0.622980,0.656978,1.033419,1.053212,0.844898,1.179814
3030143,2023-12-28,UBER,63.14,13795595.0,-0.002212,-0.002212,0.018716,0.023173,0.027335,0.040541,...,0.418558,0.356391,0.404672,0.475923,0.568306,0.627739,1.064073,0.951174,0.888158,1.144701


In [9]:
#period_list = [i for i in range(1,21)] + [i for i in range(40,260,20)]
sp_500 = sp_500.reset_index(drop = True)
sp_500['time'] = pd.to_datetime(sp_500['date'])
sp_500['date'] = pd.to_datetime(sp_500['date'])
sp_500.set_index('time', inplace= True)
pre_data = sp_500.copy()
pre_data

Unnamed: 0_level_0,date,TICKER,PRC,VOL,RET,ret1,ret2,ret3,ret4,ret5,...,ret60,ret80,ret100,ret120,ret140,ret160,ret180,ret200,ret220,ret240
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-10,2013-12-10,AAL,24.88,18390299.0,0.011382,,,,,,...,,,,,,,,,,
2013-12-11,2013-12-11,AAL,25.99,38395689.0,0.044614,0.044614,,,,,...,,,,,,,,,,
2013-12-12,2013-12-12,AAL,25.45,19632931.0,-0.020777,-0.020777,0.022910,,,,...,,,,,,,,,,
2013-12-13,2013-12-13,AAL,26.23,12305108.0,0.030648,0.030648,0.009234,0.054260,,,...,,,,,,,,,,
2013-12-16,2013-12-16,AAL,26.61,13068161.0,0.014487,0.014487,0.045580,0.023855,0.069534,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,2023-12-22,UBER,61.71,14715609.0,0.004068,0.004068,0.016974,-0.006600,-0.000324,-0.002425,...,0.337451,0.326811,0.314097,0.413422,0.526719,0.634702,0.979153,0.909344,0.675536,1.200785
2023-12-26,2023-12-26,UBER,61.98,16077192.0,0.004375,0.004375,0.008461,0.021424,-0.002254,0.004050,...,0.347684,0.312302,0.350033,0.471859,0.539876,0.596189,0.952741,0.992285,0.726943,1.186243
2023-12-27,2023-12-27,UBER,63.28,20805951.0,0.020974,0.020975,0.025442,0.029613,0.042848,0.018674,...,0.385289,0.345238,0.400000,0.474715,0.622980,0.656978,1.033419,1.053212,0.844898,1.179814
2023-12-28,2023-12-28,UBER,63.14,13795595.0,-0.002212,-0.002212,0.018716,0.023173,0.027335,0.040541,...,0.418558,0.356391,0.404672,0.475923,0.568306,0.627739,1.064073,0.951174,0.888158,1.144701


In [10]:
pre_data['max'] = pre_data.groupby('TICKER')['ret1'].transform('max')
pre_data['min'] = pre_data.groupby('TICKER')['ret1'].transform('min')
pre_data = pre_data[(pre_data['max'] <= pre_data['RET'].max()) & (pre_data['min'] >= pre_data['RET'].min())]

for period in period_list : 
    pre_data[f'ret{period}'] = (pre_data[f'ret{period}']-pre_data[f'ret{period}'].mean())/pre_data[f'ret{period}'].std()

pre_data.describe()

In [11]:
lag_day = 1

pre_data['median'] = pre_data.groupby('date')['ret1'].transform('median')
pre_data['win'] = (pre_data['ret1'] > pre_data['median']).astype(int)
pre_data.sort_index(inplace=True)
pre_data['prdicted_win'] = pre_data.groupby('TICKER')['win'].transform(lambda x : x.shift(-lag_day))
pre_data.dropna(inplace= True)
pre_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data['median'] = pre_data.groupby('date')['ret1'].transform('median')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data['win'] = (pre_data['ret1'] > pre_data['median']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data.sort_index(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

Unnamed: 0_level_0,date,TICKER,PRC,VOL,RET,ret1,ret2,ret3,ret4,ret5,...,ret160,ret180,ret200,ret220,ret240,max,min,median,win,prdicted_win
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-12-11,1996-12-11,VZ,59.87500,669600.0,-0.008282,-0.008282,-0.006224,-0.008282,-0.018443,-0.024440,...,-0.104673,-0.058939,-0.092803,-0.114603,-0.116236,0.146324,-0.500342,-0.010797,1,1.0
1996-12-11,1996-12-11,PDG,22.62500,768500.0,-0.042328,-0.042328,-0.010929,-0.005495,-0.005495,-0.026882,...,-0.199115,-0.236287,-0.219828,-0.184685,-0.112745,0.306878,-0.156000,-0.010797,0,1.0
1996-12-11,1996-12-11,FCX,30.75000,282300.0,-0.008065,-0.008065,-0.023810,-0.031496,-0.023810,-0.008065,...,-0.105455,-0.061069,-0.057471,0.037975,0.103139,0.296846,-0.506794,-0.010797,1,1.0
1996-12-11,1996-12-11,NEE,44.00000,316300.0,-0.008451,-0.008451,-0.016760,-0.016760,-0.032967,-0.046070,...,0.032258,-0.016760,-0.032967,-0.043478,-0.040872,0.139428,-0.749612,-0.010797,1,1.0
1996-12-11,1996-12-11,PCH,43.75000,63300.0,-0.011299,-0.011299,-0.011299,-0.002849,-0.014085,-0.005682,...,0.032448,0.008646,0.060606,0.067073,0.086957,0.089445,-0.063309,-0.010797,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-28,2023-12-28,CSGP,88.01000,1125033.0,0.004910,0.004910,0.005484,0.006174,0.010796,0.017457,...,0.166468,0.283132,0.301538,0.141801,0.085873,0.074362,-0.080828,0.001445,1,0.0
2023-12-28,2023-12-28,BR,205.83000,535483.0,0.007637,0.007637,0.018860,0.030851,0.048228,0.048228,...,0.329222,0.414348,0.502080,0.393851,0.428284,0.134394,-0.116667,0.001445,1,1.0
2023-12-28,2023-12-28,BXP,71.95000,844533.0,0.016729,0.003067,-0.001527,0.015239,0.009825,0.017249,...,0.421656,0.394921,0.346622,-0.017479,0.010534,0.239316,-0.171963,0.001445,1,0.0
2023-12-28,2023-12-28,BKNG,3550.46997,137656.0,0.004479,0.004479,-0.004536,0.004177,0.008785,0.023423,...,0.341866,0.393844,0.434099,0.465302,0.542784,0.219491,-0.172816,0.001445,1,1.0


In [13]:
a = pd.DataFrame(pre_data.groupby('date')['TICKER'].unique())
a = a.explode('TICKER').reset_index()
number = a.groupby('date').size()

In [16]:
research_set = 1
start_year = 2010
end_year = 2022
max_depth = 20
tree_num = 100

rf_predict_data = pd.DataFrame()
for y in range(start_year, end_year-2) :
    train_start = pd.to_datetime(f'{y}-01-01')
    train_end = pd.to_datetime(f'{y+2}-12-31')
    trasaction_start = pd.to_datetime(f'{y+3}-01-01')
    trasaction_end = pd.to_datetime(f'{y+3}-12-31')
    x_train = pre_data[(pre_data['date']>train_start ) & (pre_data['date'] < train_end )].iloc[:,5:36]
    x_transaction = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].iloc[:,5:36]
    y_train = pre_data[(pre_data['date']>train_start ) & (pre_data['date'] < train_end )].iloc[:,-1]
    y_transaction = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].iloc[:,-1]
    rf = RandomForestClassifier(n_estimators= tree_num, max_features='sqrt', max_depth = max_depth) #n_estimators:樹的數量；max_depth:樹的最大深度
    rf.fit(x_train, y_train) # 訓練模型
    y_pred = rf.predict_proba(x_transaction) # 預測
    win_prob = pd.DataFrame(y_pred[:,1], columns=['win_prob'])
    research_data = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].reset_index()
    combined = pd.concat([research_data, win_prob], axis=1, ignore_index=True)
    combined.columns = ['time'] + pre_data.columns.to_list() + ['win_prob']
    rf_predict_data = pd.concat([combined, rf_predict_data], axis = 0)   
    
    


In [17]:
profolio_NUM = 10

rf_predict_data.reset_index( inplace=True)

rf_predict_data = rf_predict_data.set_index(['TICKER','time']).sort_index()
#rf_predict_data['win_prob'] = rf_predict_data.groupby('TICKER')['win_prob'].transform(lambda x : x.shift(lag_day))
rf_predict_data[f't-{lag_day}_position'] = 0
long = rf_predict_data.groupby('date')['win_prob'].apply(lambda x: x.nlargest(profolio_NUM).index)
rf_predict_data.loc[long.explode(), f't-{lag_day}_position'] = 1
short = rf_predict_data.groupby('date')['win_prob'].apply(lambda x: x.nsmallest(profolio_NUM).index)
rf_predict_data.loc[short.explode(), f't-{lag_day}_position'] = -1
#rf_predict_data[(rf_predict_data['win_prob'] < 0.6) & (rf_predict_data['win_prob'] > 0.4)]['t_position'] = 0

rf_predict_data['t_position'] = rf_predict_data.groupby('TICKER')[f't-{lag_day}_position'].transform(lambda x : x.shift(lag_day))
rf_predict_data['profolio_num'] = rf_predict_data.groupby(['date','t_position'])['t_position'].transform(lambda x : x.count())
rf_predict_data['individual_ret'] = rf_predict_data['ret1']*rf_predict_data['t_position']/rf_predict_data['profolio_num']

rf_profolio_ret = rf_predict_data.groupby('date')['individual_ret'].sum()
rf_total_ret = rf_profolio_ret.sum()
rf_average_ret = rf_total_ret/len(rf_profolio_ret)
rf_average_ret

0.0009927709080047788

In [18]:
rf_profolio_ret.describe()

count    2511.000000
mean        0.000993
std         0.013388
min        -0.095190
25%        -0.005225
50%         0.000996
75%         0.007333
max         0.124709
Name: individual_ret, dtype: float64

In [19]:
research_set = 1
start_year = 2010
end_year = 2022
max_depth = 20
tree_num = 100
trained_data = pd.DataFrame()
for y in range(start_year, end_year-2) :
    train_start = pd.to_datetime(f'{y}-01-01')
    train_end = pd.to_datetime(f'{y+2}-12-31')
    trasaction_start = pd.to_datetime(f'{y+3}-01-01')
    trasaction_end = pd.to_datetime(f'{y+3}-12-31')
    x_train = pre_data[(pre_data['date']>train_start ) & (pre_data['date'] < train_end )].iloc[:,5:36]
    x_transaction = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].iloc[:,5:36]
    y_train = pre_data[(pre_data['date']>train_start ) & (pre_data['date'] < train_end )].iloc[:,-1]
    y_transaction = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].iloc[:,-1]
    GBC = HistGradientBoostingClassifier(learning_rate= 0.1, max_iter = 100, max_depth= 3, min_samples_leaf= 15)
    GBC.fit(x_train, y_train) # 訓練模型
    y_pred = GBC.predict_proba(x_transaction) # 預測
    win_prob = pd.DataFrame(y_pred[:,1], columns=['win_prob'])
    research_data = pre_data[(pre_data['date']>trasaction_start) & (pre_data['date'] < trasaction_end )].reset_index()
    combined = pd.concat([research_data, win_prob], axis=1, ignore_index=True)
    combined.columns = ['time'] + pre_data.columns.to_list() + ['win_prob']
    trained_data = pd.concat([combined, trained_data], axis = 0)  
    
    
profolio_level = 10

trained_data.reset_index(drop = True, inplace=True)

trained_data = trained_data.set_index(['TICKER', 'time']).sort_index()
#trained_data.set_index('time', inplace = True)

trained_data['win_prob'] = trained_data.groupby('TICKER')['win_prob'].transform(lambda x : x.shift(lag_day))
trained_data[f't_position'] = 0
long = trained_data.groupby('date')['win_prob'].apply(lambda x: x.nlargest(profolio_level).index)
trained_data.loc[long.explode(), f't_position'] = 1
short = trained_data.groupby('date')['win_prob'].apply(lambda x: x.nsmallest(profolio_level).index)
trained_data.loc[short.explode(), f't_position'] = -1
trained_data[(trained_data['win_prob'] < 0.6) & (trained_data['win_prob'] > 0.4)]['t_position'] = 0


trained_data['profolio_num'] = trained_data.groupby(['date','t_position'])['t_position'].transform(lambda x : x.count())
trained_data['individual_ret'] = trained_data['ret1']*trained_data['t_position']/trained_data['profolio_num']

profolio_ret = trained_data.groupby('date')['individual_ret'].sum()
total_ret = profolio_ret.sum()
average_ret = total_ret/len(profolio_ret)
average_ret 
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trained_data[(trained_data['win_prob'] < 0.6) & (trained_data['win_prob'] > 0.4)]['t_position'] = 0


0.0008401360015929908

In [20]:
#A = trained_data.loc['WM']

In [21]:
profolio_ret.describe()

count    2511.000000
mean        0.000840
std         0.015307
min        -0.118775
25%        -0.006400
50%         0.000757
75%         0.008495
max         0.105689
Name: individual_ret, dtype: float64

In [22]:
c = np.cumprod(1+profolio_ret)

In [23]:
max_depth = 20
tree_num = 1000
predictors = 31
n_estimators = int(predictors**0.5)

GBC = HistGradientBoostingClassifier(learning_rate= 0.1, max_iter = 100, max_depth= 3, min_samples_leaf= 15)
RF = RandomForestClassifier(n_estimators= n_estimators, random_state=tree_num, max_depth = max_depth)

In [24]:
trasaction_model(df= pre_data , model = GBC, research_set= research_set, start_year=start_year,
                 end_year= end_year, lag_day= lag_day, stocks_in_profolio= 100).average_return()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.predicted_data.loc[long.explode(), f't-{self.lag_day}_position'] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work t

KeyError: 'Column not found: t-1_position'

In [None]:
trasaction_model(df= pre_data, model = RF, research_set= research_set, start_year=start_year,
                 end_year= end_year, lag_day= lag_day, stocks_in_profolio= 100).average_return