## Trained Models Using Market Data

In [2]:
import pandas as pd,numpy as np
df_returns=pd.read_pickle('returns')
index_adj=df_returns.index[61:]
col_filter=df_returns.dropna(axis=1).columns
df_returns_filter=df_returns[col_filter].loc[index_adj]
df_close=pd.read_pickle('close')[df_returns_filter.columns]
df_high=pd.read_pickle('high')[df_returns_filter.columns]
df_low=pd.read_pickle('low')[df_returns_filter.columns]
df_volume=pd.read_pickle('volume')[df_returns_filter.columns]
df_open=pd.read_pickle('open')[df_returns_filter.columns]

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
    LogisticRegression(),
    MLPClassifier(),
    KNeighborsClassifier(),
    LinearSVC(multi_class='crammer_singer'),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [12]:
accuracy=[]
for accu_num in range(10):
    num=10
    df_returns_filter_mean=df_returns_filter.rolling(window=3).mean()
    df_returns_rank=df_returns_filter_mean.rank(method='first',axis=1)
    threshold_1=df_returns_rank.quantile(0.33,axis=1)
    threshold_2=df_returns_rank.quantile(0.67,axis=1)
    df_tmp=df_returns_rank.copy()
    df_tmp=df_tmp.sub(threshold_1,axis=0)
    df_tmp[df_tmp<0]=-1
    df_tmp[df_tmp>0]=0
    df_tmp2=df_returns_rank.copy()
    df_tmp2=df_tmp2.sub(threshold_2,axis=0)
    df_tmp2[df_tmp2>0]=1
    df_tmp2[df_tmp2<0]=0
    df_returns_adj=df_tmp+df_tmp2
    index_tag=[i for i in range(len(df_returns_adj.index)) if i % num==0 and i != 0]
    index_label=df_returns_adj.index[index_tag]
    Y_init=df_returns_adj.loc[index_label].values
    Y=Y_init.flatten()
    df_list=[df_close,df_high,df_low,df_volume,df_open]
    for n in range(len(df_list)):
        X_temp=df_list[n].values
        X_temp=((X_temp.transpose()-np.mean(X_temp,axis=1))/np.std(X_temp,axis=1)).transpose() 
        #num=10
        num_delete=len(X_temp) % num
        width=X_temp.shape[1]
        temp=np.reshape(X_temp[:-num_delete,:],(-1,num,width))
        X_init=np.transpose(temp, [0, 2, 1])
        X=np.reshape(X_init,(-1, X_init.shape[2]))
        if n ==0:
            X_train=X.copy()
        else:
            X_train=np.concatenate((X_train,X),axis=1)
    from sklearn.model_selection import train_test_split #random_state=1234
    X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.25,random_state=None,stratify=Y)
    pred_list=[]
    accu_temp=[]
    for  clf in classifiers:
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)
        print(np.mean(prediction == y_test))
        pred_list.append(prediction)
        accu_temp.append(np.mean(prediction == y_test))
    accuracy.append(accu_temp)

0.3598738667717777
0.36342136381553014
0.3519905400078833
0.3259755616870319
0.3338588884509263
0.36105636578636185
0.3642096964919196
0.33977138352384706
0.3417422152148207
0.3618446984627513
0.3685455262120615
0.3488372093023256
0.33701221915648405
0.33937721718565234
0.3586913677571935
0.34844304296413087
0.33267638943634215
0.35041387465510443
0.36815135987386677
0.33307055577453687
0.35514387071344106
0.33070555774536853
0.3563263697280252
0.34134804887662595
0.3445013795821837
0.33070555774536853
0.3531730390224675
0.3713046905794245
0.3488372093023256
0.34371304690579424
0.3519905400078833
0.35514387071344106
0.3460780449349626
0.35474970437524633
0.34410721324398896
0.3429247142294048
0.35396137169885694
0.34134804887662595
0.3401655498620418
0.34134804887662595
0.35672053606621995
0.3401655498620418
0.3512022073314939
0.33937721718565234
0.3543555380370516
0.3665746945210879
0.3448955459203784
0.33937721718565234
0.34844304296413087
0.35829720141899885
0.36775719353567204
0.36

In [17]:
np.array(accuracy).mean(axis=0)

array([0.36338195, 0.35187229, 0.34919196, 0.34197872, 0.34647221,
       0.34820654, 0.35356721, 0.33551439, 0.35080804])

## Trained Models Using Factors

### Test on Different Factors

In [21]:
df_returns=pd.read_pickle('returns')
df_RSI=pd.read_pickle('RSI')
df_MACD=pd.read_pickle('MACD')
df_CCI=pd.read_pickle('CCI')
df_CMO=pd.read_pickle('CMO')
df_ATR=pd.read_pickle('ATR')
df_BOP=pd.read_pickle('BOP')
df_MFI=pd.read_pickle('MFI')
df_ADOSC=pd.read_pickle('ADOSC')
df_BETA=pd.read_pickle('BETA')

col_filter=df_returns.dropna(axis=1).columns
index_adj=df_returns.index[61:]
df_RSI_filter=df_RSI[col_filter].loc[index_adj]
df_MACD_filter=df_MACD[col_filter].loc[index_adj]
df_CCI_filter=df_CCI[col_filter].loc[index_adj]
df_CMO_filter=df_CMO[col_filter].loc[index_adj]
df_ATR_filter=df_ATR[col_filter].loc[index_adj]
df_BOP_filter=df_BOP[col_filter].loc[index_adj]
df_MFI_filter=df_MFI[col_filter].loc[index_adj]
df_ADOSC_filter=df_ADOSC[col_filter].loc[index_adj]
df_BETA_filter=df_BETA[col_filter].loc[index_adj]
df_returns_filter=df_returns[col_filter].loc[index_adj]

In [25]:
pred_temp3=[]
for cla_num in range(len(classifiers)):
    num=10
    df_returns_filter_mean=df_returns_filter.rolling(window=3).mean()
    df_returns_rank=df_returns_filter_mean.rank(method='first',axis=1)
    threshold_1=df_returns_rank.quantile(0.33,axis=1)
    threshold_2=df_returns_rank.quantile(0.67,axis=1)
    df_tmp=df_returns_rank.copy()
    df_tmp=df_tmp.sub(threshold_1,axis=0)
    df_tmp[df_tmp<0]=-1
    df_tmp[df_tmp>0]=0
    df_tmp2=df_returns_rank.copy()
    df_tmp2=df_tmp2.sub(threshold_2,axis=0)
    df_tmp2[df_tmp2>0]=1
    df_tmp2[df_tmp2<0]=0
    df_returns_adj=df_tmp+df_tmp2
    index_tag=[i for i in range(len(df_returns_adj.index)) if i % num==0 and i != 0]
    index_label=df_returns_adj.index[index_tag]
    Y_init=df_returns_adj.loc[index_label].values
    Y=Y_init.flatten()
    df_list=[df_RSI_filter,df_MACD_filter,df_CCI_filter,df_CMO_filter,df_ATR_filter,df_BOP_filter,df_MFI_filter,df_ADOSC_filter,df_BETA_filter]
    pred_temp2=[]
    for n in range(len(df_list)):
        X_temp=df_list[n].values
        X_temp=((X_temp.transpose()-np.mean(X_temp,axis=1))/np.std(X_temp,axis=1)).transpose() 
        #num=10
        num_delete=len(X_temp) % num
        width=X_temp.shape[1]
        temp=np.reshape(X_temp[:-num_delete,:],(-1,num,width))
        X_init=np.transpose(temp, [0, 2, 1])
        X=np.reshape(X_init,(-1, X_init.shape[2]))
        pred_temp=[]
        for train_times in range(10):
            from sklearn.model_selection import train_test_split #random_state=1234
            X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.25,random_state=None,stratify=Y)
            clf=classifiers[cla_num] 
            clf.fit(X_train, y_train)
            prediction = clf.predict(X_test)
            #print(np.mean(prediction == y_test))
            pred_temp.append(np.mean(prediction == y_test))
        print(np.mean(pred_temp)) 
        pred_temp2.append(np.mean(pred_temp))       
    pred_temp3.append(pred_temp2)

0.5347260543949546
0.4646826960977533
0.48939692550256203
0.52089081592432
0.39901458415451324
0.4963342530547892
0.4162790697674418
0.41623965313362243
0.3528182893180922
0.5309420575482854
0.40547891210090664
0.4856129286558928
0.5301537248718959
0.371344107213244
0.49743791880173427
0.4190382341348049
0.40784391013007487
0.35124162396531333
0.42380764682696104
0.3517540402049665
0.39944816712652736
0.41765865195112334
0.34666929444225464
0.43870713441072134
0.3642491131257391
0.3655892786756011
0.34004729996058336
0.49645250295624754
0.45419787150177376
0.46409144659046114
0.49936933385888843
0.38127709893575085
0.476271186440678
0.3843910130074891
0.4003941663381947
0.3361450532124557
0.4280646432794639
0.36795427670476943
0.397910918407568
0.42609381158849036
0.34237288135593225
0.4096176586519511
0.35124162396531333
0.3561292865589279
0.34059913283405596
0.4605045329128893
0.3705163579030351
0.4305478912100907
0.46420969649191957
0.3513598738667718
0.4575088687426093
0.3652345289

In [27]:
np.array(pred_temp3)

array([[0.53472605, 0.4646827 , 0.48939693, 0.52089082, 0.39901458,
        0.49633425, 0.41627907, 0.41623965, 0.35281829],
       [0.53094206, 0.40547891, 0.48561293, 0.53015372, 0.37134411,
        0.49743792, 0.41903823, 0.40784391, 0.35124162],
       [0.42380765, 0.35175404, 0.39944817, 0.41765865, 0.34666929,
        0.43870713, 0.36424911, 0.36558928, 0.3400473 ],
       [0.4964525 , 0.45419787, 0.46409145, 0.49936933, 0.3812771 ,
        0.47627119, 0.38439101, 0.40039417, 0.33614505],
       [0.42806464, 0.36795428, 0.39791092, 0.42609381, 0.34237288,
        0.40961766, 0.35124162, 0.35612929, 0.34059913],
       [0.46050453, 0.37051636, 0.43054789, 0.4642097 , 0.35135987,
        0.45750887, 0.36523453, 0.36862436, 0.33985022],
       [0.51001182, 0.38238076, 0.47315727, 0.51076074, 0.37063461,
        0.49901458, 0.40244383, 0.39018526, 0.35112337],
       [0.40626724, 0.35778479, 0.41115491, 0.41032716, 0.35802128,
        0.51340166, 0.36964919, 0.33783997, 0.34994088],


In [31]:
col_name=['RSI','MACD','CCI','CMO','ATR', 'BOP','MFI','ADOSC','BETA']
index_name=['Logic','MLP','KNeighbors','SVC','Tree','RandomForest','Ada','Gauss','Quad']
factor_comp=pd.DataFrame(np.array(pred_temp3),index=index_name,columns=col_name)
factor_comp

Unnamed: 0,RSI,MACD,CCI,CMO,ATR,BOP,MFI,ADOSC,BETA
Logic,0.534726,0.464683,0.489397,0.520891,0.399015,0.496334,0.416279,0.41624,0.352818
MLP,0.530942,0.405479,0.485613,0.530154,0.371344,0.497438,0.419038,0.407844,0.351242
KNeighbors,0.423808,0.351754,0.399448,0.417659,0.346669,0.438707,0.364249,0.365589,0.340047
SVC,0.496453,0.454198,0.464091,0.499369,0.381277,0.476271,0.384391,0.400394,0.336145
Tree,0.428065,0.367954,0.397911,0.426094,0.342373,0.409618,0.351242,0.356129,0.340599
RandomForest,0.460505,0.370516,0.430548,0.46421,0.35136,0.457509,0.365235,0.368624,0.33985
Ada,0.510012,0.382381,0.473157,0.510761,0.370635,0.499015,0.402444,0.390185,0.351123
Gauss,0.406267,0.357785,0.411155,0.410327,0.358021,0.513402,0.369649,0.33784,0.349941
Quad,0.511313,0.384628,0.469373,0.508987,0.375522,0.504533,0.408514,0.356248,0.344304


In [32]:
factor_comp.mean(axis=0)

RSI      0.478010
MACD     0.393264
CCI      0.446744
CMO      0.476495
ATR      0.366246
BOP      0.476981
MFI      0.386782
ADOSC    0.377677
BETA     0.345119
dtype: float64

In [33]:
factor_comp.mean(axis=1)

Logic           0.454487
MLP             0.444344
KNeighbors      0.383103
SVC             0.432510
Tree            0.379998
RandomForest    0.400928
Ada             0.432190
Gauss           0.390487
Quad            0.429269
dtype: float64

## Trained Models Using Combined Factors

In [41]:
pred_sigle_comb=[]
for cla_num in range(len(classifiers)):
    df_list=[df_RSI_filter,df_MACD_filter,df_CCI_filter,df_CMO_filter,df_ATR_filter,df_BOP_filter,df_MFI_filter,df_ADOSC_filter,df_BETA_filter]
    for n in range(len(df_list)):
        X_temp=df_list[n].values
        X_temp=((X_temp.transpose()-np.mean(X_temp,axis=1))/np.std(X_temp,axis=1)).transpose() 
        num=10
        num_delete=len(X_temp) % num
        width=X_temp.shape[1]
        temp=np.reshape(X_temp[:-num_delete,:],(-1,num,width))
        X_init=np.transpose(temp, [0, 2, 1])
        X=np.reshape(X_init,(-1, X_init.shape[2]))
        if n ==0:
            X_train=X.copy()
        else:
            X_train=np.concatenate((X_train,X),axis=1)
    df_returns_filter_mean=df_returns_filter.rolling(window=3).mean()
    df_returns_rank=df_returns_filter_mean.rank(method='first',axis=1)
    threshold_1=df_returns_rank.quantile(0.33,axis=1)
    threshold_2=df_returns_rank.quantile(0.67,axis=1)
    df_tmp=df_returns_rank.copy()
    df_tmp=df_tmp.sub(threshold_1,axis=0)
    df_tmp[df_tmp<0]=-1
    df_tmp[df_tmp>0]=0
    df_tmp2=df_returns_rank.copy()
    df_tmp2=df_tmp2.sub(threshold_2,axis=0)
    df_tmp2[df_tmp2>0]=1
    df_tmp2[df_tmp2<0]=0
    df_returns_adj=df_tmp+df_tmp2
    index_tag=[i for i in range(len(df_returns_adj.index)) if i % num==0 and i != 0]
    index_label=df_returns_adj.index[index_tag]
    Y_init=df_returns_adj.loc[index_label].values
    Y=Y_init.flatten()
    pred_temp=[]
    for train_times in range(10):
        from sklearn.model_selection import train_test_split #random_state=1234
        X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.25,random_state=None,stratify=Y)
        clf=classifiers[cla_num]  #cla_num
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)
        print(np.mean(prediction == y_test))
        pred_temp.append(np.mean(prediction == y_test))
    print("------------------------")
    print(np.mean(pred_temp)) 
    pred_sigle_comb.append(np.mean(pred_temp))  

0.3429247142294048
0.35514387071344106
0.3366180528182893
0.33819471817106816
0.35238470634607805
0.36381553015372486
0.34528971225857313
0.3448955459203784
0.3519905400078833
0.34371304690579424
------------------------
0.3474970437524635
0.3618446984627513
0.3433188805675995
0.35908553409538824
0.3512022073314939
0.35672053606621995
0.3535672053606622
0.3535672053606622
0.35396137169885694
0.33977138352384706
0.3602680331099724
------------------------
0.3533307055577454
0.3429247142294048
0.3488372093023256
0.346866377611352
0.34647221127315725
0.33070555774536853
0.3251872290106425
0.3346472211273157
0.34213638155301535
0.3515963736696886
0.3433188805675995
------------------------
0.341269215608987
0.3543555380370516
0.3433188805675995
0.33543555380370516
0.3165155695703587
0.34371304690579424
0.34371304690579424
0.32558139534883723
0.3405597162002365
0.3208513992905006
0.32439889633425306
------------------------
0.33484430429641304
0.33109972408356325
0.3318880567599527
0.342530

In [42]:
pred_sigle_comb

[0.3474970437524635,
 0.3533307055577454,
 0.341269215608987,
 0.33484430429641304,
 0.33716988569176193,
 0.3388253843121798,
 0.34927079227433977,
 0.3445407962160031,
 0.34907370910524244]

## Model combination

In [44]:
num=10
df_list=[df_RSI_filter,df_MACD_filter,df_CCI_filter,df_CMO_filter,df_ATR_filter,df_BOP_filter,df_MFI_filter,df_ADOSC_filter,df_BETA_filter]
pred_list=[]
df_returns_filter_mean=df_returns_filter.rolling(window=3).mean()
df_returns_rank=df_returns_filter_mean.rank(method='first',axis=1)
threshold_1=df_returns_rank.quantile(0.33,axis=1)
threshold_2=df_returns_rank.quantile(0.67,axis=1)
df_tmp=df_returns_rank.copy()
df_tmp=df_tmp.sub(threshold_1,axis=0)
df_tmp[df_tmp<0]=-1
df_tmp[df_tmp>0]=0
df_tmp2=df_returns_rank.copy()
df_tmp2=df_tmp2.sub(threshold_2,axis=0)
df_tmp2[df_tmp2>0]=1
df_tmp2[df_tmp2<0]=0
df_returns_adj=df_tmp+df_tmp2
index_tag=[i for i in range(len(df_returns_adj.index)) if i % num==0 and i != 0]
index_label=df_returns_adj.index[index_tag]
Y_init=df_returns_adj.loc[index_label].values
Y=Y_init.flatten()
pred_list_adj=[]
pred_list_all=[]
for n in range(len(df_list)):
    X_temp=df_list[n].values
    #X_temp=df_RSI_filter.values
    X_temp=((X_temp.transpose()-np.mean(X_temp,axis=1))/np.std(X_temp,axis=1)).transpose() 
    #num=10
    num_delete=len(X_temp) % num
    width=X_temp.shape[1]
    temp=np.reshape(X_temp[:-num_delete,:],(-1,num,width))
    X_init=np.transpose(temp, [0, 2, 1])
    X=np.reshape(X_init,(-1, X_init.shape[2]))
    from sklearn.model_selection import train_test_split #random_state=1234
    X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.25,random_state=1234,stratify=Y)
    pred_list=[]
    for  clf in classifiers:
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)
        print(np.mean(prediction == y_test))
        pred_list.append(prediction)
    pred_list_all.append(pred_list)
    count_1st=np.count_nonzero(np.array(pred_list) == 1,axis=0)
    count_2nd=np.count_nonzero(np.array(pred_list) == 0,axis=0)
    count_3rd=np.count_nonzero(np.array(pred_list) == -1,axis=0)
    y_pred=[]
    for i in range(len(count_1st)):
        max_=max(count_1st[i],count_2nd[i],count_3rd[i])
        if max_==count_1st[i]:
            y_pred.append(1) 
        elif max_==count_2nd[i]:
            y_pred.append(0)
        else:
            y_pred.append(-1)
    print('----------------'+str(n)+'----------------')
    print(np.mean(y_pred == y_test))
    print('----------------'+str(n)+'----------------')
    pred_list_adj.append(y_pred)

0.5273945605045329
0.5293653921955065
0.42609381158849036
0.49625541978715015
0.42569964525029563
0.47378793851005124
0.5143870713441072
0.38312968072526604
0.5147812376823019
----------------0----------------
0.5234528971225857
----------------0----------------
0.4623571147024044
0.40480882932597556
0.35908553409538824
0.4434371304690579
0.3669688608592826
0.37642885297595585
0.3819471817106819
0.36066219944816713
0.38037051635790303
----------------1----------------
0.4229404808829326
----------------1----------------
0.4946787544343713
0.48640126133228223
0.3921955065037446
0.4662987780843516
0.3941663381947182
0.42451714623571146
0.48088293259755616
0.4024438312968073
0.476941269215609
----------------2----------------
0.4931020890815924
----------------2----------------
0.5273945605045329
0.5352778872684273
0.42609381158849036
0.49625541978715015
0.41899881750098544
0.4505321245565629
0.5143870713441072
0.38312968072526604
0.5147812376823019
----------------3----------------
0.526

In [45]:
for i in range(len(pred_list_all)):
    if i ==0:
        pred_list_all_adj=np.array(pred_list_all[0])
    else:
        pred_list_all_adj=np.concatenate((pred_list_all_adj,pred_list_all[i]),axis=0)
y_pred_all=[]
count_1st=np.count_nonzero(np.array(pred_list_all_adj) == 1,axis=0)
count_2nd=np.count_nonzero(np.array(pred_list_all_adj) == 0,axis=0)
count_3rd=np.count_nonzero(np.array(pred_list_all_adj) == -1,axis=0)
for i in range(len(count_1st)):
    max_=max(count_1st[i],count_2nd[i],count_3rd[i])
    if max_==count_1st[i]:
        y_pred_all.append(1) 
    elif max_==count_2nd[i]:
        y_pred_all.append(0)
    else:
        y_pred_all.append(-1)
print('----------------'+'final'+'----------------')
print(np.mean(y_pred_all == y_test))

----------------final----------------
0.5309420575482854
