In [8]:
import os
import pandas as pd
import numpy as np
import warnings
from datetime import datetime,timedelta
from code.utils import get_price_after
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.utils import shuffle

warnings.filterwarnings("ignore")

# 读取数据

In [9]:
##############utils config###################
time_file = '../data/input/time/NF_time.csv'
time_column = 'time'
price_folder = '../data/input/price/XAUUSD_NF_1min/'
price_prefix = 'XAUUSD'
price_column = 'price'
service_fee = 4/100000.0

args = {
    "time_file" : time_file,
    "time_column" : time_column,
    "price_folder" : price_folder,
    "price_prefix" : price_prefix,
    "price_column" : price_column,
    "leverage": 1,
    "point_base":10.0,
    "service_fee":service_fee,
    "skip_rows":False
}

In [10]:
time = pd.read_csv(time_file)

timefile_list = os.listdir('../data/input/price/XAUUSD_NF_1min')
time_list = []
for time in timefile_list:
    time_list.append(datetime.strptime(time.replace('.csv','').replace('XAUUSD','').replace('.','/'),'%Y/%m/%d'))

time_list.sort()

data = []

data_all = pd.read_csv("../data/input/x/NF.csv")

for item in data_all['date']:
    time = datetime.strptime(item,'%Y/%m/%d')
    if time in time_list:
        data.append(list(data_all[data_all.date == item].iloc[0]))

y_price = get_price_after(5,args = args,profit_point = 1000,loss_point = -1000, service_fee = service_fee,save_price = False)


# 构造模型并验证

In [11]:
df_diff = pd.DataFrame(data)
df_diff.columns = data_all.columns
df_diff['year'] = pd.to_datetime(df_diff['date']).dt.year
df_diff['y1'] = list(y_price['signal'])

df_diff['NF_diff'] = df_diff['NF_actual'] - df_diff['NF_forecast']
df_diff['UR_diff'] = df_diff['UR_actual'] - df_diff['UR_forecast']
df_diff['HS_diff'] = df_diff['HS_actual'] - df_diff['HS_forecast']

df_diff['NF_diff_AP'] = df_diff['NF_actual'] - df_diff['previous']
df_diff['UR_diff_AP'] = df_diff['UR_actual'] - df_diff['UR_previous']
df_diff['HS_diff_AP'] = df_diff['HS_actual'] - df_diff['HS_previous']

df_diff['NF_diff_FP'] = df_diff['NF_forecast'] - df_diff['previous']
df_diff['UR_diff_FP'] = df_diff['UR_forecast'] - df_diff['UR_previous']
df_diff['HS_diff_FP'] = df_diff['HS_forecast'] - df_diff['HS_previous']

bi = preprocessing.Binarizer(copy=True, threshold=-0.000001)

df_diff['NF_diff_bi'] = bi.transform(pd.DataFrame(df_diff['NF_diff']))
df_diff['UR_diff_bi'] = bi.transform(pd.DataFrame(df_diff['UR_diff']))
df_diff['HS_diff_bi'] = bi.transform(pd.DataFrame(df_diff['HS_diff']))

#划分数据集

#option1:随机打乱
# shuffle(df_diff,random_state = 0)
# percentage = 0.8
# df_train = df_diff[:int(df_diff.shape[0]*percentage)]
# df_test = df_diff[int(df_diff.shape[0]*percentage):]

#option2:按年份划分
df_train = df_diff[df_diff.year < 2018]
df_test = df_diff[df_diff.year >= 2018]

diff = ['NF_diff','UR_diff','HS_diff','NF_diff_AP','UR_diff_AP','HS_diff_AP','NF_diff_FP','UR_diff_FP','HS_diff_FP']

diff_bi = ['NF_diff_bi','UR_diff_bi','HS_diff_bi']

origin = ['NF_actual', 'NF_forecast', 'previous',
       'UR_actual', 'UR_forecast', 'UR_previous', 'HS_actual', 'HS_forecast',
       'HS_previous']

previous_price = ['y_previous_1min']

col =  origin+diff+diff_bi

X_train = df_train[col]
y_train = df_train['y1'].values

X_test = df_test[col]
y_test = df_test['y1'].values

col_scale = origin+diff#+diff_bi

#数据标准化
mean_value = df_diff[col_scale].mean()
std_value = df_diff[col_scale].std()

print("mean:",list(mean_value))
print("std:",list(std_value))

for i in range(0, len(col_scale)):
    X_train[col_scale[i]] = X_train[col_scale[i]].apply(lambda x: (x - mean_value[i])/std_value[i])
    X_test[col_scale[i]] = X_test[col_scale[i]].apply(lambda x: (x - mean_value[i])/std_value[i])

#模型定义
logreg = LogisticRegression(class_weight = 'balanced')
logreg.fit(X_train,y_train)

#模型测试
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)
print("y_pred: ",y_pred)
print("y_test: ",y_test)
y_pred_proba = logreg.predict_proba(X_test)
#print("y_pred_proba: ",logreg.predict_proba(X_test))
print("训练数据集准确率：",logreg.score(X_train[col],y_train))
print("测试集准确率：",logreg.score(X_test,y_test))

# #单个数据测试
# X_item = [196,175,33,3.8,3.8,3.8,0.1,0.2,0.4]
# #X_item = [21,0,-0.1]
# X_item = [1,1,0]
# #X_item = [21,0,-0.1,1,1,0]
# X_item = [21,0,-0.1,163,0,-0.3]


# print("signal: ",logreg.predict([X_item]))

#查看结果分类
X_train_group = X_train
X_train_group['y1'] = df_diff['y1']
X_train_group['y_pred'] = y_pred_train
X_train_group = X_train.groupby(['NF_diff_bi','UR_diff_bi','HS_diff_bi','y1','y_pred'])

X_test_group = X_test
X_test_group['y1'] = df_diff['y1']
X_test_group['y_pred'] = y_pred
X_test_group = X_test.groupby(['NF_diff_bi','UR_diff_bi','HS_diff_bi','y1','y_pred'])

pd.set_option('max_row',1000) 
pd.set_option('max_columns',1000) 

print(pd.DataFrame(X_train_group.size()))
print(pd.DataFrame(X_test_group.size()))

print(metrics.roc_auc_score(y_test, y_pred_proba[:,1]))

print("coef:",logreg.coef_)
print("intercept:",logreg.intercept_)

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random

def draw_ROC_curve(y_test,y_predict):
    '''
    画ROC曲线
    '''
    false_positive_rate,true_positive_rate,thresholds=roc_curve(y_test, y_predict)
    roc_auc=auc(false_positive_rate, true_positive_rate)
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate,'b',label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.close(0)

draw_ROC_curve(y_test, y_pred_proba[:,1])

mean: [193.79661016949152, 191.0, 195.0, 4.967796610169491, 4.996610169491524, 5.022033898305083, 0.19661016949152546, 0.23559322033898317, 0.20677966101694917, 2.7966101694915255, -0.028813559322033857, -0.03898305084745764, -1.2033898305084745, -0.05423728813559318, -0.010169491525423726, -4.0, -0.025423728813559324, 0.028813559322033895]
std: [72.19174173351011, 34.36116492746446, 76.02789869061452, 1.1652202215958565, 1.1976940634193571, 1.1923705447856685, 0.15082085936280926, 0.04829047119687413, 0.14247129992855187, 64.0500288462658, 0.11751866683363194, 0.13132868630160083, 106.54540551608403, 0.12774170314957245, 0.25168050028875716, 74.6250397100339, 0.05443509371017502, 0.15979604183267077]
y_pred:  [ 1 -1 -1  1  1 -1 -1  1 -1  1 -1  1 -1 -1  1 -1]
y_test:  [ 1 -1  1  1  1 -1  1  1 -1  1 -1  1 -1 -1  1  1]
训练数据集准确率： 0.9534883720930233
测试集准确率： 0.8125
                                            0
NF_diff_bi UR_diff_bi HS_diff_bi y1 y_pred   
0          0.0        0.0         1

# 对结果存疑的部分，可以查看具体的源数据

In [12]:
print(pd.DataFrame(df_diff[(df_diff.NF_diff_bi == 1)&(df_diff.UR_diff_bi == 1)&(df_diff.HS_diff_bi == 0)]))

df_diff_group = df_diff.groupby(['NF_diff_bi','UR_diff_bi','HS_diff_bi','y1'])
print(pd.DataFrame(df_diff_group.size()))

         date             time direction    turn  NF_actual  NF_forecast  \
3    2012/8/3   2012/8/3 20:30      Long   800.8        163          100   
21   2016/1/8   2016/1/8 21:30     Short  2800.8        292          200   
23   2016/3/4   2016/3/4 21:30     Short  4600.6        242          190   
26   2016/7/8   2016/7/8 20:30     Short  4100.1        287          175   
33   2017/2/3   2017/2/3 21:30     Short  2700.7        227          175   
37   2017/7/7   2017/7/7 20:30     Short  4000.0        222          179   
42  2017/12/8  2017/12/8 21:30      Long  4600.6        228          200   
45   2018/3/9   2018/3/9 21:30      Long  3900.9        313          200   
49   2018/7/6   2018/7/6 20:30      Long  2900.9        213          200   
56   2019/2/1   2019/2/1 13:30       NaN     NaN        311          165   
58   2019/4/5   2019/4/5 12:30       NaN     NaN        196          175   

    previous  UR_actual  UR_forecast  UR_previous  HS_actual  HS_forecast  \
3         

# 将上上个格子输出的四个参数复制到下面的函数，就可以直接用了

In [13]:
from sklearn import preprocessing
import numpy as np

def NF_score(input,threshold = 0.5):
    bi = preprocessing.Binarizer(copy=True, threshold=-0.000001)
    
    coef = [-1.03448822,  0.23754841, -0.2418439 ,  0.12376236,  0.15298568,
        0.14432753, -0.32862321, -0.72987482, -0.69119416, -1.2934256 ,
       -0.3320293 , -0.10901835, -0.52836275, -0.21826463,  0.19434201,
        0.35576964,  0.20461333,  0.39568772, -1.0742329 ,  0.36672291,
       -0.13393514]
    intercept = [0.39621412]
    
    mean = [193.79661016949152, 191.0, 195.0, 4.967796610169491, 4.996610169491524, 5.022033898305083, 0.19661016949152546, 0.23559322033898317, 0.20677966101694917, 2.7966101694915255, -0.028813559322033857, -0.03898305084745764, -1.2033898305084745, -0.05423728813559318, -0.010169491525423726, -4.0, -0.025423728813559324, 0.028813559322033895]
    std = [72.19174173351011, 34.36116492746446, 76.02789869061452, 1.1652202215958565, 1.1976940634193571, 1.1923705447856685, 0.15082085936280926, 0.04829047119687413, 0.14247129992855187, 64.0500288462658, 0.11751866683363194, 0.13132868630160083, 106.54540551608403, 0.12774170314957245, 0.25168050028875716, 74.6250397100339, 0.05443509371017502, 0.15979604183267077]
    
    NF_diff_AF = input[0] - input[1]
    UR_diff_AF = input[3] - input[4]
    HS_diff_AF = input[6] - input[7]
    NF_diff_AP = input[0] - input[2]
    UR_diff_AP = input[3] - input[5]
    HS_diff_AP = input[6] - input[8]
    NF_diff_FP = input[1] - input[2]
    UR_diff_FP = input[4] - input[5]
    HS_diff_FP = input[7] - input[8]
    
    diff = input + [NF_diff_AF,UR_diff_AF,HS_diff_AF,
            NF_diff_AP,UR_diff_AP,HS_diff_AP,
            NF_diff_FP,UR_diff_FP,HS_diff_FP]
    
    for i in range(0,len(diff)):
        diff[i] = (diff[i] - mean_value[i])/std_value[i]
    
    NF_diff_AF_bi = bi.transform(NF_diff_AF)
    UR_diff_AF_bi = bi.transform(UR_diff_AF)
    HS_diff_AF_bi = bi.transform(HS_diff_AF)
    
    data = diff + [NF_diff_AF_bi,UR_diff_AF_bi,HS_diff_AF_bi]
    
    result = 1/(1 + np.exp(-(np.dot(data, np.array(coef).T))) + intercept)
    print("是1的概率：",result)
    
    return (1 if result>=threshold else 0)


X_item = [196,175,33,3.8,3.8,3.8,0.1,0.2,0.4]
threshold = 0.5

print(NF_score(X_item,threshold))



是1的概率： [0.09170159]
0
