In [4]:
import datetime
import sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

# 为了自动处理超参数
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
import joblib

# 读取数据
data  = pd.read_csv('/Users/zengyan/Excelsior/ai-trader/temp/doge_5m_0701_0705_1_featured_targeted.csv')

data.drop(columns=['timestamp','open','high','low','close','volume'], inplace=True)
target = data['target']
only_features = data.drop(columns=['target'])

x_train ,y_train = only_features,target

print("训练集特征：")
print(x_train.shape)
print("\n训练集目标：")
print(y_train.shape)


rf = RandomForestRegressor( random_state=42, n_jobs=-1)

# 这里有更合适的参数么？
#  oob_score 用于
# 定义参数网格
param_grid = {
    "n_estimators": [5,25, 50,75, 100], 
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40,50],
    'min_samples_split': [2, 5, 10,16,24],
    'min_samples_leaf': [1, 2, 4,8,12],
    'bootstrap': [True, False]
}
# param_grid = {
#     "n_estimators": [5,25, 50], 
#     'max_features': ['sqrt'],
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [2, 4,8],
#     'bootstrap': [True, False]
# }

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
gs = gs.fit(x_train, y_train)
print(gs.best_params_)
print(gs.best_score_)



final_rf = RandomForestRegressor(n_jobs=-1,
                                 random_state=42,
                                 n_estimators=gs.best_params_['n_estimators'],
                                 max_features=gs.best_params_['max_features'],
                                 max_depth=gs.best_params_['max_depth'],
                                 min_samples_split=gs.best_params_['min_samples_split'],
                                 min_samples_leaf=gs.best_params_['min_samples_leaf'],
                                 bootstrap=gs.best_params_['bootstrap']
                                )
final_rf.fit(x_train, y_train)
df1 = pd.DataFrame(x_train.columns,columns=['Variable'])
df2 = pd.DataFrame(final_rf.feature_importances_,columns=['Importance'])
variable_importances = pd.concat([df1,df2],axis=1).sort_values(by='Importance',ascending=False)
print("训练结束，特征重要性如下：")
print(variable_importances)
# 保存模型
joblib.dump(final_rf, '/Users/zengyan/Excelsior/ai-trader/temp/doge_5m_0701_0705_1_rf_model.pkl')



训练集特征：
(1187, 10)

训练集目标：
(1187,)
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 12, 'min_samples_split': 2, 'n_estimators': 5}
-9.140056709772426e-06
训练结束，特征重要性如下：
            Variable  Importance
1          close_ema    0.303857
5                atr    0.147873
3          close_rsi    0.143666
4    close_macd_hist    0.140321
0          close_sma    0.060249
2  close_bb_position    0.059081
9         williams_r    0.045712
6              slowk    0.040476
8      vwap_position    0.029623
7              slowd    0.029142


['/Users/zengyan/Excelsior/ai-trader/temp/doge_5m_0701_0705_1_rf_model.pkl']