In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix
import joblib

# 데이터 불러오기 및 전처리
model_data = pd.read_csv('datasets/ks200.csv', encoding='cp949')

# Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
model_data['forward_stage'] = encoder.fit_transform(model_data['forward_stage'])

# 날짜 처리 및 정렬
model_data['Date'] = pd.to_datetime(model_data['Date'])
model_data.set_index('Date', inplace=True)
model_data.sort_index(inplace=True)

# 특성과 레이블 분리
X = model_data.drop(columns=['forward_stage'])
y = model_data['forward_stage']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 초기화 및 하이퍼파라미터 그리드 정의
rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=1, random_state=42)
param_dist_rf = {
    'n_estimators': [50, 100, 500],
    'max_leaf_nodes': [20, 30, 40, 50],
    'max_features': [1, 2, 3],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 하이퍼파라미터 튜닝
time_split_cv = TimeSeriesSplit(n_splits=10)
rnd_search = RandomizedSearchCV(rnd_clf, param_dist_rf, cv=time_split_cv, random_state=42)
rnd_search.fit(X_train, y_train)

# 최적 모델 선택 및 훈련
best_clf = rnd_search.best_estimator_
best_clf.fit(X_train, y_train)

# 모델 평가
train_score = best_clf.score(X_train, y_train)
test_score = best_clf.score(X_test, y_test)

# 10-fold cross-validation의 정확도 계산
cv_scores = cross_val_score(best_clf, X_train, y_train, cv=time_split_cv, scoring='accuracy')
cv_mean_score = cv_scores.mean()

# label encoding된 값 찾기
up = encoder.transform(['up'])[0]
neutral = encoder.transform(['neutral'])[0]
down = encoder.transform(['down'])[0]

# 혼동 행렬 확인
y_test_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred, labels=[up, neutral, down])
y_all_pred = best_clf.predict(X)
cm_all = confusion_matrix(y, y_all_pred, labels=[up, neutral, down])

# 특성 중요도 확인
feature_importance = list(zip(X_train.columns, best_clf.feature_importances_))

# 모델 저장
joblib.dump(best_clf, "separation.pkl")

# 결과 출력

# 결과를 출력합니다.
print(f"{rnd_search.best_params_}\n")
print("<10-fold cross-validation>")
print("accuracy score mean: ", cv_mean_score)
print("\n<AI model: machine learning done >")
print("accuracy_score of train data(0.8 of sample): ", train_score)
print("accuracy_score of test data(0.2 of sample): ", test_score)
print("\n<Confusion matrix>")
print("(of test)")
print("up", "neutral", "down")
print(cm_test)
print("(of all)")
print("up", "neutral", "down")
print(cm_all)
# <Feature importance>를 내림차순으로 정렬하여 출력
print("\n<Feature importance>")
sorted_feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
for name, score in sorted_feature_importance:
    print(name, ": ", score)

print("\n< AI model: save >")

{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_leaf_nodes': 50, 'max_features': 3, 'max_depth': 10}

<10-fold cross-validation>
accuracy score mean:  0.8089622641509434

<AI model: machine learning done >
accuracy_score of train data(0.8 of sample):  0.8802395209580839
accuracy_score of test data(0.2 of sample):  0.8666666666666667

<Confusion matrix>
(of test)
up neutral down
[[155  21   2]
 [ 17  98  24]
 [  1  13 254]]
(of all)
up neutral down
[[ 758   83    6]
 [  77  516  126]
 [  13   53 1291]]

<Feature importance>
per :  0.10181565568506426
pbr :  0.09912006076542794
WTI :  0.09818276130858881
6 :  0.07861861772166609
3 :  0.07282940879179021
VIX :  0.07004737747668983
1 :  0.06972662481013929
4 :  0.061193762831249165
7 :  0.05718270487859435
5 :  0.05463905881588111
USDKRW :  0.050798174108167546
11 :  0.0414016155000495
2 :  0.04019566427834848
8 :  0.03976508136992086
9 :  0.03761284578309894
10 :  0.026870585875323635

< AI model: save >
