In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
from sklearn.preprocessing import LabelEncoder

# Train a RandomForest on the merged dataset with a chronological hold-out,
# tune it with a randomized search over time-ordered CV folds, then report
# accuracy, confusion matrices and feature importances and persist the model.

# Load the merged dataset.
# NOTE(review): this cell reads the *finance* CSV but the model is saved below
# as "best_model_energy.pkl" — confirm which of the two names is intended.
model_data = pd.read_csv('merged_data/merged_data_finance.csv', encoding='utf-8')

# Encode the label column as integers. LabelEncoder assigns codes in the
# sorted order of the raw values, so encoder.classes_[i] is the raw label
# behind code i; that array is the only reliable header for the matrices below.
encoder = LabelEncoder()
model_data['레이블'] = encoder.fit_transform(model_data['레이블'])
class_codes = list(range(len(encoder.classes_)))

# Parse the date column, use it as the index and sort chronologically.
model_data['날짜'] = pd.to_datetime(model_data['날짜'])
model_data.set_index('날짜', inplace=True)
model_data.sort_index(inplace=True)

# Separate features and target.
X = model_data.drop(columns=['레이블'])
y = model_data['레이블']

# Time-based split: rows strictly before split_date train, the rest test.
split_date = '2019-01-01'
X_train = X.loc[X.index < split_date]
y_train = y.loc[y.index < split_date]
X_test = X.loc[X.index >= split_date]
y_test = y.loc[y.index >= split_date]

# Base model and the hyper-parameter distributions to sample from.
rnd_clf = RandomForestClassifier(random_state=42)
param_dist_rf = {
    'n_estimators': [50, 100, 500],
    'max_leaf_nodes': [20, 30, 40, 50],
    'max_features': [1, 2, 3],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search; TimeSeriesSplit keeps every validation fold later in
# time than the rows it was trained on.
time_split_cv = TimeSeriesSplit(n_splits=5)
rnd_search = RandomizedSearchCV(rnd_clf, param_dist_rf, cv=time_split_cv, random_state=42, n_iter=10)
rnd_search.fit(X_train, y_train)

# With the default refit=True the search has already refit the best
# configuration on all of (X_train, y_train), so no second fit is needed.
best_clf = rnd_search.best_estimator_

# Accuracy on the training and hold-out periods.
train_score = best_clf.score(X_train, y_train)
test_score = best_clf.score(X_test, y_test)

# Confusion matrices. labels= pins rows/columns to every encoded class so the
# shape and ordering do not depend on which classes occur in a given slice.
y_test_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred, labels=class_codes)
# "all" includes the training rows, so it is not an out-of-sample estimate.
y_all_pred = best_clf.predict(X)
cm_all = confusion_matrix(y, y_all_pred, labels=class_codes)

# Pair each feature name with its importance.
feature_importance = list(zip(X_train.columns, best_clf.feature_importances_))

# Persist the tuned model.
joblib.dump(best_clf, "best_model_energy.pkl")

# Report.
print(f"{rnd_search.best_params_}\n")
print("<Train and Test Score>")
print("Train Accuracy: ", train_score)
print("Test Accuracy: ", test_score)
print("\n<Confusion Matrix>")
print("(of test)")
# Header comes from the encoder so it always matches the matrix ordering.
print(*encoder.classes_)
print(cm_test)
print("(of all)")
print(*encoder.classes_)
print(cm_all)
print("\n<Feature Importance>")
sorted_feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
for name, score in sorted_feature_importance:
    print(name, ": ", score)

print("\n<Model Saved>")


{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_leaf_nodes': 50, 'max_features': 2, 'max_depth': 10}

<Train and Test Score>
Train Accuracy:  0.8959276018099548
Test Accuracy:  0.5357483317445186

<Confusion Matrix>
(of test)
up neutral down
[[217   1 282]
 [ 35   0 140]
 [ 29   0 345]]
(of all)
up neutral down
[[748  23 284]
 [ 86 345 169]
 [ 31  32 657]]

<Feature Importance>
통화량 :  0.06434457718658909
WTI :  0.06138682227303774
환율 :  0.06089247270886374
OBV :  0.05512438515065668
수출금액지수 :  0.054768329577546106
경제심리지수 :  0.052489437945007905
소비자물가지수 :  0.05152354222330329
PER :  0.050339545529714425
수입금액지수 :  0.049231232802296764
ATR :  0.04915455155992752
고용률 :  0.048504518860091064
미국경제성장률 :  0.04811175025304437
PBR :  0.045881807722451005
경기종합지수 :  0.04184905467592656
ADX :  0.03678181869535027
MACD :  0.03588945064795826
실업률 :  0.03570027621772538
기준금리 :  0.03536897827713891
한국경제성장률 :  0.034668763442921496
VIX :  0.03337554966694079
MFI :  0.019697052229

In [9]:
%pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting joblib>=1.1.1 (from imbalanced-learn->imblearn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
   ---------------------------------------- 235.6/235.6 kB 1.6 MB/s eta 0:00:00
Downloading joblib-1.3.2-py3-none-any.whl (

In [12]:
# 데이터 불균형 처리

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import joblib

# Train a RandomForest on the energy dataset with SMOTE oversampling, tuned by
# grid search over time-ordered CV folds, then report metrics and persist the
# fitted classifier.

# Local import: the imblearn Pipeline applies samplers during fit only, which
# lets SMOTE run inside each CV training fold instead of before the search.
from imblearn.pipeline import Pipeline

# Load the merged dataset.
model_data = pd.read_csv('merged_data/merged_data_energy.csv', encoding='utf-8')

# Encode the label column as integers. LabelEncoder assigns codes in the
# sorted order of the raw values, so encoder.classes_[i] is the raw label
# behind code i.
encoder = LabelEncoder()
model_data['레이블'] = encoder.fit_transform(model_data['레이블'])
class_codes = list(range(len(encoder.classes_)))

# Parse the date column, use it as the index and sort chronologically.
model_data['날짜'] = pd.to_datetime(model_data['날짜'])
model_data.set_index('날짜', inplace=True)
model_data.sort_index(inplace=True)

# Separate features and target.
X = model_data.drop(columns=['레이블'])
y = model_data['레이블']

# Chronological 80/20 hold-out. shuffle=False keeps the date order so the
# test set is strictly the most recent 20% of rows; a shuffled/stratified
# split would train on rows that are later in time than the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# SMOTE + classifier as one estimator. Oversampling before the search would
# append synthetic rows with no position in time and let neighbours of
# validation rows leak into the training folds; inside the pipeline SMOTE only
# ever sees the training part of each fold.
# NOTE(review): SMOTE needs more samples per class than its k_neighbors in
# every training fold — confirm the earliest TimeSeriesSplit fold satisfies it.
smote = SMOTE(random_state=42)
rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
resample_then_fit = Pipeline(steps=[('smote', smote), ('clf', rnd_clf)])

# Hyper-parameter grid for the forest.
param_grid_rf = {
    'n_estimators': [50, 100, 500],
    'max_leaf_nodes': [20, 30, 40, 50],
    'max_features': [1, 2, 3],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Pipeline parameters are addressed as "<step>__<param>".
pipeline_param_grid = {f'clf__{name}': values for name, values in param_grid_rf.items()}

# Exhaustive search: 972 parameter combinations x 10 folds = 9,720 fits plus
# one final refit, so expect a long run time.
time_split_cv = TimeSeriesSplit(n_splits=10)
grid_search = GridSearchCV(resample_then_fit, pipeline_param_grid, cv=time_split_cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# With the default refit=True the best pipeline has already been refit on all
# of (X_train, y_train). Keep only its classifier step so best_clf — and the
# pickle written below — is a plain RandomForestClassifier.
best_clf = grid_search.best_estimator_.named_steps['clf']
best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}

# Accuracy on the real (non-synthetic) training rows and on the hold-out.
train_score = best_clf.score(X_train, y_train)
test_score = best_clf.score(X_test, y_test)

# Confusion matrices. labels= pins rows/columns to every encoded class so the
# shape and ordering do not depend on which classes occur in a given slice.
y_test_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred, labels=class_codes)
# "all" includes the training rows, so it is not an out-of-sample estimate.
y_all_pred = best_clf.predict(X)
cm_all = confusion_matrix(y, y_all_pred, labels=class_codes)

# Pair each feature name with its importance.
feature_importance = list(zip(X_train.columns, best_clf.feature_importances_))

# Persist the tuned classifier.
joblib.dump(best_clf, "separation111.pkl")

# Report.
print(f"{best_params}\n")
print("<Train and Test Score>")
print("Train Accuracy: ", train_score)
print("Test Accuracy: ", test_score)
print("\n<Confusion Matrix>")
print("(of test)")
# Header comes from the encoder so it always matches the matrix ordering.
print(*encoder.classes_)
print(cm_test)
print("(of all)")
print(*encoder.classes_)
print(cm_all)
print("\n<Feature Importance>")
sorted_feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
for name, score in sorted_feature_importance:
    print(name, ": ", score)

print("\n<Model Saved>")


exception calling callback for <Future at 0x25ba0da70d0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))


KeyboardInterrupt: 

In [14]:
# 시계열 데이터의 특성을 직접적으로 고려

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
#import joblib
#from imblearn.over_sampling import SMOTE

# Train a RandomForest on the energy dataset with a chronological hold-out,
# tune it by grid search over time-ordered CV folds, then report metrics and
# persist the model.

# joblib.dump is called below; import it here so the cell runs on a fresh
# kernel instead of relying on an import made by an earlier cell.
import joblib

# Load the merged dataset.
model_data = pd.read_csv('merged_data/merged_data_energy.csv', encoding='utf-8')

# Encode the label column as integers. LabelEncoder assigns codes in the
# sorted order of the raw values, so encoder.classes_[i] is the raw label
# behind code i.
encoder = LabelEncoder()
model_data['레이블'] = encoder.fit_transform(model_data['레이블'])
class_codes = list(range(len(encoder.classes_)))

# Parse the date column, use it as the index and sort chronologically.
model_data['날짜'] = pd.to_datetime(model_data['날짜'])
model_data.set_index('날짜', inplace=True)
model_data.sort_index(inplace=True)

# Separate features and target.
X = model_data.drop(columns=['레이블'])
y = model_data['레이블']

# Time-based split. Label slices such as X[:split_time] and X[split_time:]
# are both endpoint-inclusive on a DatetimeIndex, which would place rows dated
# split_time in train AND test; boolean masks make the two sets disjoint.
split_time = '2020-01-01'
X_train = X.loc[X.index < split_time]
y_train = y.loc[y.index < split_time]
X_test = X.loc[X.index >= split_time]
y_test = y.loc[y.index >= split_time]

# Base model and hyper-parameter grid.
rnd_clf = RandomForestClassifier(n_jobs=1, random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 500],
    'max_leaf_nodes': [20, 30, 40, 50],
    'max_features': [1, 2, 3],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search: 972 parameter combinations x 10 folds = 9,720 fits plus
# one final refit, so expect a long run time.
time_split_cv = TimeSeriesSplit(n_splits=10)
grid_search = GridSearchCV(rnd_clf, param_grid_rf, cv=time_split_cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# With the default refit=True the search has already refit the best
# configuration on all of (X_train, y_train), so no second fit is needed.
best_clf = grid_search.best_estimator_

# Accuracy on the training and hold-out periods.
train_score = best_clf.score(X_train, y_train)
test_score = best_clf.score(X_test, y_test)

# Confusion matrix on the hold-out. labels= pins rows/columns to every encoded
# class so the shape does not depend on which classes occur in the test set.
y_test_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred, labels=class_codes)

# Pair each feature name with its importance.
feature_importance = list(zip(X_train.columns, best_clf.feature_importances_))

# Persist the tuned model.
joblib.dump(best_clf, "model.pkl")

# Report.
print(f"{grid_search.best_params_}\n")
print("<Train and Test Score>")
print("Train Accuracy: ", train_score)
print("Test Accuracy: ", test_score)
print("\n<Confusion Matrix>")
print("(of test)")
# Header comes from the encoder so it always matches the matrix ordering.
print(*encoder.classes_)
print(cm_test)
print("\n<Feature Importance>")
sorted_feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
for name, score in sorted_feature_importance:
    print(name, ": ", score)

print("\n<Model Saved (if needed)>")


KeyboardInterrupt: 