In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# 데이터 불러오기
data_scaled = pd.read_csv('mydatasets/8_tech.csv')

# Label Encoding
le = LabelEncoder()
data_scaled['forward_stage'] = le.fit_transform(data_scaled['forward_stage'])

# 특성과 레이블 분리
X = data_scaled.drop(columns=['Date', 'forward_stage'])
y = data_scaled['forward_stage']

# 데이터를 훈련 세트와 테스트 세트로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델 초기화 및 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 예측 수행
predictions = rf_model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

# 평가 결과 출력
print(f'Accuracy: {accuracy:.2f}')
print("Classification Report:")
print(classification_rep)

# 특성 중요도 시각화
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances.nlargest(10).plot(kind='barh')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()


KeyError: 'forward_stage'

In [28]:
import pandas as pd

# 데이터 불러오기
data_scaled = pd.read_csv('Scaled.csv')

# 클래스 별 개수 확인
class_counts = data_scaled['forward_stage'].value_counts()

print(class_counts)


forward_stage
down       1355
up          847
neutral     716
Name: count, dtype: int64


In [33]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# 특성과 레이블 분리
X = data_scaled.drop(columns=['Date', 'forward_stage'])
y = data_scaled['forward_stage']

# SMOTE 적용
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# 클래스 별 개수 확인
print(f'Original dataset shape: {Counter(y)}')
print(f'Resampled dataset shape: {Counter(y_res)}')


Original dataset shape: Counter({'down': 1355, 'up': 847, 'neutral': 716})
Resampled dataset shape: Counter({'down': 1355, 'neutral': 1355, 'up': 1355})


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

# 데이터 불러오기
data_scaled = pd.read_csv('Scaled.csv')

# 특성과 레이블 분리
X = data_scaled.drop(columns=['Date', 'forward_stage'])
y = data_scaled['forward_stage']

# SMOTE 적용
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# 데이터를 훈련 세트와 테스트 세트로 분리
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델 초기화 및 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 예측 수행
predictions = rf_model.predict(X_test)

# 모델 평가
acc = accuracy_score(y_test, predictions)
class_report = classification_report(y_test, predictions)

# 평가 결과 출력
print(f'Accuracy: {acc:.2f}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

        down       0.95      0.91      0.93       289
     neutral       0.88      0.89      0.89       247
          up       0.94      0.96      0.95       277

    accuracy                           0.92       813
   macro avg       0.92      0.92      0.92       813
weighted avg       0.92      0.92      0.92       813

