In [1]:
import yfinance as yf
import pandas as pd

# WTI Crude Oil
wti = yf.download("CL=F", start="2006-12-30", end="2023-09-21")['Close']
wti.name = 'WTI'

# USD/KRW exchange rate
usdkrw = yf.download("USDKRW=X", start="2006-12-30", end="2023-09-21")['Close']
usdkrw.name = 'USDKRW'

# KOSPI 200
kospi200 = yf.download("^KS11", start="2006-12-30", end="2023-09-21")['Close']
kospi200.name = 'KOSPI200'

# Concatenate all the series into a single DataFrame
data = pd.concat([wti, usdkrw, kospi200], axis=1)

# Handling missing data
data.fillna(method='ffill', inplace=True)  # Forward fill to handle missing data
data.fillna(method='bfill', inplace=True)  # Backward fill to handle missing data at the beginning

# Calculate the forward stage for KOSPI 200
forward_days = 60
data['Forward_Return'] = data['KOSPI200'].shift(-forward_days) / data['KOSPI200'] - 1
data['forward_stage'] = pd.cut(data['Forward_Return'], bins=[-float('inf'), 0, 0.04, float('inf')], labels=['down', 'neutral', 'up'])

# Drop rows with NaN in 'forward_stage'
data.dropna(subset=['forward_stage'], inplace=True)

# Save to CSV if needed
data.to_csv('combined_data.csv')

print(data.head())


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
                  WTI      USDKRW     KOSPI200  Forward_Return forward_stage
Date                                                                        
2007-01-01  61.049999  917.229980  1435.260010        0.010110       neutral
2007-01-02  61.049999  914.419983  1435.260010        0.012520       neutral
2007-01-03  58.320000  914.309998  1409.349976        0.021563       neutral
2007-01-04  55.590000  925.239990  1397.290039        0.038403       neutral
2007-01-05  56.310001  925.440002  1385.760010        0.048197            up


In [2]:
import pandas as pd

# Load the data
data = pd.read_csv('combined_data.csv')

# Round to the second decimal place
data[['WTI', 'USDKRW', 'KOSPI200', 'Forward_Return']] = data[['WTI', 'USDKRW', 'KOSPI200', 'Forward_Return']].round(2)

# Reorder the columns
data = data[['Date', 'forward_stage', 'WTI', 'USDKRW', 'KOSPI200', 'Forward_Return']]

# Change the date format to YY-MM-DD
data['Date'] = pd.to_datetime(data['Date']).dt.strftime('%y-%m-%d')

# Save the modified data to a new CSV file
data.to_csv('modified_combined_data.csv', index=False)

print(data.head())


       Date forward_stage    WTI  USDKRW  KOSPI200  Forward_Return
0  07-01-01       neutral  61.05  917.23   1435.26            0.01
1  07-01-02       neutral  61.05  914.42   1435.26            0.01
2  07-01-03       neutral  58.32  914.31   1409.35            0.02
3  07-01-04       neutral  55.59  925.24   1397.29            0.04
4  07-01-05            up  56.31  925.44   1385.76            0.05


In [3]:
import pandas as pd

# Load the data
data = pd.read_csv('combined_data.csv')

# Round to the second decimal place
data[['WTI', 'USDKRW', 'KOSPI200', 'Forward_Return']] = data[['WTI', 'USDKRW', 'KOSPI200', 'Forward_Return']].round(2)

# Drop the 'Forward_Return' column
data.drop(columns='Forward_Return', inplace=True)

# Reorder the columns
data = data[['Date', 'forward_stage', 'WTI', 'USDKRW', 'KOSPI200']]

# Change the date format to YY-MM-DD
data['Date'] = pd.to_datetime(data['Date']).dt.strftime('%y-%m-%d')

# Save the modified data to a new CSV file
data.to_csv('1.csv', index=False)

print(data.head())


       Date forward_stage    WTI  USDKRW  KOSPI200
0  07-01-01       neutral  61.05  917.23   1435.26
1  07-01-02       neutral  61.05  914.42   1435.26
2  07-01-03       neutral  58.32  914.31   1409.35
3  07-01-04       neutral  55.59  925.24   1397.29
4  07-01-05            up  56.31  925.44   1385.76


In [23]:
# -*- coding: utf-8 -*- 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import sklearn.metrics as mt 
from sklearn.tree import export_graphviz 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedShuffleSplit 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV 
import joblib 
from sklearn.metrics import confusion_matrix 

# 1. Load the modified CSV data
model_data = pd.read_csv("modified_1.csv")

# Set 'Date' column as index
model_data.set_index('Date', inplace=True)

# 2. Generate complete data of features and label
X = model_data.drop(columns=['forward_stage'])  # Exclude the 'forward_stage' column for features
y = model_data['forward_stage']  # Use 'forward_stage' column as label

X_past = X[y.notna()] 
y_past = y[y.notna()]

# 3. Split the data into train and test
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X_past, y_past):
    X_train, X_test = X_past.iloc[train_index,], X_past.iloc[test_index,] 
    y_train, y_test = y_past.iloc[train_index], y_past.iloc[test_index]

# 4. Model fine-tuning: find optimal hyperparameters
rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=1, random_state=42)  # n_jobs를 1로 설정
param_dist_rf = {'n_estimators': [50, 100, 500], 'max_leaf_nodes': [20, 30, 40, 50], 'max_features': [1, 2]} 

rnd_search = RandomizedSearchCV(rnd_clf, param_dist_rf, cv=10, random_state=42)
rnd_search.fit(X_train, y_train)
print(rnd_search.best_params_)

# 5. Train the model and evaluate it using K-fold cross-validation
rnd_search = RandomizedSearchCV(rnd_clf, param_dist_rf, cv=10, random_state=42, n_jobs=1)  # n_jobs를 1로 설정
rnd_scores = cross_val_score(rnd_clf, X_train, y_train, scoring="accuracy", cv=10)
print("\n<10-fold cross-validation>")
print("accuracy score mean: ", rnd_scores.mean())

# 6. Train the final model
rnd_clf.fit(X_train, y_train)
print("\n<AI model: machine learning done >")
print("accuracy_score of train data(0.8 of sample): ", rnd_clf.score(X_train, y_train))

# 7. Evaluate the model on test data
print("accuracy_score of test data(0.2 of sample): ", rnd_clf.score(X_test, y_test))

# 8. Check the confusion matrix
y_test_pred = rnd_clf.predict(X_test) 
cm1 = confusion_matrix(y_test, y_test_pred, labels=["up", "neutral", "down"]) 
print("\n<Confusion matrix>")
print("(of test)")
print("up", "neutral", "down")
print(cm1)
cm2 = confusion_matrix(y_past, rnd_clf.predict(X_past), labels=["up", "neutral", "down"]) 
print("(of all)")
print("up", "neutral", "down")
print(cm2)

# 9. Check feature importance
print("\n<Feature importance>")
for name, score in zip(X.columns, rnd_clf.feature_importances_):
    print(name, ": ", score)

# 10. Generate prediction data for backtesting
y_prediction = rnd_clf.predict(X)
y_pred = pd.Series(y_prediction, index=y.index)

# 11. Save the model
joblib.dump(rnd_clf, "forecast_model.pkl")
print("\n< AI model: save >")


FileNotFoundError: [Errno 2] No such file or directory: '/modified_1.csv'