In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
df = pd.read_csv("CRDB_daily_10Y.csv")
df['Date'] = pd.to_datetime(df['Date'])
df.sort_values('Date', inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2461 entries, 0 to 2460
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       2461 non-null   datetime64[ns]
 1   Open       2461 non-null   float64       
 2   High       2461 non-null   float64       
 3   Low        2461 non-null   float64       
 4   Close      2461 non-null   int64         
 5   Adj_Close  2461 non-null   float64       
 6   Volume     2461 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 134.7 KB


In [3]:
df['HL_PCT']  = (df['High'] - df['Low']) / df['Low']
df['OC_PCT']  = (df['Close'] - df['Open']) / df['Open']
df['RET_1']   = df['Close'].pct_change()
df['VOL_10']  = df['RET_1'].rolling(10).std()
df['VOL_20']  = df['RET_1'].rolling(20).std()

FORECAST_HORIZON = 5
df['future_return'] = (
    df['Close'].shift(-FORECAST_HORIZON) - df['Close']
) / df['Close']

df.dropna(inplace=True)


In [4]:
features = ['HL_PCT', 'OC_PCT', 'Volume', 'RET_1', 'VOL_10', 'VOL_20']

X = df[features]
y = df['future_return']


In [5]:
split = int(len(df) * 0.7)

X_train = X.iloc[:split]
X_test  = X.iloc[split:]

y_train = y.iloc[:split]
y_test  = y.iloc[split:]


In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


In [7]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)


In [8]:
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train_scaled, y_train)

y_pred_dt = dt.predict(X_test_scaled)


In [9]:
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr   = r2_score(y_test, y_pred_lr)

rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
r2_dt   = r2_score(y_test, y_pred_dt)

print("Linear Regression RMSE:", rmse_lr)
print("Linear Regression R2:", r2_lr)

print("Decision Tree RMSE:", rmse_dt)
print("Decision Tree R2:", r2_dt)


Linear Regression RMSE: 0.04544023257890772
Linear Regression R2: -0.0028642555412816684
Decision Tree RMSE: 0.051209692081585705
Decision Tree R2: -0.27369484205395844


In [10]:
if rmse_lr < rmse_dt:
    best_model = lr
    print("Linear Regression selected")
else:
    best_model = dt
    print("Decision Tree selected")
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)


Linear Regression selected


In [11]:
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import joblib  # <-- use joblib instead of pickle

# --- Load Data ---
df = pd.read_csv("CRDB_daily_10Y.csv")
df['Date'] = pd.to_datetime(df['Date'])
df.sort_values('Date', inplace=True)
df.reset_index(drop=True, inplace=True)

# --- Feature Engineering ---
df['HL_PCT'] = (df['High'] - df['Low']) / df['Low']
df['OC_PCT'] = (df['Close'] - df['Open']) / df['Open']
df['RET_1'] = df['Close'].pct_change()
df['VOL_10'] = df['RET_1'].rolling(10).std()
df['VOL_20'] = df['RET_1'].rolling(20).std()

FORECAST_HORIZON = 5
df['future_return'] = (df['Close'].shift(-FORECAST_HORIZON) - df['Close']) / df['Close']

df.dropna(inplace=True)

# --- Features and Target ---
features = ['HL_PCT', 'OC_PCT', 'Volume', 'RET_1', 'VOL_10', 'VOL_20']
X = df[features]
y = df['future_return']

# --- Train/Test Split ---
split = int(len(df) * 0.7)
X_train = X.iloc[:split]
X_test  = X.iloc[split:]
y_train = y.iloc[:split]
y_test  = y.iloc[split:]

# --- Scale Features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --- Train Model ---
model = DecisionTreeRegressor()
model.fit(X_train_scaled, y_train)

# --- Save Model and Scaler ---
joblib.dump(scaler, "scaler.joblib")
joblib.dump(model, "model.joblib")

print("✅ Model and scaler saved successfully!")


✅ Model and scaler saved successfully!
