In [None]:
import pandas as pd
import numpy as np
import ta
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Classification Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# Metrics
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score

--- Step 1: Libraries Imported Successfully ---


In [2]:
print("\n--- Step 2: Loading Data and Engineering Features ---")
df = pd.read_csv('1hour.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y:%m:%d %H:%M:%S')
df = df.sort_values('timestamp').set_index('timestamp')

# --- Create Target Variables for BOTH models ---
# Classification Target: 1 if the price goes up, 0 if it goes down.
df['Target_Movement'] = (df['close'].shift(-1) > df['close']).astype(int)
# Regression Target: The actual closing price of the next hour.
df['Target_Close'] = df['close'].shift(-1)

# --- Engineer New Features ---
df['price_change'] = df['close'] - df['open']
df['high_low_diff'] = df['high'] - df['low']
for window in [6, 12, 24]:
    df[f'rolling_mean_close_{window}h'] = df['close'].rolling(window=window).mean()
    df[f'rolling_std_close_{window}h'] = df['close'].rolling(window=window).std()
df['rsi'] = ta.momentum.RSIIndicator(close=df['close'], window=14).rsi()
df['macd'] = ta.trend.MACD(close=df['close']).macd()
df['hour_sin'] = np.sin(2 * np.pi * df.index.hour / 24)
df['dayofweek_cos'] = np.cos(2 * np.pi * df.index.dayofweek / 7)
print("✅ New features created.")


--- Step 2: Loading Data and Engineering Features ---
✅ New features created.


In [3]:
print("\n--- Step 3: Lagging All Features to Prevent Data Leakage ---")
feature_cols_to_lag = [col for col in df.columns if 'Target' not in col]
for col in feature_cols_to_lag:
    df[f'{col}_lag1'] = df[col].shift(1)
df.dropna(inplace=True)
print("✅ All features lagged. Data is ready for modeling.")


--- Step 3: Lagging All Features to Prevent Data Leakage ---
✅ All features lagged. Data is ready for modeling.


In [4]:
print("\n--- Step 4: Defining Final Feature Set and Targets ---")
FINAL_FEATURE_COLS = [col for col in df.columns if '_lag1' in col]
CLS_TARGET_COL = 'Target_Movement'
REG_TARGET_COL = 'Target_Close'

X = df[FINAL_FEATURE_COLS]
y_cls = df[CLS_TARGET_COL]
y_reg = df[REG_TARGET_COL]
print(f"We have {len(FINAL_FEATURE_COLS)} features for our models.")


--- Step 4: Defining Final Feature Set and Targets ---
We have 17 features for our models.


In [5]:
print("\n\n" + "="*50)
print("🚀 STARTING CLASSIFICATION MODEL PIPELINE 🚀")
print("="*50)

# --- 5a. Split data for Classification ---
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X, y_cls, test_size=0.2, random_state=42, stratify=y_cls)

# --- 5b. Find Best Baseline Classification Model ---
print("\n--- Step 5a: Evaluating Classification Models with Normal Parameters ---")
cls_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}
baseline_accuracies = {}
for name, model in cls_models.items():
    pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', model)])
    pipeline.fit(X_train_cls, y_train_cls)
    y_pred = pipeline.predict(X_test_cls)
    accuracy = accuracy_score(y_test_cls, y_pred)
    baseline_accuracies[name] = accuracy
    print(f"  Accuracy of {name}: {accuracy:.2%}")

best_cls_model_name = max(baseline_accuracies, key=baseline_accuracies.get)
print(f"\n🏆 Best Baseline Classifier: '{best_cls_model_name}'")

# --- 5c. Hyperparameter Tune the Best Classification Model ---
print(f"\n--- Step 5b: Performing Hyperparameter Tuning for {best_cls_model_name} ---")
cls_param_grids = {
    "Random Forest": {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [10, 20]},
    "Decision Tree": {'classifier__max_depth': [10, 20, 30]},
    "XGBoost": {'classifier__n_estimators': [100, 200], 'classifier__learning_rate': [0.05, 0.1], 'classifier__max_depth': [3, 5]}
}
chosen_cls_model = cls_models[best_cls_model_name]
chosen_cls_grid = cls_param_grids[best_cls_model_name]
final_cls_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', chosen_cls_model)])
grid_search_cls = GridSearchCV(final_cls_pipeline, chosen_cls_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search_cls.fit(X_train_cls, y_train_cls)

# --- 5d. Final Evaluation of Tuned Classification Model ---
print("\n--- Step 5c: Final Evaluation of Tuned Classification Model ---")
final_cls_model = grid_search_cls.best_estimator_
y_pred_cls_final = final_cls_model.predict(X_test_cls)
final_accuracy = accuracy_score(y_test_cls, y_pred_cls_final)
print(f"Initial Accuracy of {best_cls_model_name}: {baseline_accuracies[best_cls_model_name]:.2%}")
print(f"**Final Tuned Accuracy of {best_cls_model_name}: {final_accuracy:.2%}**")
print("\nFinal Classification Report:")
print(classification_report(y_test_cls, y_pred_cls_final, target_names=['Down 🔽', 'Up 🔼']))



🚀 STARTING CLASSIFICATION MODEL PIPELINE 🚀

--- Step 5a: Evaluating Classification Models with Normal Parameters ---
  Accuracy of Random Forest: 49.42%
  Accuracy of Decision Tree: 50.31%
  Accuracy of XGBoost: 50.44%

🏆 Best Baseline Classifier: 'XGBoost'

--- Step 5b: Performing Hyperparameter Tuning for XGBoost ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

--- Step 5c: Final Evaluation of Tuned Classification Model ---
Initial Accuracy of XGBoost: 50.44%
**Final Tuned Accuracy of XGBoost: 52.27%**

Final Classification Report:
              precision    recall  f1-score   support

      Down 🔽       0.52      0.45      0.48      5285
        Up 🔼       0.53      0.60      0.56      5487

    accuracy                           0.52     10772
   macro avg       0.52      0.52      0.52     10772
weighted avg       0.52      0.52      0.52     10772



In [6]:
print("\n\n" + "="*50)
print("🚀 STARTING REGRESSION MODEL PIPELINE 🚀")
print("="*50)

# --- 6a. Split data for Regression using TimeSeriesSplit ---
# For regression on time series, it's better not to shuffle the data.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train_reg, X_test_reg = X.iloc[train_index], X.iloc[test_index]
    y_train_reg, y_test_reg = y_reg.iloc[train_index], y_reg.iloc[test_index]

# --- 6b. Find Best Baseline Regression Model ---
print("\n--- Step 6a: Evaluating Regression Models with Normal Parameters ---")
reg_models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}
baseline_reg_scores = {}
for name, model in reg_models.items():
    pipeline = Pipeline([('scaler', StandardScaler()), ('regressor', model)])
    pipeline.fit(X_train_reg, y_train_reg)
    y_pred_reg = pipeline.predict(X_test_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    baseline_reg_scores[name] = r2
    print(f"  R² Score of {name}: {r2:.4f}")

best_reg_model_name = max(baseline_reg_scores, key=baseline_reg_scores.get)
print(f"\n🏆 Best Baseline Regressor: '{best_reg_model_name}'")

# --- 6c. Hyperparameter Tune the Best Regression Model ---
print(f"\n--- Step 6b: Performing Hyperparameter Tuning for {best_reg_model_name} ---")
reg_param_grids = {
    "Linear Regression": {},
    "Random Forest": {'regressor__n_estimators': [100, 200], 'regressor__max_depth': [10, 20]},
    "XGBoost": {'regressor__n_estimators': [100, 200], 'regressor__learning_rate': [0.05, 0.1], 'regressor__max_depth': [3, 5]}
}
chosen_reg_model = reg_models[best_reg_model_name]
chosen_reg_grid = reg_param_grids[best_reg_model_name]
final_reg_pipeline = Pipeline([('scaler', StandardScaler()), ('regressor', chosen_reg_model)])
grid_search_reg = GridSearchCV(final_reg_pipeline, chosen_reg_grid, cv=tscv, n_jobs=-1, scoring='r2', verbose=1)
grid_search_reg.fit(X, y_reg) # Use full dataset for CV tuning in regression

# --- 6d. Final Evaluation of Tuned Regression Model ---
print("\n--- Step 6c: Final Evaluation of Tuned Regression Model ---")
final_reg_model = grid_search_reg.best_estimator_
y_pred_reg_final = final_reg_model.predict(X_test_reg)
final_r2 = r2_score(y_test_reg, y_pred_reg_final)
final_mse = mean_squared_error(y_test_reg, y_pred_reg_final)
print(f"Initial R² Score of {best_reg_model_name}: {baseline_reg_scores[best_reg_model_name]:.4f}")
print(f"**Final Tuned R² Score of {best_reg_model_name}: {final_r2:.4f}**")
print(f"Final Mean Squared Error: {final_mse:.4f}")



🚀 STARTING REGRESSION MODEL PIPELINE 🚀

--- Step 6a: Evaluating Regression Models with Normal Parameters ---
  R² Score of Linear Regression: 0.9988
  R² Score of Random Forest: 0.9961
  R² Score of XGBoost: 0.9867

🏆 Best Baseline Regressor: 'Linear Regression'

--- Step 6b: Performing Hyperparameter Tuning for Linear Regression ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits

--- Step 6c: Final Evaluation of Tuned Regression Model ---
Initial R² Score of Linear Regression: 0.9988
**Final Tuned R² Score of Linear Regression: 0.9988**
Final Mean Squared Error: 24465.6911


In [7]:
def predict_on_historical_data():
    """
    Takes a random sample from the unseen test set, makes a prediction,
    and displays the results. This demonstrates the model's functionality
    without needing live data or user input.
    """
    print("\n\n" + "="*50)
    print("🤖 PREDICTING ON A SAMPLE FROM THE HISTORICAL TEST SET 🤖")
    print("="*50)

    try:
        # --- 4a. Select a Random Unseen Data Point ---
        # We take one random row from our test set (X_test).
        # The model has not been trained on this data.
        sample_features = X_test.sample(1, random_state=42)
        
        print("Using the following random data point from the test set:")
        # .T transposes the data for better readability
        print(sample_features.T)

        # --- 4b. Make Predictions ---
        # Scale the features using the scaler we trained in Part 1
        sample_features_scaled = scaler.transform(sample_features)

        # Classification Prediction using the final tuned model from Part 3
        cls_pred_raw = final_cls_model.predict(sample_features_scaled)
        cls_proba = final_cls_model.predict_proba(sample_features_scaled)
        cls_pred = "UP 🔼" if cls_pred_raw[0] == 1 else "DOWN 🔽"
        confidence = cls_proba[0][cls_pred_raw[0]]

        # Regression Prediction (using the unscaled features for tree-based models)
        # using the final tuned model from Part 2
        reg_pred = final_reg_model.predict(sample_features)

        # --- 4c. Display Results ---
        print("\n--- PREDICTION RESULTS ---")
        print(f"🧭 Predicted Next Hour's Movement: {cls_pred} (Confidence: {confidence:.2%})")
        print(f"💲 Predicted Next Hour's Close Price: ${reg_pred[0]:,.2f}")

    except Exception as e:
        print(f"\nAn error occurred during the prediction process: {e}")

In [8]:
predict_on_historical_data()



🤖 PREDICTING ON A SAMPLE FROM THE HISTORICAL TEST SET 🤖

An error occurred during the prediction process: name 'X_test' is not defined
