In [1]:
# Cell 1: Imports & Configuration
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
from datetime import datetime
from tqdm.notebook import tqdm

# Database
from sqlalchemy import create_engine
import pymysql

# Metrics and Preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.neural_network import MLPRegressor

# Hypertuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

warnings.filterwarnings("ignore")
print("Libraries imported.")

# Database Configuration
db_config = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '',  
    'database': 'trading_system'
}
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}/{db_config['database']}"
engine = create_engine(db_url)

# Path Definition
base_dir = Path.cwd().parent
output_dir = base_dir / "4_Results" / "Tuning_Runs"
output_dir.mkdir(exist_ok=True)

print(f"Results will be saved to: {output_dir}")

Libraries imported.
Results will be saved to: c:\Users\18kyu\Desktop\Unishit\IR\4_Results\Tuning_Runs


In [2]:
# Cell 2: Load 'model_features' table from database
print("Loading 'model_features' table from database.")

try:
    df = pd.read_sql("SELECT * FROM model_features", con=engine)
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['ticker', 'date'])
    
    print(f"Successfully loaded {len(df):,} rows.")
    print(f"Data date range: {df['date'].min().date()} to {df['date'].max().date()}")
    print(f"Number of unique tickers: {df['ticker'].nunique()}")

except Exception as e:
    print(f"ERROR loading data: {e}")
    raise


Loading 'model_features' table from database.
Successfully loaded 1,855,010 rows.
Data date range: 2010-04-11 to 2020-05-31
Number of unique tickers: 4028


In [3]:
# Cell 3: Define Feature Sets
print("\n Defining feature sets for model training.")

# 1. Price_Only
price_features = [
    'prev_close', 'ma_3', 'ma_5', 'ma_10', 'volatility',
    'rsi', 'momentum_3', 'momentum_5', 'volume'
]

# 2. Price_Base_Sentiment
base_sentiment_features = price_features + [
    'textblob_polarity', 'vader_compound', 'finbert_compound'
]

# 3. Price_Full_Sentiment
full_sentiment_features = price_features + [
    'textblob_polarity', 'vader_compound', 'finbert_compound',
    'news_count', 'has_news', 'days_since_news', 'sentiment_freshness',
    'textblob_polarity_volatility', 'vader_compound_volatility',
    'finbert_compound_volatility', 'finbert_momentum_3',
    'vader_momentum_3', 'textblob_momentum_3'
]

# Store feature sets
feature_sets = {
    "Price_Only": price_features,
    "Price_Base_Sentiment": base_sentiment_features,
    "Price_Full_Sentiment": full_sentiment_features
}

print(f"Total feature sets: {len(feature_sets)}")


 Defining feature sets for model training.
Total feature sets: 3


In [4]:
# Cell 4: Define Models & Parameter Grids for Tuning
print("Defining models and hyperparameter search spaces.")

# 1. Ridge
grid_ridge = {
    'alpha': uniform(0.1, 10.0)  # Sample from 0.1 to 10.0
}

# 2. Lasso
grid_lasso = {
    'alpha': uniform(0.001, 1.0) # Sample from 0.001 to 1.0
}

# 3. SVR
grid_svr = {
    'C': uniform(0.1, 10.0),
    'gamma': ['scale', 'auto'] + list(uniform(0.001, 0.1).rvs(3)),
    'kernel': ['rbf'] # Stick with RBF as it's most common
}

# 4. Random Forest
grid_rf = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(randint(5, 20).rvs(3)),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 1.0]
}

# 5. XGBoost
grid_xgb = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3), # 0.7 to 1.0
    'colsample_bytree': uniform(0.7, 0.3)
}

# 6. MLP Regressor
grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (64, 32), (50, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': uniform(0.0001, 0.01),
    'learning_rate': ['constant', 'adaptive']
}

models_to_tune = {
    "Ridge": (Ridge(random_state=42), grid_ridge),
    "Lasso": (Lasso(random_state=42), grid_lasso),
    "SVR": (SVR(), grid_svr),
    "RandomForest": (RandomForestRegressor(random_state=42), grid_rf),
    "XGBoost": (xgb.XGBRegressor(objective='reg:squarederror', random_state=42), grid_xgb),
    "MLP": (MLPRegressor(random_state=42, max_iter=500, early_stopping=True), grid_mlp)
}

print(f"Configured {len(models_to_tune)} model families for tuning.")


Defining models and hyperparameter search spaces.
Configured 6 model families for tuning.


In [5]:
# Cell 5: Define Helper Functions
def calculate_metrics(y_true, y_pred, model_name, ticker, feature_set_name):
    """Calculate metrics with safety checks"""
    mask = ~(np.isnan(y_true) | np.isnan(y_pred) | np.isinf(y_true) | np.isinf(y_pred))
    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]
    
    if len(y_true_clean) == 0:
        return None
    
    rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    r2 = r2_score(y_true_clean, y_pred_clean)
    correct_direction = np.sign(y_true_clean) == np.sign(y_pred_clean)
    directional_accuracy = np.mean(correct_direction) * 100
    
    return {
        'ticker': ticker,
        'model': model_name,
        'feature_set': feature_set_name,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'directional_accuracy': directional_accuracy
    }

print("Helper functions defined.")


Helper functions defined.


In [6]:
# Cell 6: Main Tuning Loop

print("Starting Model Hypertuning Loop.")

all_results = []
top_10_tickers = df['ticker'].value_counts().head(10).index.tolist()

N_ITER = 10  # Number of parameter combinations to try. Increase for more thoroughness.
CV_FOLDS = 3 # Number of cross-validation folds.

print(f"Will tune models for {len(top_10_tickers)} tickers.")
print(f"Using RandomizedSearchCV with n_iter={N_ITER} and cv={CV_FOLDS}.")

for ticker in tqdm(top_10_tickers, desc="Processing Tickers"):
    ticker_df = df[df['ticker'] == ticker]
    
    # 1. Split data
    split_idx = int(len(ticker_df) * 0.8)
    train_df = ticker_df.iloc[:split_idx]
    test_df = ticker_df.iloc[split_idx:]
    
    if len(train_df) < 50 or len(test_df) < 5:
        continue
        
    # 2. Iterate through feature sets
    for set_name, feature_cols in feature_sets.items():
        
        # 3. Scale data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(train_df[feature_cols])
        X_test = scaler.transform(test_df[feature_cols])
        y_train = train_df['target_return'].values
        y_test = test_df['target_return'].values

        # 4. Iterate through models
        for model_name, (model, param_grid) in models_to_tune.items():
            
            # Skip SVR/MLP for Price_Full_Sentiment (too slow/complex)
            if set_name == "Price_Full_Sentiment" and model_name in ["SVR", "MLP"]:
                continue

            try:
                # 5. Run Randomized Search Cross-Validation
                search = RandomizedSearchCV(
                    estimator=model,
                    param_distributions=param_grid,
                    n_iter=N_ITER,
                    cv=CV_FOLDS,
                    scoring='neg_mean_squared_error', # Optimize for RMSE
                    n_jobs=-1, # Use all available cores
                    random_state=42,
                    verbose=0
                )
                
                search.fit(X_train, y_train)
                
                # 6. Get best model and make predictions
                best_model = search.best_estimator_
                y_pred = best_model.predict(X_test)
                
                # 7. Calculate and store metrics
                model_full_name = f"{model_name}_Tuned"
                metrics = calculate_metrics(y_test, y_pred, model_full_name, ticker, set_name)
                
                if metrics:
                    # Add best parameters to the results
                    metrics['best_params'] = str(search.best_params_)
                    metrics['best_cv_score'] = search.best_score_ # This is neg_mean_squared_error
                    all_results.append(metrics)
                    
            except Exception as e:
                print(f"Error tuning {model_name} for {ticker} ({set_name}): {e}")
                continue

print(f"\n Hypertuning Complete. Total results generated: {len(all_results)}.")


Starting Model Hypertuning Loop.
Will tune models for 10 tickers.
Using RandomizedSearchCV with n_iter=10 and cv=3.


Processing Tickers:   0%|          | 0/10 [00:00<?, ?it/s]


 Hypertuning Complete. Total results generated: 160.


In [7]:
# Cell 7: Save Results
print("Saving tuned model results.")

if all_results:
    # 1. Convert to DataFrame
    results_df = pd.DataFrame(all_results)
    results_df['timestamp'] = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # 2. Save to CSV
    csv_path = output_dir / "03C_Hypertuning_Performance.csv"
    results_df.to_csv(csv_path, index=False)
    print(f"Successfully saved {len(results_df)} results to {csv_path}")
    
    # 3. Save to Database
    print("Saving results to 'results_hypertuning' table in database.")
    try:
        results_df.to_sql(
            'results_hypertuning',
            con=engine,
            if_exists='replace',
            index=False,
            chunksize=1000
        )
        print("Successfully wrote results to database!")
    except Exception as e:
        print(f"Error writing results to database: {e}")
else:
    print("No results were generated. Skipping save.")


Saving tuned model results.
Successfully saved 160 results to c:\Users\18kyu\Desktop\Unishit\IR\4_Results\Tuning_Runs\03C_Hypertuning_Performance.csv
Saving results to 'results_hypertuning' table in database.
Successfully wrote results to database!


In [8]:
# Cell 8: Summary of Best Models
print("\n--- Hypertuning Performance Summary ---")

if all_results:
    results_df = pd.DataFrame(all_results)
    
    # Show mean performance by model type
    summary = results_df.groupby(['model', 'feature_set']).agg(
        avg_rmse=('rmse', 'mean'),
        avg_r2=('r2', 'mean'),
        avg_dir_acc=('directional_accuracy', 'mean'),
        count=('ticker', 'count')
    ).sort_values(by='avg_rmse').round(4)
    
    print("\n Average Performance of Tuned Models:")
    print(summary.to_string())

    # Show the single best model overall
    best_overall = results_df.loc[results_df['rmse'].idxmin()]
    print("\n Best Overall Tuned Model (by RMSE):")
    print(f"  Ticker: {best_overall['ticker']}")
    print(f"  Model: {best_overall['model']}")
    print(f"  Features: {best_overall['feature_set']}")
    print(f"  RMSE: {best_overall['rmse']:.6f}")
    print(f"  R2: {best_overall['r2']:.6f}")
    print(f"  Params: {best_overall['best_params']}")
else:
    print("No results to summarize.")


--- Hypertuning Performance Summary ---

 Average Performance of Tuned Models:
                                         avg_rmse  avg_r2  avg_dir_acc  count
model              feature_set                                               
Lasso_Tuned        Price_Base_Sentiment    0.0715 -0.0057      52.8302     10
                   Price_Full_Sentiment    0.0715 -0.0057      52.8302     10
                   Price_Only              0.0715 -0.0057      52.8302     10
XGBoost_Tuned      Price_Full_Sentiment    0.0719 -0.0264      51.3208     10
                   Price_Base_Sentiment    0.0719 -0.0260      49.9057     10
                   Price_Only              0.0720 -0.0282      51.6981     10
RandomForest_Tuned Price_Full_Sentiment    0.0721 -0.0320      50.6604     10
Ridge_Tuned        Price_Only              0.0727 -0.0418      51.0377     10
RandomForest_Tuned Price_Base_Sentiment    0.0728 -0.0533      49.9057     10
Ridge_Tuned        Price_Base_Sentiment    0.0733 -0.0572     