In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# 📂 Read the feature-engineered dataset from disk
df = pd.read_csv('../data/cleaned/feature_engineered_data.csv')

# 🧠 Columns used as model inputs, and the column the model should predict
features = [
    'price',
    '1h',
    '24h',
    '7d',
    '24h_volume',
    'mkt_cap',
    'price_ma7',
    'price_ma30',
    'volatility_7d',
    'volume_change_pct',
    'price_change_pct',
]
target = 'liquidity_ratio'

# Feature matrix and target vector, both selected from the loaded frame
X, y = df[features], df[target]

print("Initial shape:", X.shape)

# ⚠️ Replace ±inf with NaN so a single null check catches every unusable value.
# Both replacements rebind the name to a new object instead of using
# inplace=True: `y` is a column pulled out of `df`, and an in-place replace on
# it can write through to `df` depending on the pandas version / copy-on-write
# mode, which makes the cell non-idempotent on re-run.
X = X.replace([np.inf, -np.inf], np.nan)
y = y.replace([np.inf, -np.inf], np.nan)

# Keep only the rows where every feature and the target are present
valid_idx = X.dropna().index.intersection(y.dropna().index)
X = X.loc[valid_idx]
y = y.loc[valid_idx]

# 🔄 Log-transform the target.
# log1p yields NaN for y < -1 and -inf for y == -1, hence the check below.
y_log = np.log1p(y)

# ⚠️ Drop rows whose transformed target is not finite.
# X, y and y_log are all filtered with the same index so they stay aligned.
if np.isinf(y_log).any() or np.isnan(y_log).any():
    print("⚠️ Found inf or NaN in y_log!")
    valid_idx = y_log.replace([np.inf, -np.inf], np.nan).dropna().index
    y_log = y_log.loc[valid_idx]
    y = y.loc[valid_idx]
    X = X.loc[valid_idx]

# 🔀 Split the data BEFORE fitting any preprocessing.
# Fitting the imputer/scaler on the full dataset would let the held-out rows
# influence the medians, means and variances used at training time (data
# leakage) and would bake those statistics into the saved artefacts.
# With the same random_state and row count, the row assignment is the same as
# when splitting the already-transformed array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 🛠️ Step 1: Impute missing values (medians learned from the training rows only)
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# 🛠️ Step 2: Scale features (mean/std learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Full-dataset arrays under their previous names, produced with the
# train-fitted transformers. Kept only so code that references these names
# keeps working; they are not used for training or evaluation here.
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

# 🔍 Hyperparameter search space: 2 x 2 x 1 x 1 x 3 = 12 candidate combinations
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [20, 40],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt', 'log2', None],
}

# 🤖 Exhaustive 5-fold cross-validated search over the grid.
# Candidates are ranked by negative MSE and the fits run on all available cores.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# ✅ Report the winning configuration and keep the corresponding fitted estimator
print("Best hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# 🔍 Predict on the held-out rows
y_pred = best_model.predict(X_test)

# Invert the transform with expm1, then bound the results to [0, 1e10].
# The clip runs after expm1, so it caps extreme values rather than preventing
# overflow inside expm1 itself.
# NOTE(review): the ground truth is clipped too; any true value below 0 would be
# altered before scoring — confirm the target is non-negative.
y_test_exp = np.clip(np.expm1(y_test), a_min=0, a_max=1e10)
y_pred_exp = np.clip(np.expm1(y_pred), a_min=0, a_max=1e10)

# 🧮 Score the back-transformed predictions against the back-transformed targets
mse = mean_squared_error(y_true=y_test_exp, y_pred=y_pred_exp)
r2 = r2_score(y_true=y_test_exp, y_pred=y_pred_exp)
print(f"Test MSE: {mse:.4f}")
print(f"Test R²: {r2:.4f}")

# 💾 Save model and pre-processing tools
# Create the output directory first: joblib.dump does not create missing parent
# directories, so a fresh checkout without ../model would fail only after the
# whole grid search has already run.
Path('../model').mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, '../model/best_random_forest_model.pkl')
joblib.dump(scaler, '../model/scaler.pkl')
joblib.dump(imputer, '../model/imputer.pkl')
print("✅ Model, scaler, and imputer saved.")



Initial shape: (1000, 11)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best hyperparameters: {'max_depth': 40, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Test MSE: 1.0137
Test R²: 0.9303
✅ Model, scaler, and imputer saved.
