In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ---------------------------------------------------------------------------
# Data loading and cleaning
# ---------------------------------------------------------------------------
# Read the feature-engineered dataset from disk.
df = pd.read_csv('../data/cleaned/feature_engineered_data.csv')

# Regression target and the input columns used to predict it.
target = 'liquidity_ratio'
features = [
    'price', '1h', '24h', '7d', '24h_volume', 'mkt_cap',
    'price_ma7', 'price_ma30', 'volatility_7d',
    'volume_change_pct', 'price_change_pct',
]

# Work on fresh objects (not views of df) and treat +/-inf as missing so a
# single dropna() below removes every unusable row.
non_finite = [np.inf, -np.inf]
X = df[features].replace(non_finite, np.nan)
y = df[target].replace(non_finite, np.nan)

print("Initial shape:", X.shape)

# Put features and target side by side so a row is dropped whenever either
# side has a missing value, keeping X and y aligned.
combined = pd.concat([X, y], axis=1).dropna()

# Split the surviving rows back into inputs and target.
X_clean = combined[features]
y_clean = combined[target]

# Optional: log transform target
log_transform = True
if log_transform:
    # log1p is only defined for values > -1; fail here with a clear message
    # instead of letting NaN/-inf reach the model fit.
    if (y_clean <= -1).any():
        raise ValueError("Target contains values <= -1; log1p transform is undefined")
    y_clean = np.log1p(y_clean)

# Split the data BEFORE fitting any preprocessing step, so the held-out rows
# never contribute to the imputation medians or the scaling mean/variance
# (fitting on the full dataset leaks test information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Impute missing values in features.
# Rows containing NaN were already dropped above, so during training the
# imputer only learns per-column medians; it is persisted further down.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Scale the features using statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Full-dataset arrays kept under their original names in case later cells
# reference them; they are now produced by the train-fitted preprocessors.
X_imputed = imputer.transform(X_clean)
X_scaled = scaler.transform(X_imputed)

# ---------------------------------------------------------------------------
# Model training and evaluation
# ---------------------------------------------------------------------------
# Hyperparameters for the random forest (described in the original comment as
# the "best" ones; the search that produced them is not shown here).
rf_params = {
    'n_estimators': 150,
    'max_depth': 40,
    'max_features': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'random_state': 42,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# The model was trained on log1p(target) when log_transform is set, so map
# both the truth and the predictions back to the original scale before
# computing metrics.
if log_transform:
    y_test, y_pred = np.expm1(y_test), np.expm1(y_pred)

# Final safety check: refuse to report metrics on NaN or +/-Inf values.
all_finite = np.all(np.isfinite(y_test)) and np.all(np.isfinite(y_pred))
if not all_finite:
    raise ValueError("Prediction or actual values contain NaN or Inf")

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R²: {r2:.4f}")

# Persist the fitted model together with the preprocessing objects it was
# trained with, so the same imputation and scaling can be reapplied later.
artifacts = {
    '../model/best_random_forest_model.pkl': model,
    '../model/imputer.pkl': imputer,
    '../model/scaler.pkl': scaler,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("✅ Model, imputer, and scaler saved.")


Initial shape: (1000, 11)
Test MSE: 1.0137
Test R²: 0.9303
✅ Model, imputer, and scaler saved.
