In [1]:
# %%
# ### 1. Imports & Setup

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import json

# Make sure the folders that later steps write into (saved model artifacts and
# prediction results) exist; already-existing folders are left untouched.
for output_dir in ('../model', '../data/results'):
    os.makedirs(output_dir, exist_ok=True)

# %%
# ### 2. Load Data and Define Features/Target
#
# Reads the feature-engineered CSV, names the model inputs and the regression
# target, sanitises those columns, and builds the X / y used for training.

df = pd.read_csv('../data/cleaned/feature_engineered_data.csv')

features = [
    'price', '1h', '24h', '7d', '24h_volume', 'mkt_cap',
    'price_ma7', 'price_ma30', 'volatility_7d',
    'volume_change_pct', 'price_change_pct',
]
target = 'liquidity_ratio'

# Sanitise inputs and target in one pass: +/-inf becomes NaN first, then the
# remaining values are clamped to [-1e10, 1e10] to avoid overflow issues.
_model_cols = features + [target]
df[_model_cols] = (
    df[_model_cols]
    .replace([np.inf, -np.inf], np.nan)
    .clip(lower=-1e10, upper=1e10)
)

# Keep only rows that are complete in every feature and in the target.
df_clean = df.dropna(subset=_model_cols)

X, y = df_clean[features], df_clean[target]

print(f"Data shape after cleaning: {X.shape}")

# %%
# ### 3. Train-Test Split
#
# Hold out 20% of the cleaned rows for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) repeatable between runs.

_split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split_parts

# %%
# ### 4. Define and Train Pipeline
#
# Three stages run in order: median imputation, standard scaling, and a
# random-forest regressor. The whole pipeline is fitted on the training split.

_imputer_step = ('imputer', SimpleImputer(strategy='median'))
_scaler_step = ('scaler', StandardScaler())
_forest_step = (
    'model',
    RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        max_features=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=42,  # fixed seed so repeated fits give the same forest
    ),
)

pipeline = Pipeline(steps=[_imputer_step, _scaler_step, _forest_step])

pipeline.fit(X_train, y_train)

# %%
# ### 5. Evaluate on Test Set
#
# Scores the fitted pipeline on the held-out split and reports MSE and R².

y_pred = pipeline.predict(X_test)

# Safety check: refuse to compute metrics on anything that is not a finite
# number (NaN or +/-inf), on either side of the comparison.
if not np.isfinite(y_pred).all():
    raise ValueError("Predictions contain NaN or Inf")
if not np.isfinite(y_test).all():
    raise ValueError("Actual test values contain NaN or Inf")

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R²: {r2:.4f}")

# %%
# ### 6. Save Pipeline and Feature List
#
# Persists the fitted pipeline with joblib and stores the ordered feature
# names next to it as JSON, so a later step can rebuild the same input columns.

joblib.dump(pipeline, '../model/full_pipeline.pkl')
print("✅ Pipeline saved to ../model/full_pipeline.pkl")

_features_json = json.dumps(features)
with open('../model/features.json', 'w') as features_file:
    features_file.write(_features_json)
print("✅ Feature list saved to ../model/features.json")

# %%
# ### 7. Prediction & Deployment Step (can be run separately later)
#
# Reloads the saved pipeline and feature list from ../model, scores every
# usable row of the feature-engineered CSV, prints MSE / R² against the
# recorded target, and writes those rows plus a `predicted_liquidity_ratio`
# column to ../data/results/predictions.csv.
#
# NOTE(review): the CSV scored here is the same file section 2 trains on, so
# the metrics printed below include the training rows. They are not a held-out
# estimate; use the section 5 test metrics to judge generalisation.

# Defined here as well so this step really can be run on its own: previously
# `target` only existed if section 2 had been executed in the same kernel.
target = 'liquidity_ratio'

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load pipeline files that you produced yourself.
pipeline = joblib.load('../model/full_pipeline.pkl')
with open('../model/features.json', 'r') as f:
    features = json.load(f)

df_pred = pd.read_csv('../data/cleaned/feature_engineered_data.csv')

# Replace inf/-inf with NaN in features
df_pred[features] = df_pred[features].replace([np.inf, -np.inf], np.nan)

# Clip extreme values in features
df_pred[features] = df_pred[features].clip(lower=-1e10, upper=1e10)

# Rows whose features are all present after cleaning
valid_idx = df_pred[features].dropna().index

# Clean target similarly (only on the rows that survived the feature check)
df_pred.loc[valid_idx, target] = df_pred.loc[valid_idx, target].replace([np.inf, -np.inf], np.nan)
df_pred.loc[valid_idx, target] = df_pred.loc[valid_idx, target].clip(lower=-1e10, upper=1e10)

# Drop any rows with NaN in target after cleaning
valid_idx_final = df_pred.loc[valid_idx, target].dropna().index

X_pred = df_pred.loc[valid_idx_final, features]
y_true = df_pred.loc[valid_idx_final, target]

print(f"Dropped {len(df_pred) - len(X_pred)} rows due to NaN/Inf/large values after cleaning.")

# Final sanity checks: the cleaning above should make both of these impossible,
# so a failure here means the cleaning logic itself has regressed.
if np.isinf(X_pred.values).any() or np.isinf(y_true.values).any():
    raise ValueError("Infinite values remain in features or target after cleaning!")
if (np.abs(X_pred.values) > 1e10).any() or (np.abs(y_true.values) > 1e10).any():
    raise ValueError("Too large values remain in features or target after cleaning!")

# Predict
y_pred = pipeline.predict(X_pred)

# Evaluate
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"Deployment Data Test MSE: {mse:.4f}")
print(f"Deployment Data Test R²: {r2:.4f}")

# Save predictions: keep every original column of the scored rows and append
# the model output as a new column.
df_results = df_pred.loc[y_true.index].copy()
df_results['predicted_liquidity_ratio'] = y_pred
df_results.to_csv('../data/results/predictions.csv', index=False)
print("✅ Predictions saved to ../data/results/predictions.csv")



Data shape after cleaning: (997, 11)
Test MSE: 7.0775
Test R²: 0.5132
✅ Pipeline saved to ../model/full_pipeline.pkl
✅ Feature list saved to ../model/features.json
Dropped 3 rows due to NaN/Inf/large values after cleaning.
Deployment Data Test MSE: 31.1728
Deployment Data Test R²: 0.9311
✅ Predictions saved to ../data/results/predictions.csv


In [9]:
import pandas as pd
import json
import os

# Builds a features-only "deployment" CSV: loads the feature-engineered data,
# keeps just the columns named in the saved feature list, and writes them out.

# Paths
data_path = '../data/cleaned/feature_engineered_data.csv'
features_path = '../model/features.json'
# NOTE(review): this path spells the folders 'Data/Cleaned' while data_path
# uses 'data/cleaned'. On a case-sensitive filesystem those are different
# directories; confirm which spelling is intended.
deployment_csv_path = '../Data/Cleaned/deployment_data.csv'

print(f"Current working dir: {os.getcwd()}")

# Load dataframe
print(f"Loading data from {data_path}")
df = pd.read_csv(data_path)
print(f"Data loaded with columns: {df.columns.tolist()}")

# Load features list (fail early with a clear message if it was never saved)
if not os.path.exists(features_path):
    raise FileNotFoundError(f"Features JSON file not found at {features_path}")

with open(features_path, 'r') as features_file:
    features = json.loads(features_file.read())

print(f"Features loaded: {features}")

# Check for missing features: this only warns, the export below still runs
# with whatever subset of the features is present.
missing_features = set(features) - set(df.columns)
if missing_features:
    print(f"Warning: These features are missing in dataframe columns: {missing_features}")

# Filter dataframe with available features only, preserving the saved order
filtered_features = []
for feature_name in features:
    if feature_name in df.columns:
        filtered_features.append(feature_name)
df_deployment = df[filtered_features]

# Save deployment CSV, creating its parent folder first if needed
deployment_dir = os.path.dirname(deployment_csv_path)
os.makedirs(deployment_dir, exist_ok=True)
df_deployment.to_csv(deployment_csv_path, index=False)

print(f"✅ Deployment CSV created at: {deployment_csv_path}")






Current working dir: c:\Users\jashw\Cryptocurrency Liquidity Prediction for Market Stability\Notebook
Loading data from ../data/cleaned/feature_engineered_data.csv
Data loaded with columns: ['coin', 'symbol', 'price', '1h', '24h', '7d', '24h_volume', 'mkt_cap', 'date', 'SourceFile', 'price_ma7', 'price_ma30', 'volatility_7d', 'volume_change_pct', 'price_change_pct', 'liquidity_ratio', 'liquidity_ratio_7d']
Features loaded: ['price', '1h', '24h', '7d', '24h_volume', 'mkt_cap', 'price_ma7', 'price_ma30', 'volatility_7d', 'volume_change_pct', 'price_change_pct']
✅ Deployment CSV created at: ../Data/Cleaned/deployment_data.csv
