# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV

# Silence every warning category for cleaner console output.
# NOTE(review): this blanket filter also hides library deprecation and
# numerical warnings; consider narrowing it once the script is stable.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so any NumPy-based randomness is repeatable.
np.random.seed(42)

# Load the dataset
data = pd.read_csv('perfect_prediction_neoverse_logs.csv')

# Timestamp handling: the column (spelled either 'Timestamps' or 'Timestamp')
# is removed before modelling and never read afterwards, so it is dropped
# directly. Parsing it to datetime first would be wasted work and a needless
# failure point on a malformed value; errors='ignore' keeps the script
# running when neither spelling is present in the file.
timestamp_col = 'Timestamps' if 'Timestamps' in data.columns else 'Timestamp'
data.drop(columns=[timestamp_col], inplace=True, errors='ignore')

# Column groups: which columns are label-encoded below, and which are numeric
categorical_columns = [
    'Player ID',
    'Player Rank',
    'Team Affiliation',
    'VIP Status',
    'Player Level',
    'Dark Market Transactions',
]
numerical_columns = [
    'Hours Played',
    'Money Spent ($)',
    'Criminal Score',
    'Missions Completed',
    'Cash on Hand ($)',
    'Sync Stability (%)',
    'Quest Exploit Score',
    'Transaction Amount ($)',
    'Neural Link Stability (%)',
]

# Give each categorical column an integer-coded companion named
# '<column>_encoded'. A fresh encoder is fitted per column, so the codes of
# one column are independent of every other column.
for col in categorical_columns:
    le = LabelEncoder()
    codes = le.fit_transform(data[col])
    data[f'{col}_encoded'] = codes

# Independent snapshot of the frame as it stands here, i.e. including the
# freshly added '*_encoded' columns
data_original = data.copy(deep=True)

# Feature sets that mirror the formula. Every set is a prefix of this ordered
# list: the smallest set holds the first two features and each following set
# adds the next feature in line.
_formula_features = [
    "Hours Played",
    "Quest Exploit Score",
    "Criminal Score",
    "VIP Status_encoded",
    "Team Affiliation_encoded",
    "Dark Market Transactions_encoded",
]
feature_sets = [
    _formula_features[:width]
    for width in range(2, len(_formula_features) + 1)
]

# Training fractions tried for every feature set
splits = [0.6, 0.7, 0.8, 0.9]
# Regression target shared by every experiment
y_full = data["Money Spent ($)"]

# One summary record per (feature set, train split) combination
results = []

# Ridge regression with cross-validated penalty strength: 20 candidate alphas
# spaced logarithmically from 1e-3 to 1e3
ridge_alphas = np.logspace(-3, 3, 20)

for idx, features in enumerate(feature_sets):
    # The feature matrix depends only on the feature set, so it is selected
    # once here rather than once per split.
    X = data[features]

    for split in splits:
        # Fixed random_state: identical train/test rows on every run
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_full, train_size=split, random_state=42)

        # 5-fold cross-validation over ridge_alphas, scored by negated RMSE
        model = RidgeCV(alphas=ridge_alphas,
                        scoring='neg_root_mean_squared_error', cv=5)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Held-out metrics, rounded for reporting
        rmse = round(np.sqrt(mean_squared_error(y_test, preds)), 2)
        r2 = round(r2_score(y_test, preds), 4)

        # JSON-serialisable record for this experiment; only the first 50
        # predictions and actuals are kept to bound the output size
        results.append({
            "feature_set_name": f"Set {idx+1}",
            "features": features,
            "train_split": split,
            "train_size": len(X_train),
            "test_size": len(X_test),
            "rmse": rmse,
            "r2_score": r2,
            "best_alpha": float(model.alpha_),
            "predictions": preds[:50].tolist(),
            # y_test carries the shuffled integer row labels of `data`, so the
            # first 50 rows are taken explicitly by position to stay aligned
            # with preds[:50]
            "actuals": y_test.iloc[:50].tolist()
        })

        print(
            f"✅ {features} | Split: {int(split*100)}% ➤ RMSE: {rmse} | R²: {r2} | Alpha: {model.alpha_}")

# Persist every experiment record as JSON indented by two spaces
encoder = json.JSONEncoder(indent=2)
with open("ridge_model_results_for_money_spent_perfect.json", "w") as out_file:
    # Stream the encoded chunks to disk in order, exactly as json.dump does
    out_file.writelines(encoder.iterencode(results))

print("\n📁 Saved all model results to 'ridge_model_results_for_money_spent_perfect.json'")