# Data Preparation for Test-Time Scaling

This notebook covers:
1. Data loading
2. Initial exploration
3. Data cleaning
4. Train/test splitting
5. Initial scaling for baseline
6. Visualization of scaled data

In [None]:
import sys
from pathlib import Path

sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from utils.preprocessing import ScalingManager, load_and_preprocess_data, handle_missing_values

## 1. Data Loading

Load your dataset and perform initial preprocessing.

In [None]:
# Example with a sample dataset (replace with your actual data)
# X, y = load_and_preprocess_data('path_to_your_data.csv', target_column='target')

# For demonstration, generate synthetic classification data with a fixed seed
# so the notebook produces the same dataset on every run.
features, labels = make_classification(
    n_samples=1000, n_features=20, n_informative=15, random_state=42
)

# Wrap the raw arrays in pandas objects: one named column per feature
# ('feature_0', 'feature_1', ...) and a Series named 'target' for the labels.
X = pd.DataFrame(features, columns=[f'feature_{i}' for i in range(features.shape[1])])
y = pd.Series(labels, name='target')

## 2. Initial Data Exploration

In [None]:
# Summary: overall shape first, then per-feature descriptive statistics
print("Dataset shape:", X.shape)
print("\nFeature statistics:")
feature_summary = X.describe()
print(feature_summary)

# Histograms (with a KDE overlay) of the first three features, one panel each
preview_cols = X.columns[:3]
fig = plt.figure(figsize=(15, 5))
for panel, col in enumerate(preview_cols, start=1):
    ax = fig.add_subplot(1, 3, panel)
    sns.histplot(X[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()

## 3. Data Cleaning

In [None]:
# Handle missing values if any (mean strategy is delegated to the project helper)
X = handle_missing_values(X, strategy='mean')

# Check for and remove any duplicates. Only feature rows are compared here;
# the labels in y are not part of the duplicate check.
X = X.drop_duplicates()

# Keep the labels for the rows that survived de-duplication. .loc makes the
# selection explicitly label-based; plain y[...] is only label-based when the
# index happens to be integer-typed.
y = y.loc[X.index]

# Sanity check: features and labels must stay row-aligned for the split that follows.
assert X.index.equals(y.index), "X and y are misaligned after cleaning"

## 4. Train/Test Split

In [None]:
# Hold out a fixed fraction of the rows for testing; the fixed seed keeps the
# partition repeatable across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

## 5. Initial Scaling (Baseline)

In [None]:
# Initialize scaling manager
scaling_manager = ScalingManager()

# Baseline: fit_scaler is given the training split only, and the same
# 'standard' scaler is then applied to both the training and test splits.
scaling_manager.fit_scaler(X_train, 'standard')
X_train_scaled = scaling_manager.transform(X_train, 'standard')
X_test_scaled = scaling_manager.transform(X_test, 'standard')

# Save processed data. np.save does not create missing parent directories,
# so make sure the output directory exists first (no-op if it already does).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

np.save(processed_dir / 'X_train.npy', X_train_scaled)
np.save(processed_dir / 'X_test.npy', X_test_scaled)
np.save(processed_dir / 'y_train.npy', y_train)
np.save(processed_dir / 'y_test.npy', y_test)

## 6. Visualization of Scaled Data

In [None]:
# Overlay the original and scaled histograms for the first three features
compare_cols = X_train.columns[:3]
fig = plt.figure(figsize=(15, 5))
for col_pos, col in enumerate(compare_cols):
    ax = fig.add_subplot(1, 3, col_pos + 1)
    sns.histplot(X_train[col], label='Original', alpha=0.5, ax=ax)
    # Assumes X_train_scaled is a 2-D array whose columns follow the column
    # order of X_train — TODO confirm against ScalingManager.transform.
    sns.histplot(X_train_scaled[:, col_pos], label='Scaled', alpha=0.5, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.legend()
fig.tight_layout()