# Model Training for Test-Time Scaling

This notebook covers:
1. Loading preprocessed data
2. Model definition
3. Training the baseline model
4. Initial evaluation
5. Feature importance analysis

In [None]:
import sys
sys.path.append('..')

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

## 1. Load Preprocessed Data

In [None]:
# Read the four preprocessed arrays from the processed-data directory
# (paths are relative to the current working directory). The X_* arrays are
# later passed to the model as inputs and the y_* arrays as targets.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        '../data/processed/X_train.npy',
        '../data/processed/X_test.npy',
        '../data/processed/y_train.npy',
        '../data/processed/y_test.npy',
    )
)

# Quick sanity check: report the dimensions of both input arrays.
print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

## 2. Model Definition

In [None]:
# Baseline classifier: a random forest of 100 estimators with no explicit
# depth cap (max_depth=None). The fixed random_state keeps repeated runs of
# this notebook reproducible.
model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=None)

## 3. Model Training

In [None]:
# Fit the baseline forest on the training split.
model.fit(X=X_train, y=y_train)

# Persist the fitted estimator next to the processed data so that it can be
# reloaded later without retraining.
joblib.dump(value=model, filename='../data/processed/baseline_model.pkl')

## 4. Initial Evaluation

In [None]:
# Predict a label for every row of the held-out test inputs.
y_pred = model.predict(X_test)

# Overall fraction of test predictions that match the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test accuracy: {accuracy:.4f}")

# Text report from scikit-learn comparing y_test against y_pred.
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

## 5. Feature Importance Analysis

In [None]:
import matplotlib.pyplot as plt

# Importance scores exposed by the fitted forest, plus the feature indices
# ordered from the highest score to the lowest (ascending argsort, reversed).
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))

# Bar chart with one bar per column of X_train, drawn in ranked order; each
# tick label carries the original column index of the feature it represents.
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(x=range(X_train.shape[1]), height=importances[indices])
plt.xticks(
    ticks=range(X_train.shape[1]),
    labels=[f'Feature {i}' for i in indices],
    rotation=45,
)
plt.tight_layout()
plt.show()