Based on cross-validated performance, Random Forest is selected as the current best-performing model and is trained on the full training set.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import Ridge

RANDOM = 123
np.random.seed(RANDOM)

In [2]:
train_df = pd.read_csv('../data/CW1_train.csv')
test_df = pd.read_csv('../data/CW1_test.csv')

X = train_df.drop(columns=['outcome'])
y = train_df['outcome']

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f'Categorical columns: {categorical_cols}')
print(f'Numerical columns: {numerical_cols}')

Categorical columns: ['cut', 'color', 'clarity']
Numerical columns: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


In [3]:
# Preprocess: passthrough numeric, one-hot encode categoricals
preprocess_tree = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_cols),
    ],
    remainder="drop",
)

In [4]:
rf_model = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    min_samples_leaf=2,
    random_state=RANDOM,
    n_jobs=-1,
)

rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocess_tree),
    ("model", rf_model),
])


In [5]:
rf_pipeline.fit(X, y)
print("RF trained on full training set.")

RF trained on full training set.


In [6]:
# Predict on test set
yhat = rf_pipeline.predict(test_df)

# Create submission (single column named yhat)
submission = pd.DataFrame({"yhat": yhat})

# Save (replace KNUMBER)
submission_path = "CW1_submission_k23075501.csv"
submission.to_csv(submission_path, index=False)

print("Saved:", submission_path)
print(submission.head())


Saved: CW1_submission_k23075501.csv
        yhat
0   5.167404
1  -6.748130
2   4.059679
3 -17.691721
4 -14.128551


In [7]:
assert submission.shape == (len(test_df), 1)
assert submission.columns.tolist() == ["yhat"]
assert np.isfinite(submission["yhat"]).all()
print("Submission format checks passed.")


Submission format checks passed.


# Cross-Validated Evaluation of Random Forest