# Model Comparison

Original dataset:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import pandas as pd

df = pd.read_csv('CW1_train.csv')
# Identify categorical columns
categorical_cols = ['cut', 'color', 'clarity']  # Replace with actual categorical column names

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_train = r2_score(y_train, model.predict(X_train))
    r2_test = r2_score(y_test, y_pred)
    results[name] = {"R² Train": r2_train, "R² Test": r2_test}
    print(f"{name}: R² Train = {r2_train:.4f}, R² Test = {r2_test:.4f}")

Linear Regression: R² Train = 0.2992, R² Test = 0.2762
Ridge Regression: R² Train = 0.2992, R² Test = 0.2762
Lasso Regression: R² Train = 0.2984, R² Test = 0.2751
Random Forest: R² Train = 0.9241, R² Test = 0.4294
XGBoost: R² Train = 0.9177, R² Test = 0.3748


Selected features:

In [57]:
df = pd.read_csv('selected_features.csv')

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_train = r2_score(y_train, model.predict(X_train))
    r2_test = r2_score(y_test, y_pred)
    results[name] = {"R² Train": r2_train, "R² Test": r2_test}
    print(f"{name}: R² Train = {r2_train:.4f}, R² Test = {r2_test:.4f}")

Linear Regression: R² Train = 0.2942, R² Test = 0.2722
Ridge Regression: R² Train = 0.2942, R² Test = 0.2722
Lasso Regression: R² Train = 0.2941, R² Test = 0.2725
Random Forest: R² Train = 0.9239, R² Test = 0.4315
XGBoost: R² Train = 0.8888, R² Test = 0.3856


Engineered features:

In [60]:
df = pd.read_csv('processed_features.csv')

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_train = r2_score(y_train, model.predict(X_train))
    r2_test = r2_score(y_test, y_pred)
    results[name] = {"R² Train": r2_train, "R² Test": r2_test}
    print(f"{name}: R² Train = {r2_train:.4f}, R² Test = {r2_test:.4f}")

Linear Regression: R² Train = 0.3323, R² Test = 0.2770
Ridge Regression: R² Train = 0.3312, R² Test = 0.2743


  model = cd_fast.enet_coordinate_descent(


Lasso Regression: R² Train = 0.3165, R² Test = 0.2673
Random Forest: R² Train = 0.9140, R² Test = 0.3726
XGBoost: R² Train = 0.9534, R² Test = 0.3267


# Grid Search for Random Forest and XGBoost

In [5]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

df = pd.read_csv('selected_features.csv')

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grids
rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

xgb_param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

# Run Grid Search for Random Forest
rf = RandomForestRegressor(random_state=42)
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
best_rf = rf_grid_search.best_estimator_

# Run Grid Search for XGBoost
xgb = XGBRegressor(random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
best_xgb = xgb_grid_search.best_estimator_

# Print best parameters
print("\nBest Random Forest Parameters:", rf_grid_search.best_params_)
print("\nBest XGBoost Parameters:", xgb_grid_search.best_params_)

# Evaluate optimized models
rf_y_pred = best_rf.predict(X_test)
xgb_y_pred = best_xgb.predict(X_test)

rf_r2 = r2_score(y_train, best_rf.predict(X_train))
rf_r2test = r2_score(y_test, rf_y_pred)

xgb_r2 = r2_score(y_train, best_xgb.predict(X_train))
xgb_r2test = r2_score(y_test, xgb_y_pred)

# Print optimized results
print(f"\nOptimized Random Forest: R² Train = {rf_r2:.4f}, R² Test = {rf_r2test:.4f}")
print(f"Optimized XGBoost: R² Train = {xgb_r2:.4f}, R² Test = {xgb_r2test:.4f}")

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   1.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   1.6s
[CV] END max_depth=10, max_features



[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.7; total time=   2.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.7; total time=   3.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=1.0; total time=   3.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=1.0; total time=   2.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.7; total time=   3.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=1.0; total time=   3.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=500, subsample=0.7; total time=   4.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=500, subsample=0.7; total time=   5.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estima