In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data_file = '../numerical_data.csv'
df = pd.read_csv(data_file, index_col='id')

# Ensure the target exists
if 'CompTotal' not in df.columns:
    raise ValueError("Target column 'CompTotal' not found in dataset.")

# Cap extreme outliers for the target
threshold = 1e6
df['CompTotal'] = np.clip(df['CompTotal'], 0, threshold)
df.reset_index(drop=True, inplace=True)

# Define features and target
X = df.drop('CompTotal', axis=1)
y = df['CompTotal']

# Automatically identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Create a preprocessor that scales numerical features and one-hot encodes categorical features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

# Build a pipeline with a basic linear regression model.
# The TransformedTargetRegressor applies np.log1p to the target during training and np.expm1 when predicting.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
model = TransformedTargetRegressor(regressor=pipeline, func=np.log1p, inverse_func=np.expm1)

# 5-fold Cross-Validation to evaluate model performance
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
rms_errors = []

for train_idx, val_idx in kfold.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    
    rms = np.sqrt(mean_squared_error(y_val, preds))
    rms_errors.append(rms)

avg_rms = np.mean(rms_errors)
print(f"Model Average CV RMS Error: {avg_rms:.4f}")


Model Average CV RMS Error: 160216.6542


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# -----------------------
# Data Loading & Cleaning
# -----------------------
data_file = '../numerical_data.csv'
df = pd.read_csv(data_file, index_col='id')

if 'CompTotal' not in df.columns:
    raise ValueError("Target column 'CompTotal' not found in dataset.")

# Cap extreme outliers for the target and reset index
threshold = 1e6
df['CompTotal'] = np.clip(df['CompTotal'], 0, threshold)
df.reset_index(drop=True, inplace=True)

# Define features and target
X = df.drop('CompTotal', axis=1)
y = df['CompTotal']

# -----------------------
# Preprocessing: One-Hot Encoding & Scaling
# -----------------------
# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Create a preprocessor:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy="mean")),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

# Fit the preprocessor so that all sub-transformers are fitted.
X_transformed = preprocessor.fit_transform(X)

# Retrieve the OneHotEncoder instance from the preprocessor.
cat_encoder = preprocessor.named_transformers_['cat']

# In case the OneHotEncoder isn't fitted (it should be after preprocessor.fit_transform),
# you can force a fit on just the categorical data:
if not hasattr(cat_encoder, 'categories_'):
    cat_encoder.fit(X[categorical_features])

# Now, get the feature names for the categorical part.
cat_features_names = cat_encoder.get_feature_names_out(categorical_features)

# Combine numerical feature names (unchanged) with categorical feature names.
all_feature_names = list(numerical_features) + list(cat_features_names)

# Convert transformed features into a DataFrame.
X_preprocessed = pd.DataFrame(
    X_transformed.toarray() if hasattr(X_transformed, "toarray") else X_transformed,
    columns=all_feature_names
)

# -----------------------
# Part (b): Feature Engineering via K-Means Clustering
# -----------------------
print("\n=== (b) Feature Engineering with K-Means Clustering ===")

# Apply K-Means clustering on the preprocessed features.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_preprocessed)

# Convert cluster labels into dummy variables.
cluster_dummies = pd.get_dummies(clusters, prefix="cluster")

# Create an enriched feature set by combining preprocessed features with cluster dummies.
X_enriched = pd.concat([X_preprocessed, cluster_dummies], axis=1)

# Apply log1p transformation to the target for variance stabilization.
y_trans = np.log1p(y)

# 5-fold Cross-Validation for the feature-engineered model.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fe_rms_errors = []
fe_model_params = []  # Save model parameters for each fold

for train_idx, val_idx in kfold.split(X_enriched):
    X_train, X_val = X_enriched.iloc[train_idx], X_enriched.iloc[val_idx]
    y_train, y_val = y_trans.iloc[train_idx], y_trans.iloc[val_idx]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    preds_trans = model.predict(X_val)
    preds = np.expm1(preds_trans)  # Invert the log1p transformation
    y_val_orig = np.expm1(y_val)
    
    rms = np.sqrt(mean_squared_error(y_val_orig, preds))
    fe_rms_errors.append(rms)
    fe_model_params.append({'coefficients': model.coef_, 'intercept': model.intercept_})

avg_rms_fe = np.mean(fe_rms_errors)
print(f"Feature Engineered Model Average CV RMS Error: {avg_rms_fe:.4f}")



=== (b) Feature Engineering with K-Means Clustering ===
Feature Engineered Model Average CV RMS Error: 145013.7387


In [7]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
print("\n=== (c) Regularized Model with Expanded Basis Functions ===")
data_file = '../numerical_data.csv'
df = pd.read_csv(data_file, index_col='id')
if 'CompTotal' not in df.columns:
    raise ValueError("Target column 'CompTotal' not found in dataset.")

# Cap extreme outliers
threshold = 1e6
df['CompTotal'] = np.clip(df['CompTotal'], 0, threshold)
df.reset_index(drop=True, inplace=True)

# Define features and target; apply log1p transform to the target.
X = df.drop('CompTotal', axis=1)
y = df['CompTotal']
y_trans = np.log1p(y)
X_encoded = pd.get_dummies(X, drop_first=True)
X_clean = X_encoded

# Expand features using 2nd order polynomial expansion (without bias)
poly2 = PolynomialFeatures(degree=2, include_bias=False)
X_poly2 = pd.DataFrame(poly2.fit_transform(X_clean), columns=poly2.get_feature_names_out(X_clean.columns))

# Generate clusters using KMeans on the one-hot encoded features
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_clean)
cluster_dummies = pd.get_dummies(clusters, prefix="cluster")

# Combine polynomial features with cluster dummies
X_c = pd.concat([X_poly2, cluster_dummies], axis=1)

# Split data into training and testing sets (using y_trans as target)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_trans, test_size=0.2, random_state=42)

# Evaluate Ridge regression over a set of alphas
alphas = [0.01, 0.1, 1, 10, 100]
ridge_rms_errors = []
ridge_models = []

for alpha in alphas:
    ridge = Ridge(alpha=alpha, random_state=42)
    ridge.fit(X_train_c, y_train_c)
    preds_test_trans = ridge.predict(X_test_c)
    preds_test = np.expm1(preds_test_trans)
    y_test_orig = np.expm1(y_test_c)
    
    rms = np.sqrt(mean_squared_error(y_test_orig, preds_test))
    ridge_rms_errors.append(rms)
    ridge_models.append(ridge)

best_alpha_idx = np.argmin(ridge_rms_errors)
best_alpha = alphas[best_alpha_idx]
best_ridge = ridge_models[best_alpha_idx]
best_ridge_rms = ridge_rms_errors[best_alpha_idx]
coef_norm = np.linalg.norm(best_ridge.coef_)

print(f"Best Ridge Model Alpha: {best_alpha}")
print(f"Best Ridge Model Test RMS Error: {best_ridge_rms:.4f}")
print(f"Norm of Best Ridge Model Parameters: {coef_norm:.4f}")


=== (c) Regularized Model with Expanded Basis Functions ===
Best Ridge Model Alpha: 100
Best Ridge Model Test RMS Error: 137276.6711
Norm of Best Ridge Model Parameters: 2.2193


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [8]:
print("\n=== (d) Non-Linear Model using Degree-3 Polynomial Expansion ===")

poly3 = PolynomialFeatures(degree=3, include_bias=False)
X_poly3 = pd.DataFrame(poly3.fit_transform(X_clean), columns=poly3.get_feature_names_out(X_clean.columns))

# Combine the degree-3 polynomial features with the cluster dummies (re-use same clusters)
X_d = pd.concat([X_poly3, cluster_dummies], axis=1)

# Set up 5-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
nonlinear_rms_errors = []
nonlinear_model_params = []

for train_idx, val_idx in kfold.split(X_d):
    X_train, X_val = X_d.iloc[train_idx], X_d.iloc[val_idx]
    y_train, y_val = y_trans.iloc[train_idx], y_trans.iloc[val_idx]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds_trans = model.predict(X_val)
    
    # Clip predictions in log-space to prevent extreme outputs
    preds_trans = np.clip(preds_trans, -20, 20)
    preds = np.expm1(preds_trans)
    y_val_orig = np.expm1(y_val)
    
    rms = np.sqrt(mean_squared_error(y_val_orig, preds))
    nonlinear_rms_errors.append(rms)
    nonlinear_model_params.append({'coefficients': model.coef_, 'intercept': model.intercept_})

avg_rms_nonlinear = np.mean(nonlinear_rms_errors)
print(f"Non-Linear (Degree-3) Model Average CV RMS Error: {avg_rms_nonlinear:.4f}")


=== (d) Non-Linear Model using Degree-3 Polynomial Expansion ===
Non-Linear (Degree-3) Model Average CV RMS Error: 125609342.4648
