# Regression of Used Car Prices

## Keras - 1 Attempt

### Attempt 1

In [None]:

# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Seed every RNG the pipeline touches (Python, NumPy, TensorFlow) with one value
seed = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# Data Loading: pandas reads the zipped CSVs directly
train_df, test_df = (
    pd.read_csv(path) for path in ('train.csv.zip', 'test.csv.zip')
)
# Only the header of the sample submission is needed, so one row is enough
sample_sub = pd.read_csv('sample_submission.csv.zip', nrows=1)

# Infer ID and target columns from the submission layout:
# first column is the ID, every remaining column is a target
cols = sample_sub.columns.tolist()
id_col, target_columns = cols[0], cols[1:]

# Work on a copy so the raw training frame stays untouched
df = train_df.copy()

# Target transform: use log1p when every target value is non-negative,
# otherwise keep the raw values
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values) if np.all(y_values >= 0) else y_values

# Features: everything except the target(s) and the ID column
X = df.drop(columns=[*target_columns, id_col], errors='ignore')

# Train/validation split (80/20, seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc,
    test_size=0.2,
    random_state=seed
)

# NOTE: the ID column is already removed when X is built (drop with
# errors='ignore'), so no further ID handling is needed on the splits.

# Drop columns that are entirely missing in the training split; the same
# columns are removed from the validation split so both keep identical schemas.
# Reassign instead of dropping in place to avoid mutating the split outputs.
all_missing = X_train.columns[X_train.isna().all()].tolist()
X_train = X_train.drop(columns=all_missing)
X_val = X_val.drop(columns=all_missing)

# Feature types
# NOTE(review): columns matching neither dtype filter below, and categorical
# columns with more than 50 distinct values, are not passed to the
# ColumnTransformer and are therefore excluded from the model inputs.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X_train.select_dtypes(include=['object', 'category']).columns)
cardinality = {col: X_train[col].nunique() for col in cat_features}
low_cardinality = [col for col, n_levels in cardinality.items() if n_levels <= 50]

# Preprocessing pipelines
# Numeric: median imputation followed by standardisation
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
# Categorical: mode imputation followed by dense one-hot encoding;
# categories unseen at fit time are ignored at transform time
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
numeric_pipeline = Pipeline(numeric_steps)
cat_pipeline = Pipeline(cat_steps)
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', cat_pipeline, low_cardinality),
])

# Fit on the training split only, then apply the fitted transform to validation
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Model architecture determination
n_samples, n_features = X_train_proc.shape
n_targets = 1 if y_train.ndim <= 1 else y_train.shape[1]

# The larger network is used only when there are both enough rows and
# enough features; everything else gets the small two-layer network.
is_large = n_samples >= 10000 and n_features >= 100
if is_large:
    # Widths taper from 2x to 0.25x the feature count, capped at 1024 units;
    # layers narrower than 16 units are skipped
    capped = [min(n_features * factor, 1024) for factor in (2, 1, 0.5, 0.25)]
    layer_sizes = [int(width) for width in capped if width >= 16]
    use_bn = True
    drop_rates = [0.4] * len(layer_sizes)
else:
    layer_sizes = [min(n_features*2, 128), min(n_features, 64)]
    use_bn = False
    drop_rates = [0.3, 0.3]

# Build model
model = Sequential()
# Declare the input with an explicit Input object. Passing `input_dim` to the
# first Dense layer is deprecated in Keras 3 and triggers a UserWarning.
model.add(tf.keras.Input(shape=(n_features,)))
# One Dense -> (optional BatchNorm) -> Dropout stack per configured layer
for size, rate in zip(layer_sizes, drop_rates):
    model.add(Dense(size, activation='relu'))
    if use_bn:
        model.add(BatchNormalization())
    model.add(Dropout(rate))
# Linear output head: one unit per regression target
model.add(Dense(n_targets, activation='linear'))

# Compile: MSE loss on the (possibly log-transformed) target, tracking RMSE and MAE
tracked_metrics = [
    RootMeanSquaredError(name='rmse'),
    MeanAbsoluteError(name='mae'),
]
model.compile(optimizer='adam', loss='mean_squared_error', metrics=tracked_metrics)

# Callbacks: stop after 10 epochs without improvement (restoring the best
# weights) and keep the best model seen so far on disk
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)
best_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, verbose=1)
callbacks = [early_stop, best_checkpoint]

# Training
fit_options = {
    'validation_data': (X_val_proc, y_val),
    'epochs': 100,
    'batch_size': 64,
    'callbacks': callbacks,
    'verbose': 2,
}
# Time only the fit call itself
start_time = time.time()
history = model.fit(X_train_proc, y_train, **fit_options)
duration = time.time() - start_time

# Logging results
# The EarlyStopping callback is created with restore_best_weights=True, so the
# model used for prediction carries the weights of the epoch with the lowest
# val_loss. Report that epoch's metrics instead of the last epoch's.
val_losses = history.history['val_loss']
best_idx = int(np.argmin(val_losses))
results = {
    # float()/int() keep the values JSON-serialisable even if they are NumPy scalars
    'training_loss': float(history.history['loss'][best_idx]),
    'validation_loss': float(val_losses[best_idx]),
    'training_RMSE': float(history.history['rmse'][best_idx]),
    'validation_RMSE': float(history.history['val_rmse'][best_idx]),
    'best_epoch': best_idx + 1,
    'epochs_run': len(val_losses)
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
X_test = test_df.copy()
# Fail with a clear message instead of a NameError further down
if id_col not in X_test.columns:
    raise KeyError(f"ID column {id_col!r} not found in the test data")
test_ids = X_test[id_col]
X_test = X_test.drop(columns=target_columns + [id_col], errors='ignore')
X_test = X_test.drop(columns=all_missing, errors='ignore')
X_test_proc = preprocessor.transform(X_test)
raw_preds = model.predict(X_test_proc)

# Undo the target transform based on whether it was applied during training
# (same condition as the forward transform), not on the sign of the
# predictions: a single negative prediction must not leave the whole
# submission in log space.
log_target = bool(np.all(y_values >= 0))
if log_target:
    # Clip to [0, 20] before expm1: the lower bound keeps predictions
    # non-negative like the training targets, the upper bound avoids overflow.
    final = np.expm1(np.clip(raw_preds, 0.0, 20.0))
else:
    final = raw_preds
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 1: val_loss improved from inf to 0.75041, saving model to best_model.h5




2357/2357 - 4s - 2ms/step - loss: 12.3391 - mae: 2.7249 - rmse: 3.5127 - val_loss: 0.7504 - val_mae: 0.7122 - val_rmse: 0.8663
Epoch 2/100

Epoch 2: val_loss improved from 0.75041 to 0.54282, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 5.8454 - mae: 1.9247 - rmse: 2.4177 - val_loss: 0.5428 - val_mae: 0.5688 - val_rmse: 0.7368
Epoch 3/100

Epoch 3: val_loss improved from 0.54282 to 0.45569, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 3.5368 - mae: 1.4943 - rmse: 1.8806 - val_loss: 0.4557 - val_mae: 0.5071 - val_rmse: 0.6750
Epoch 4/100

Epoch 4: val_loss improved from 0.45569 to 0.37745, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 2.1124 - mae: 1.1452 - rmse: 1.4534 - val_loss: 0.3774 - val_mae: 0.4401 - val_rmse: 0.6144
Epoch 5/100

Epoch 5: val_loss improved from 0.37745 to 0.33741, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 1.1412 - mae: 0.8305 - rmse: 1.0683 - val_loss: 0.3374 - val_mae: 0.4109 - val_rmse: 0.5809
Epoch 6/100

Epoch 6: val_loss improved from 0.33741 to 0.31136, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.5927 - mae: 0.5837 - rmse: 0.7699 - val_loss: 0.3114 - val_mae: 0.3947 - val_rmse: 0.5580
Epoch 7/100

Epoch 7: val_loss improved from 0.31136 to 0.30957, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.3941 - mae: 0.4626 - rmse: 0.6278 - val_loss: 0.3096 - val_mae: 0.3974 - val_rmse: 0.5564
Epoch 8/100

Epoch 8: val_loss improved from 0.30957 to 0.30434, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.3574 - mae: 0.4359 - rmse: 0.5979 - val_loss: 0.3043 - val_mae: 0.3940 - val_rmse: 0.5517
Epoch 9/100

Epoch 9: val_loss did not improve from 0.30434
2357/2357 - 4s - 2ms/step - loss: 0.3496 - mae: 0.4300 - rmse: 0.5913 - val_loss: 0.3079 - val_mae: 0.3992 - val_rmse: 0.5549
Epoch 10/100

Epoch 10: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3470 - mae: 0.4279 - rmse: 0.5891 - val_loss: 0.3114 - val_mae: 0.4010 - val_rmse: 0.5580
Epoch 11/100

Epoch 11: val_loss did not improve from 0.30434
2357/2357 - 4s - 2ms/step - loss: 0.3462 - mae: 0.4273 - rmse: 0.5884 - val_loss: 0.3116 - val_mae: 0.4016 - val_rmse: 0.5582
Epoch 12/100

Epoch 12: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3461 - mae: 0.4275 - rmse: 0.5883 - val_loss: 0.3082 - val_mae: 0.3978 - val_rmse: 0.5551
Epoch 13/100

Epoch 13: val_loss did not improve from 0.30434
2357/2357 - 4s - 2ms/step - loss: 0.3464 - mae: 0.4273 - rmse:

## Keras Tuner - 1 Attempt

### Attempt 1

In [None]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Seed every RNG the pipeline touches (Python, NumPy, TensorFlow) with one value
seed = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# Data Loading: pandas reads the zipped CSVs directly
train_df, test_df = (
    pd.read_csv(path) for path in ('train.csv.zip', 'test.csv.zip')
)
# Only the header of the sample submission is needed, so one row is enough
sample_sub = pd.read_csv('sample_submission.csv.zip', nrows=1)

# Infer ID and target columns from the submission layout:
# first column is the ID, every remaining column is a target
cols = sample_sub.columns.tolist()
id_col, target_columns = cols[0], cols[1:]

# Work on a copy so the raw training frame stays untouched
df = train_df.copy()

# Target transform: use log1p when every target value is non-negative,
# otherwise keep the raw values
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values) if np.all(y_values >= 0) else y_values

# Features: everything except the target(s) and the ID column
X = df.drop(columns=[*target_columns, id_col], errors='ignore')

# Train/validation split (80/20, seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc,
    test_size=0.2,
    random_state=seed
)

# NOTE: the ID column is already removed when X is built (drop with
# errors='ignore'), so no further ID handling is needed on the splits.

# Drop columns that are entirely missing in the training split; the same
# columns are removed from the validation split so both keep identical schemas.
# Reassign instead of dropping in place to avoid mutating the split outputs.
all_missing = X_train.columns[X_train.isna().all()].tolist()
X_train = X_train.drop(columns=all_missing)
X_val = X_val.drop(columns=all_missing)

# Feature types
# NOTE(review): columns matching neither dtype filter below, and categorical
# columns with more than 50 distinct values, are not passed to the
# ColumnTransformer and are therefore excluded from the model inputs.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X_train.select_dtypes(include=['object', 'category']).columns)
cardinality = {col: X_train[col].nunique() for col in cat_features}
low_cardinality = [col for col, n_levels in cardinality.items() if n_levels <= 50]

# Preprocessing pipelines
# Numeric: median imputation followed by standardisation
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
# Categorical: mode imputation followed by dense one-hot encoding;
# categories unseen at fit time are ignored at transform time
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
numeric_pipeline = Pipeline(numeric_steps)
cat_pipeline = Pipeline(cat_steps)
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', cat_pipeline, low_cardinality),
])

# Fit on the training split only, then apply the fitted transform to validation
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Dimensions used when building the network
n_samples, n_features = X_train_proc.shape
n_targets = 1 if y_train.ndim <= 1 else y_train.shape[1]

import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# Callbacks shared by the hyperparameter search and the final training run:
# stop after 10 epochs without val_loss improvement (restoring the best
# weights) and keep the best model seen so far on disk
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Input dimension: number of columns produced by the preprocessor
n_features = X_train_proc.shape[1]

# HyperModel
class MyHyperModel(kt.HyperModel):
    """Search space for a fully connected regression network.

    Tuned hyperparameters: number of hidden layers, units per hidden layer
    (shared by all layers), dropout rate, optimizer and learning rate.
    Reads the module-level ``n_features`` and ``n_targets`` for the input and
    output dimensions.
    """

    def build(self, hp):
        """Build and compile a model for the sampled hyperparameters.

        Args:
            hp: KerasTuner hyperparameter container used to sample values.

        Returns:
            A compiled ``Sequential`` model with MSE loss and RMSE/MAE metrics.
        """
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024)
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        model = Sequential()
        # Explicit Input layer: passing `input_dim` to Dense is deprecated in Keras 3
        model.add(Input(shape=(n_features,)))
        for _ in range(layers):
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(drop))
        # One linear output unit per regression target
        model.add(Dense(n_targets, activation='linear'))

        # Instantiate the optimizer so the sampled learning rate is actually
        # applied; compiling with the bare string would leave the default
        # rate in place and make the 'learning_rate' dimension a no-op.
        optimizer_classes = {'adam': tf.keras.optimizers.Adam}
        optimizer = optimizer_classes[opt](learning_rate=lr)

        model.compile(
            optimizer=optimizer,
            loss='mean_squared_error',
            metrics=[RootMeanSquaredError(name='rmse'), MeanAbsoluteError(name='mae')],
        )
        return model

# Tuner
bs = 32  # batch size
ep = 20  # epochs

# Bayesian optimisation over MyHyperModel's search space, minimising val_loss
tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner',
)

# Use the explicit validation split when one exists; otherwise let Keras
# hold out 20% of the training data
search_kwargs = {
    'batch_size': bs,
    'epochs': ep,
    'callbacks': [early_stopping, checkpoint],
}
if y_val is None:
    search_kwargs['validation_split'] = 0.2
else:
    search_kwargs['validation_data'] = (X_val_proc, y_val)
tuner.search(X_train_proc, y_train, **search_kwargs)

# Rebuild a fresh model from the best hyperparameters found by the search
best_hp = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hp)

# Retrain model with original callbacks and data
fit_kwargs = {
    'epochs': 100,
    'batch_size': bs,
    'callbacks': [early_stopping, checkpoint],
    'verbose': 2,
}
if y_val is None:
    fit_kwargs['validation_split'] = 0.2
else:
    fit_kwargs['validation_data'] = (X_val_proc, y_val)

# Time only the fit call itself
start_time = time.time()
history = model.fit(X_train_proc, y_train, **fit_kwargs)
duration = time.time() - start_time


# Logging results
# The EarlyStopping callback is created with restore_best_weights=True, so the
# model used for prediction carries the weights of the epoch with the lowest
# val_loss. Report that epoch's metrics instead of the last epoch's.
val_losses = history.history['val_loss']
best_idx = int(np.argmin(val_losses))
results = {
    # float()/int() keep the values JSON-serialisable even if they are NumPy scalars
    'training_loss': float(history.history['loss'][best_idx]),
    'validation_loss': float(val_losses[best_idx]),
    'training_RMSE': float(history.history['rmse'][best_idx]),
    'validation_RMSE': float(history.history['val_rmse'][best_idx]),
    'best_epoch': best_idx + 1,
    'epochs_run': len(val_losses)
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
X_test = test_df.copy()
# Fail with a clear message instead of a NameError further down
if id_col not in X_test.columns:
    raise KeyError(f"ID column {id_col!r} not found in the test data")
test_ids = X_test[id_col]
X_test = X_test.drop(columns=target_columns + [id_col], errors='ignore')
X_test = X_test.drop(columns=all_missing, errors='ignore')
X_test_proc = preprocessor.transform(X_test)
raw_preds = model.predict(X_test_proc)

# Undo the target transform based on whether it was applied during training
# (same condition as the forward transform), not on the sign of the
# predictions: a single negative prediction must not leave the whole
# submission in log space.
log_target = bool(np.all(y_values >= 0))
if log_target:
    # Clip to [0, 20] before expm1: the lower bound keeps predictions
    # non-negative like the training targets, the upper bound avoids overflow.
    final = np.expm1(np.clip(raw_preds, 0.0, 20.0))
else:
    final = raw_preds
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 06m 56s]
val_loss: 0.2960398197174072

Best val_loss So Far: 0.29548490047454834
Total elapsed time: 01h 12m 03s
Epoch 1/100




4714/4714 - 33s - 7ms/step - loss: 0.5952 - mae: 0.5276 - rmse: 0.7715 - val_loss: 0.3115 - val_mae: 0.4086 - val_rmse: 0.5581
Epoch 2/100
4714/4714 - 20s - 4ms/step - loss: 0.3683 - mae: 0.4491 - rmse: 0.6069 - val_loss: 0.3149 - val_mae: 0.4172 - val_rmse: 0.5612
Epoch 3/100




4714/4714 - 20s - 4ms/step - loss: 0.3568 - mae: 0.4400 - rmse: 0.5973 - val_loss: 0.2985 - val_mae: 0.3894 - val_rmse: 0.5464
Epoch 4/100
4714/4714 - 20s - 4ms/step - loss: 0.3379 - mae: 0.4253 - rmse: 0.5813 - val_loss: 0.3087 - val_mae: 0.3924 - val_rmse: 0.5556
Epoch 5/100




4714/4714 - 19s - 4ms/step - loss: 0.3303 - mae: 0.4196 - rmse: 0.5747 - val_loss: 0.2977 - val_mae: 0.3890 - val_rmse: 0.5457
Epoch 6/100
4714/4714 - 19s - 4ms/step - loss: 0.3279 - mae: 0.4171 - rmse: 0.5726 - val_loss: 0.3013 - val_mae: 0.3893 - val_rmse: 0.5489
Epoch 7/100
4714/4714 - 19s - 4ms/step - loss: 0.3265 - mae: 0.4162 - rmse: 0.5714 - val_loss: 0.3024 - val_mae: 0.3900 - val_rmse: 0.5499
Epoch 8/100
4714/4714 - 19s - 4ms/step - loss: 0.3258 - mae: 0.4158 - rmse: 0.5708 - val_loss: 0.3035 - val_mae: 0.3973 - val_rmse: 0.5509
Epoch 9/100
4714/4714 - 21s - 4ms/step - loss: 0.3239 - mae: 0.4142 - rmse: 0.5691 - val_loss: 0.3019 - val_mae: 0.3916 - val_rmse: 0.5495
Epoch 10/100
4714/4714 - 21s - 4ms/step - loss: 0.3244 - mae: 0.4146 - rmse: 0.5695 - val_loss: 0.3005 - val_mae: 0.3894 - val_rmse: 0.5482
Epoch 11/100
4714/4714 - 23s - 5ms/step - loss: 0.3227 - mae: 0.4136 - rmse: 0.5681 - val_loss: 0.3025 - val_mae: 0.3935 - val_rmse: 0.5500
Epoch 12/100
4714/4714 - 21s - 5ms/st














[1m3928/3928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step


In [None]:
# Wall-clock seconds measured around the final model.fit call
print(duration)

327.234264
