### Dataset: `kaggle competitions download -c house-prices-advanced-regression-techniques`

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

2025-07-16 16:12:59.989851: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-16 16:13:00.000356: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-16 16:13:00.086030: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-16 16:13:00.164574: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752664380.229457    4253 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752664380.24

## Import Dataset

In [2]:
# Read the competition's train and test CSVs; both are loaded the same way,
# so iterate over the two paths and unpack the resulting frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/house-prices-advanced-regression-techniques/train.csv",
        "../Datasets/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [3]:
# Target column, kept separate from the predictors.
train_Y = train_df['SalePrice']

# Predictors: every column except the row identifier and the target itself.
train_X = train_df.drop(columns=['Id', 'SalePrice'])

train_X.shape

(1460, 79)

## Preprocessing

In [4]:
# Show every column when a wide frame is displayed, and do not wrap its repr
# across multiple blocks.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Remove columns that are excluded from modelling.
# NOTE(review): presumably these are dropped because they are mostly missing — confirm.
#
# errors='ignore' keeps this cell re-runnable: train_X is reassigned in place,
# so a second execution would otherwise raise KeyError because the columns
# are already gone.
train_X = train_X.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu'],
    errors='ignore',
)


In [5]:
from sklearn.pipeline   import Pipeline
from sklearn.impute     import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose    import ColumnTransformer

# --- Column groups -----------------------------------------------------------
# Quality grades are encoded ordinally (worst -> best), so they must not also
# end up in the one-hot group.
ordinal_feats = ['ExterQual', 'KitchenQual']   # example
quality_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

numeric_features = list(train_X.select_dtypes(include=['number']).columns)
categorical_feats = [
    name
    for name in train_X.select_dtypes(include=['object']).columns
    if name not in ordinal_feats
]

# --- Per-group pipelines -----------------------------------------------------
# Numeric: fill gaps with the median, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Ordinal: fill gaps with the mode, then map each grade to its rank in
# quality_order (the same ordering is supplied once per ordinal column).
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[quality_order for _ in ordinal_feats])),
])

# Nominal: fill gaps with the mode, then one-hot encode into a dense array.
# Categories unseen during fit are ignored (encoded as all zeros) at transform time.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# --- Combine and fit ---------------------------------------------------------
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_features),
    ('ord', ord_pipe, ordinal_feats),
    ('cat', cat_pipe, categorical_feats),
])
X_array = preprocessor.fit_transform(train_X)

# --- Recover column labels ---------------------------------------------------
# Output columns follow the transformer order above: numeric, ordinal, one-hot.
fitted_onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
num_names, ord_names = numeric_features, ordinal_feats
cat_names = fitted_onehot.get_feature_names_out(categorical_feats).tolist()

all_feature_names = [*num_names, *ord_names, *cat_names]
train_X_processed = pd.DataFrame(X_array, columns=all_feature_names)

# --- Sanity check: list any column that still contains NaNs ------------------
missing_counts = train_X_processed.isna().sum()
print("Missing after:\n", missing_counts[missing_counts > 0])


Missing after:
 Series([], dtype: int64)


In [6]:
# Sanity check: display the target's shape to compare against the number of training rows.
train_Y.shape

(1460,)

In [7]:

# import numpy as np

# # Original target
# y = train_Y

# # Log-transform the target
# y_log = np.log1p(y)  # log(1 + x)


In [31]:
from tensorflow.keras import regularizers

# Network: one L2-regularised hidden layer feeding a single linear output unit.
# The input width is the number of columns produced by the preprocessor.
n_inputs = train_X_processed.shape[1]

hidden_layer = tf.keras.layers.Dense(
    units=64,
    activation='relu',
    kernel_regularizer=regularizers.l2(0.06),
)
# A second, smaller hidden layer was tried and is currently disabled:
#   tf.keras.layers.Dense(4, activation='relu', kernel_regularizer=regularizers.l2(0.03))
output_layer = tf.keras.layers.Dense(units=1, activation='linear')

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_inputs,)),
    hidden_layer,
    output_layer,
])

# Temporarily remove L2 to isolate the issue:
# model = tf.keras.models.Sequential([
#     tf.keras.Input(shape=(train_X_processed.shape[1],)),
#     tf.keras.layers.Dense(128, activation='relu'),  # no kernel_regularizer
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dense(1, activation='linear')
# ])


In [32]:
# Adam driven by an exponentially decaying learning rate that starts at 0.01
# and is multiplied by 0.9 for every 70000 optimizer steps.
# NOTE(review): the training log shows 11 steps per epoch, so 1000 epochs is
# ~11000 steps — well under one decay period; the rate barely changes. Confirm
# that decay_steps is intended to be this large.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.01, decay_steps=70000, decay_rate=0.9,
)
optimizer = tf.keras.optimizers.Adam(lr_schedule)


In [33]:
# Optimise mean absolute error and also report it as a metric.
# The two differ in the training log: `loss` is higher than `mae`, presumably
# because the reported loss also includes the hidden layer's L2 penalty.
model.compile(loss='mean_absolute_error', metrics=['mae'], optimizer=optimizer)


In [34]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled training data to monitor the model during fitting.
# Despite the *_test names, X_test / y_test are a slice of train.csv, not the
# competition's test.csv.
# NOTE(review): the preprocessor was fitted on the full training frame before
# this split, so the held-out rows already influenced the imputation/scaling
# statistics; fit it on X_train only if an unbiased validation score is needed.
X_train, X_test, y_train, y_test = train_test_split(
    train_X_processed, train_Y,
    test_size=0.1,        # hold out 10% of the rows
    random_state=42,      # for reproducibility
    shuffle=True          # default True, but you can turn off for time series
)

In [36]:
# Train on the training split and evaluate loss/MAE on the held-out rows after
# every epoch.
# The returned History object is kept (rather than being echoed as the cell's
# output and lost) so the per-epoch curves in `history.history` can be
# inspected or plotted afterwards.
history = model.fit(
    X_train, y_train,
    epochs=1000,
    batch_size=128,
    validation_data=(X_test, y_test)
)


Epoch 1/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 20138.8828 - mae: 16416.7109 - val_loss: 22105.5898 - val_mae: 18396.8340
Epoch 2/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 19664.2617 - mae: 15957.1963 - val_loss: 22064.6504 - val_mae: 18361.3789
Epoch 3/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 19560.6328 - mae: 15859.1533 - val_loss: 22049.2734 - val_mae: 18354.5625
Epoch 4/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 20300.7520 - mae: 16607.6191 - val_loss: 22050.1895 - val_mae: 18364.5820
Epoch 5/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 19557.1055 - mae: 15872.9238 - val_loss: 22046.2578 - val_mae: 18369.4121
Epoch 6/1000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 20181.6758 - mae: 16504.5703 - val_loss: 22044.3359 - val_mae: 183

<keras.src.callbacks.history.History at 0x7972181f7620>

In [37]:


# Store Id for submission
# Keep the identifiers aside: they are needed for the submission file but are
# not a model feature.
test_ids = test_df['Id']
test_data = test_df.drop(columns='Id')

# Apply the already-fitted preprocessor (transform only, no re-fit) and label
# the result with the same feature names used for the training frame.
test_X_processed = pd.DataFrame(
    preprocessor.transform(test_data),
    columns=all_feature_names,
)


In [38]:
# The network ends in a single output unit, so predict() yields one column;
# flatten it to a 1-D vector.
# Despite the name, these values are on the raw SalePrice scale: the model was
# fitted on the untransformed target. If the target is ever trained in log1p
# space, invert here with np.expm1(test_log_preds).
raw_predictions = model.predict(test_X_processed)
test_log_preds = raw_predictions.flatten()

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [40]:
# Submission table: one row per test Id with its predicted SalePrice.
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": test_log_preds
})

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here, after
# the whole training run has already completed).
submission_path = Path("../submissions/housing_prices_submission_v1_dense_NN.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
