In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from model import MLPNeuralNetwork

In [2]:
# Read the training and test splits from the local home-data-for-ml-course folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'home-data-for-ml-course/train.csv',
        'home-data-for-ml-course/test.csv',
    )
)

In [3]:
# Shapes are (rows, columns); the column count is the second entry.
train_shape, test_shape = train_df.shape, test_df.shape
print("Training dataset shape:", train_shape)
print("Test dataset shape:", test_shape)

print(f"Number of columns in training dataset: {train_shape[1]}")
print(f"Number of columns in test dataset: {test_shape[1]}")

# For each split, list the ten columns with the most missing values,
# largest count first.
for heading, frame in (
    ("\nMissing values in training dataset:", train_df),
    ("\nMissing values in test dataset:", test_df),
):
    print(heading)
    print(frame.isnull().sum().sort_values(ascending=False).head(10))

Training dataset shape: (1460, 81)
Test dataset shape: (1459, 80)
Number of columns in training dataset: 81
Number of columns in test dataset: 80

Missing values in training dataset:
PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
MasVnrType      872
FireplaceQu     690
LotFrontage     259
GarageYrBlt      81
GarageCond       81
GarageType       81
dtype: int64

Missing values in test dataset:
PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
dtype: int64


In [4]:
# %%
# Feature-type map consumed by preprocess_data:
#   0 = numerical column (imputed, then min-max scaled)
#   1 = categorical column (missing values filled with 'unknown', then one-hot encoded)
# Columns that do not appear in this map are left out of preprocess_data's output.
# 'SalePrice' is the label column; it is listed here as numerical.
columns_dtypes = {
    'MSSubClass': 1,
    'MSZoning': 1,
    'LotFrontage': 0,
    'LotArea': 0,
    'Street': 1,
    'Alley': 1,
    'LotShape': 1,
    'LandContour': 1,
    'Utilities': 1,
    'LotConfig': 1,
    'LandSlope': 1,
    'Neighborhood': 1,
    'Condition1': 1,
    'Condition2': 1,
    'BldgType': 1,
    'HouseStyle': 1,
    'OverallQual': 1,
    'OverallCond': 1,
    'YearBuilt': 0,
    'YearRemodAdd': 0,
    'RoofStyle': 1,
    'RoofMatl': 1,
    'Exterior1st': 1,
    'Exterior2nd': 1,
    'MasVnrType': 1,
    'MasVnrArea': 0,
    'ExterQual': 1,
    'ExterCond': 1,
    'Foundation': 1,
    'BsmtQual': 1,
    'BsmtCond': 1,
    'BsmtExposure': 1,
    'BsmtFinType1': 1,
    'BsmtFinSF1': 0,
    'BsmtFinType2': 1,
    'BsmtFinSF2': 0,
    'BsmtUnfSF': 0,
    'TotalBsmtSF': 0,
    'Heating': 1,
    'HeatingQC': 1,
    'CentralAir': 1,
    'Electrical': 1,
    '1stFlrSF': 0,
    '2ndFlrSF': 0,
    'LowQualFinSF': 0,
    'GrLivArea': 0,
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'FullBath': 0,
    'HalfBath': 0,
    'BedroomAbvGr': 0,
    'KitchenAbvGr': 0,
    'KitchenQual': 1,
    'TotRmsAbvGrd': 0,
    'Functional': 1,
    'Fireplaces': 0,
    'FireplaceQu': 1,
    'GarageType': 1,
    'GarageYrBlt': 0,
    'GarageFinish': 1,
    'GarageCars': 0,
    'GarageArea': 0,
    'GarageQual': 1,
    'GarageCond': 1,
    'PavedDrive': 1,
    'WoodDeckSF': 0,
    'OpenPorchSF': 0,
    'EnclosedPorch': 0,
    '3SsnPorch': 0,
    'ScreenPorch': 0,
    'PoolArea': 0,
    'PoolQC': 1,
    'Fence': 1,
    'MiscFeature': 1,
    'MiscVal': 0,
    'MoSold': 0,
    'YrSold': 0,
    'SaleType': 1,
    'SaleCondition': 1,
    'SalePrice': 0
}

In [5]:
# %%
def nomalization(data: pd.DataFrame, label_col: str):
    """Min-max scale every feature column of ``data`` into the range [-1, 1].

    Args:
        data: Frame of numerical columns. It is not modified.
        label_col: Name of a column to leave unscaled, or a falsy value
            (None / '') when there is none. A name that is not a column of
            ``data`` is ignored.

    Returns:
        A new DataFrame holding the scaled feature columns; when the label
        column is present it is appended, unchanged, as the last column.
        A constant column (max == min) is divided by 1 instead of 0, so all
        of its values come out as -1.
    """
    has_label = bool(label_col) and label_col in data.columns
    features = data.drop(columns=[label_col]) if has_label else data.copy()

    lower = features.min()
    span = features.max() - lower
    # A zero span would divide by zero; use 1 so constant columns stay finite.
    span = span.replace(0, 1)

    shifted = features - lower
    scaled = 2 * shifted / span - 1

    if has_label:
        scaled[label_col] = data[label_col]
    return scaled

In [6]:
def preprocess_data(data: pd.DataFrame, columns_dtype: dict, label_col: str, fit_encoders=False):
    """Impute, one-hot encode and min-max scale a raw DataFrame.

    Columns are routed by ``columns_dtype`` (1 = categorical, 0 = numerical);
    columns that are not in that map are left out of the result.

    Args:
        data: Raw input frame. It is copied first, so the caller's frame is
            never modified.
        columns_dtype: Column-name -> type-code map. Side effect: the names of
            the generated one-hot columns are added to it with code 1.
        label_col: Name of the target column, or None. When present among the
            numerical columns it is returned unscaled as the last column.
        fit_encoders: True fits the module-level ``encoder`` and records the
            per-column min/max of the numerical features in the module-level
            ``numerical_min_max``. False reuses the fitted encoder and, when
            min/max values have been recorded, scales with those instead of
            with the statistics of ``data`` itself.

    Returns:
        DataFrame with the one-hot columns followed by the scaled numerical
        columns (and the label, if any), indexed like ``data``.

    Raises:
        ValueError: if min/max values were recorded but do not cover every
            numerical feature column of ``data``.
    """
    # Notebook-level state shared between the fit call and later transform calls.
    global encoder, numerical_min_max

    # Work on a private copy so the imputation below never writes into the
    # caller's DataFrame.
    data = data.copy()

    categorical_cols = [col for col in data.columns if columns_dtype.get(col) == 1]
    numerical_cols = [col for col in data.columns if columns_dtype.get(col) == 0]

    for col in categorical_cols:
        data[col] = data[col].fillna('unknown')

    imputer = KNNImputer(n_neighbors=int(np.sqrt(len(data))))
    for col in numerical_cols:
        # NOTE(review): the imputer is fitted on one column at a time, so it has
        # no other feature to compute neighbour distances from; per the
        # scikit-learn docs it then falls back to the column average. Confirm
        # whether joint imputation across columns was intended.
        data[col] = imputer.fit_transform(data[[col]]).flatten()

    # Fit the shared encoder on training data; only transform otherwise.
    if fit_encoders:
        encoded_categorical_data = encoder.fit_transform(data[categorical_cols])
    else:
        encoded_categorical_data = encoder.transform(data[categorical_cols])

    encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

    # Register the generated one-hot columns as categorical in the caller's map.
    for feature in encoded_feature_names:
        columns_dtype.setdefault(feature, 1)

    categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_feature_names, index=data.index)

    numerical_df = data[numerical_cols]
    feature_cols = [col for col in numerical_cols if col != label_col]
    have_fitted_stats = 'min' in numerical_min_max and 'max' in numerical_min_max

    if fit_encoders or not have_fitted_stats:
        # Fit path (or nothing recorded yet): scale with this frame's own range.
        normalized_numerical_data = nomalization(numerical_df, label_col)
        if fit_encoders:
            # Remember the range so later frames are mapped onto the same scale.
            numerical_min_max = {
                'min': numerical_df[feature_cols].min(),
                'max': numerical_df[feature_cols].max(),
            }
    else:
        # Transform path: reuse the recorded range so the same raw value maps
        # to the same scaled value as in the fitted data. Values outside the
        # fitted range fall outside [-1, 1].
        fitted_min = numerical_min_max['min']
        fitted_max = numerical_min_max['max']
        unseen = [col for col in feature_cols if col not in fitted_min.index]
        if unseen:
            raise ValueError(f"No fitted min/max for numerical columns: {unseen}")

        features_min = fitted_min[feature_cols]
        features_max = fitted_max[feature_cols]
        # Same zero-range guard as nomalization: divide constant columns by 1.
        range_values = (features_max - features_min).replace(0, 1)
        normalized_numerical_data = 2 * (numerical_df[feature_cols] - features_min) / range_values - 1
        if label_col and label_col in numerical_df.columns:
            normalized_numerical_data[label_col] = numerical_df[label_col]

    df = pd.concat([categorical_df, normalized_numerical_data], axis=1)

    return df

In [7]:
def df_to_np(df: pd.DataFrame, label_col: str):
    """Convert a pandas object to NumPy, optionally splitting off a label column.

    Args:
        df: Object exposing ``to_numpy()``; a DataFrame when ``label_col`` is set.
        label_col: Name of the label column, or a falsy value for no split.

    Returns:
        ``df.to_numpy()`` when ``label_col`` is falsy; otherwise the tuple
        ``(features, labels)``, where ``features`` holds every column except
        ``label_col`` and ``labels`` holds that column.
    """
    if not label_col:
        return df.to_numpy()

    feature_frame = df.drop(columns=[label_col])
    label_series = df[label_col]
    return feature_frame.to_numpy(), label_series.to_numpy()

In [8]:
def k_fold(features: np.ndarray, labels: np.ndarray, k: int):
    """Return the fold splits produced by a shuffled ``KFold`` with ``k`` splits.

    The shuffle uses a fixed ``random_state=42``. The return value is whatever
    ``KFold.split`` returns; callers iterate it as (train_index, test_index)
    pairs.
    """
    return KFold(n_splits=k, shuffle=True, random_state=42).split(features, labels)

In [9]:
# Shared one-hot encoder: preprocess_data fits it when fit_encoders=True and
# reuses it otherwise, so train and test get the same encoded columns.
encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
# Intended to hold min/max values so train and test are normalized consistently.
numerical_min_max = dict()

In [10]:
# Training-time steps: preprocess with the encoder being fitted, then split the
# frame into (features, labels) arrays on the 'SalePrice' column.
preprocess_transformer = FunctionTransformer(
    func=preprocess_data,
    kw_args={
        'columns_dtype': columns_dtypes,
        'label_col': 'SalePrice',
        'fit_encoders': True,
    },
)
np_transformer = FunctionTransformer(
    func=df_to_np,
    kw_args={'label_col': 'SalePrice'},
)

In [11]:
# Chain preprocessing and the DataFrame -> NumPy conversion for the training split.
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess_transformer),
        ('df_to_np', np_transformer),
    ]
)

# 'Id' is dropped before preprocessing so it is not treated as a feature.
train_inputs = train_df.drop(columns=['Id'])
train_features, train_labels = pipeline.fit_transform(train_inputs)

print('Data preprocessed')
print('Dataframe turned into numpy arrays')
print('Training features shape: ', train_features.shape)
print('Training labels shape: ', train_labels.shape)

Data preprocessed
Dataframe turned into numpy arrays
Training features shape:  (1460, 288)
Training labels shape:  (1460,)


In [12]:
# Test-time steps: there is no label column, and fit_encoders=False makes
# preprocess_data reuse the already-fitted encoder instead of refitting it.
test_preprocess_transformer = FunctionTransformer(
    func=preprocess_data,
    kw_args={
        'columns_dtype': columns_dtypes,
        'label_col': None,
        'fit_encoders': False,
    },
)
test_np_transformer = FunctionTransformer(
    func=df_to_np,
    kw_args={'label_col': None},
)

In [13]:
# Same two stages as the training pipeline, built from the test-time transformers.
test_data_pipeline = Pipeline(
    steps=[
        ('preprocess', test_preprocess_transformer),
        ('df_to_np', test_np_transformer),
    ]
)

# 'Id' is kept out of the features and converted separately.
test_inputs = test_df.drop(columns=['Id'])
test_features = test_data_pipeline.fit_transform(test_inputs)
test_id = df_to_np(test_df['Id'], None)

print('Testing data preprocessed')
print('Dataframe turned into numpy arrays')
print('Testing features shape: ', test_features.shape)
print('Testing ids shape: ', test_id.shape)

Testing data preprocessed
Dataframe turned into numpy arrays
Testing features shape:  (1459, 288)
Testing ids shape:  (1459,)




In [14]:
# Materialise the 7 fold splits once, as [train_index, test_index] pairs, so
# the same folds can be handed to the grid search.
k_fold_index = [list(fold) for fold in k_fold(train_features, train_labels, k=7)]

# Candidate values handed to the grid search, plus its early-stopping settings.
params = {
    'alphas': [0.6, 0.7, 0.8, 0.9],
    'lambdas': [0, 0.1, 0.2, 0.3],
    # e**-7, e**-6, e**-5
    'epsilons': [math.pow(math.e, exponent) for exponent in (-7, -6, -5)],
    'hidden_sizes': [5, 10, 20],
    'neurons_per_layer': [5, 20, 45],
    'early_stopping_threshold': 150000,
    'early_stopping_folds': 5,
}

In [15]:
# Size the network from the data: one input per feature column, one output.
input_size = len(train_features[0])
neural_network = MLPNeuralNetwork(input_size, 1)
neural_network.get_info()

Parameters successfully loaded from best_params.pkl
Loaded parameters: {'alpha': 0.9, 'lambda': 0.2, 'epsilon': 0.006737946999085469, 'layer': 5, 'npl': 20, 'mean_mae': 160475.4781737818, 'folds_completed': 3, 'input_size': 288, 'output_size': 1, 'timestamp': '2025-03-28T21:45:48.634819'}
Automatically loaded parameters from best_params.pkl
Model: MLP - Multi-Layer Perceptron
Type: Regression
Purpose: Housing Prices Prediction

Architecture:
  Input size: 288
  Output size: 1
  Hidden layers: 5
  Neurons per layer: 20

Hyperparameters:
  Learning rate (alpha): 0.9
  Regularization (lambda): 0.2
  Convergence threshold (epsilon): 0.006737946999085469

Performance: MAE = 160475.478174

Last trained: 2025-03-28T21:45:48.634819

Status: Model needs initialization before training/prediction



In [16]:
# Run the model's grid search over `params` using the precomputed fold indices.
# The call is the cell's last expression, so its return value is displayed
# (a dict of the selected parameters in the recorded run).
neural_network.grid_search(train_features, train_labels, params, k_fold_index)

Early stopping enabled: Will skip parameter sets with average RMSE > 150000 after 4 folds
Training with alpha=0.6, lambda=0, epsilon=0.0009118819655545166, layer=5, npl=10

Weights are valid.

Starting cross-validation...

Processing fold 1...
Training stopped after reaching max_epochs (250).
RMSE for fold 1: 197620.0361723129

Processing fold 2...
Training stopped after reaching max_epochs (250).
RMSE for fold 2: 182564.37314478235

Processing fold 3...
Training stopped after reaching max_epochs (250).
RMSE for fold 3: 189109.1680173917

Processing fold 4...
Training stopped after reaching max_epochs (250).
RMSE for fold 4: 182552.06770380505

Early stopping: Current RMSE (187961.4113) is greater than the threshold (150000.0000)
Skipping remaining folds for this parameter set.
Mean RMSE: 187961.41125957298

New best parameters found with RMSE: 187961.4113
Training with alpha=0.6, lambda=0, epsilon=0.0009118819655545166, layer=5, npl=20

Weights are valid.

Starting cross-validation...

{'alpha': 0.7,
 'lambda': 0.3,
 'epsilon': 0.0009118819655545166,
 'layer': 10,
 'npl': 20,
 'mean_rmse': np.float64(176820.07651027397),
 'folds_completed': 4}

In [17]:
# Train on the full training set. The call is the cell's last expression, so
# its return value is displayed.
# NOTE(review): this logs one line per epoch; consider trimming the output
# before sharing the notebook.
neural_network.fit(train_features, train_labels)

Training with parameters: alpha=0.7, lambda=0.3, epsilon=0.0009118819655545166, layers=10, neurons=20
Epoch 1 of 5000
Epoch 2 of 5000
Epoch 3 of 5000
Epoch 4 of 5000
Epoch 5 of 5000
Epoch 6 of 5000
Epoch 7 of 5000
Epoch 8 of 5000
Epoch 9 of 5000
Epoch 10 of 5000
Epoch 11 of 5000
Epoch 12 of 5000
Epoch 13 of 5000
Epoch 14 of 5000
Epoch 15 of 5000
Epoch 16 of 5000
Epoch 17 of 5000
Epoch 18 of 5000
Epoch 19 of 5000
Epoch 20 of 5000
Epoch 21 of 5000
Epoch 22 of 5000
Epoch 23 of 5000
Epoch 24 of 5000
Epoch 25 of 5000
Epoch 26 of 5000
Epoch 27 of 5000
Epoch 28 of 5000
Epoch 29 of 5000
Epoch 30 of 5000
Epoch 31 of 5000
Epoch 32 of 5000
Epoch 33 of 5000
Epoch 34 of 5000
Epoch 35 of 5000
Epoch 36 of 5000
Epoch 37 of 5000
Epoch 38 of 5000
Epoch 39 of 5000
Epoch 40 of 5000
Epoch 41 of 5000
Epoch 42 of 5000
Epoch 43 of 5000
Epoch 44 of 5000
Epoch 45 of 5000
Epoch 46 of 5000
Epoch 47 of 5000
Epoch 48 of 5000
Epoch 49 of 5000
Epoch 50 of 5000
Epoch 51 of 5000
Epoch 52 of 5000
Epoch 53 of 5000
Epoch 

np.float64(162920.40409297362)

In [21]:
# Predict on the preprocessed test features; the test ids are passed alongside.
# The result is later used via .shape and .to_csv, so it is presumably a
# DataFrame — confirm against MLPNeuralNetwork.predict.
housing_price_prediction = neural_network.predict(test_id,test_features)

In [22]:
# Sanity check: show the shape of the prediction table.
print(housing_price_prediction.shape)

(1459, 2)


In [23]:
# Write the predictions to submission.csv in the working directory; index=False
# keeps the row index out of the file.
filename = 'submission.csv'
housing_price_prediction.to_csv(filename, index=False)