In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from model import MLPNeuralNetwork

In [2]:
train_df = pd.read_csv('home-data-for-ml-course/train.csv')
test_df = pd.read_csv('home-data-for-ml-course/test.csv')

In [3]:
# 0 is numerical, 1 is categorical
columns_dtypes = {
    'MSSubClass': 1,
    'MSZoning': 1,
    'LotFrontage': 0,
    'LotArea': 0,
    'Street': 1,
    'Alley': 1,
    'LotShape': 1,
    'LandContour': 1,
    'Utilities': 1,
    'LotConfig': 1,
    'LandSlope': 1,
    'Neighborhood': 1,
    'Condition1': 1,
    'Condition2': 1,
    'BldgType': 1,
    'HouseStyle': 1,
    'OverallQual': 1,
    'OverallCond': 1,
    'YearBuilt': 0,
    'YearRemodAdd': 0,
    'RoofStyle': 1,
    'RoofMatl': 1,
    'Exterior1st': 1,
    'Exterior2nd': 1,
    'MasVnrType': 1,
    'MasVnrArea': 0,
    'ExterQual': 1,
    'ExterCond': 1,
    'Foundation': 1,
    'BsmtQual': 1,
    'BsmtCond': 1,
    'BsmtExposure': 1,
    'BsmtFinType1': 1,
    'BsmtFinSF1': 0,
    'BsmtFinType2': 1,
    'BsmtFinSF2': 0,
    'BsmtUnfSF': 0,
    'TotalBsmtSF': 0,
    'Heating': 1,
    'HeatingQC': 1,
    'CentralAir': 1,
    'Electrical': 1,
    '1stFlrSF': 0,
    '2ndFlrSF': 0,
    'LowQualFinSF': 0,
    'GrLivArea': 0,
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'FullBath': 0,
    'HalfBath': 0,
    'BedroomAbvGr': 0,
    'KitchenAbvGr': 0,
    'KitchenQual': 1,
    'TotRmsAbvGrd': 0,
    'Functional': 1,
    'Fireplaces': 0,
    'FireplaceQu': 1,
    'GarageType': 1,
    'GarageYrBlt': 0,
    'GarageFinish': 1,
    'GarageCars': 0,
    'GarageArea': 0,
    'GarageQual': 1,
    'GarageCond': 1,
    'PavedDrive': 1,
    'WoodDeckSF': 0,
    'OpenPorchSF': 0,
    'EnclosedPorch': 0,
    '3SsnPorch': 0,
    'ScreenPorch': 0,
    'PoolArea': 0,
    'PoolQC': 1,
    'Fence': 1,
    'MiscFeature': 1,
    'MiscVal': 0,
    'MoSold': 0,
    'YrSold': 0,
    'SaleType': 1,
    'SaleCondition': 1,
    'SalePrice': 0
}

In [5]:
def nomalization(data: pd.DataFrame, label_col: str):
    data_to_normalize = data.drop(columns=[label_col])
    features_max = data_to_normalize.max()
    features_min = data_to_normalize.min()
    normalized_data = 2 * (data_to_normalize - features_min) / (features_max - features_min) - 1
    normalized_data[label_col] = data[label_col]
    return normalized_data

In [6]:
def preprocess_data(data: pd.DataFrame, columns_dtype: dict, label_col: str):
    imputer = KNNImputer(n_neighbors=int(np.sqrt(len(data))))
    for col in data.columns:
        if columns_dtype[col] == 1: # categorical
            data[col] = data[col].fillna('unknown')
        else: # numerical
            imputed_values = imputer.fit_transform(data[[col]])
            data[col] = imputed_values.flatten()
                
    categorical_cols = [col for col in data.columns if columns_dtype[col] == 1]
    numerical_cols = [col for col in data.columns if columns_dtype[col] == 0]
                
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
    encoded_categorical_data = encoder.fit_transform(data[categorical_cols])
    encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
    
    for feature in encoded_feature_names:
        columns_dtype[feature] = 1
    
    categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_feature_names, index=data.index)
    
    normalized_numerical_data = nomalization(data[numerical_cols], label_col)
    
    df = pd.concat([categorical_df, normalized_numerical_data], axis=1)

    return df

In [7]:
def df_to_np(df: pd.DataFrame, label_col: str):
	features_data = df.drop(columns=[label_col]).to_numpy()
	label_data = df[label_col].to_numpy()
	return features_data, label_data

In [8]:
def k_fold(features: np.ndarray, labels: np.ndarray, k: int):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    return kf.split(features, labels)

In [9]:
preprocess_transformer = FunctionTransformer(func=preprocess_data, kw_args={'columns_dtype': columns_dtypes, 'label_col': 'SalePrice'})
np_transformer = FunctionTransformer(func=df_to_np, kw_args={'label_col': 'SalePrice'})

In [None]:
pipeline = Pipeline(steps=[
    ('preprocess', preprocess_transformer),
    ('df_to_np', np_transformer)
])

train_features, train_labels = pipeline.fit_transform(train_df.drop(columns=['Id']))

print('Data preprocessed')
print('Dataframe turned into numpy arrays')
print('Training features shape: ', train_features.shape)
print('Training labels shape: ', train_labels.shape)

Data preprocessed
Dataframe turned into numpy arrays
Training features shape:  (1460, 288)
Training labels shape:  (1460,)


In [11]:
k_fold_index = []
for train_index, test_index in k_fold(train_features, train_labels, k=7):
    k_fold_index.append([train_index, test_index])

params = {
    'alphas': [0.6, 0.7, 0.8, 0.9],
    'lambdas': [0, 0.1, 0.2],
    'epsilons': [math.pow(math.e, -6), math.pow(math.e, -5)],
    'hidden_sizes': [5, 10],
    'neurons_per_layer': [10, 20],
    'early_stopping_threshold': 60000,
    'early_stopping_folds': 3
}

In [12]:
neural_network = MLPNeuralNetwork(len(train_features[0]), 1)
neural_network.get_info()

No parameter file found. You will need to run grid_search before fitting the model.
Or load parameters manually using load_parameters(filename).
Model: MLP - Multi-Layer Perceptron
Type: Regression
Purpose: Housing Prices Prediction

Architecture:
  Input size: 288
  Output size: 1

Note: No parameters loaded. Run grid_search or load_parameters first.

Status: Model needs initialization before training/prediction



In [13]:
neural_network.grid_search(train_features, train_labels, params, k_fold_index)

Training with alpha=0.6, lambda=0, epsilon=0.0009118819655545166, layer=1, npl=5

Weights are valid.

Starting cross-validation...

Processing fold 1...
Training stopped after reaching max_epochs (1000).
MAE for fold 1: 170270.23230098857
Processing fold 2...
Training stopped after reaching max_epochs (1000).
MAE for fold 2: 149611.15695853136
Processing fold 3...
Training stopped after reaching max_epochs (1000).
MAE for fold 3: 144544.36928725263
Processing fold 4...
Training stopped after reaching max_epochs (1000).
MAE for fold 4: 139259.0131228234
Processing fold 5...
Training stopped after reaching max_epochs (1000).
MAE for fold 5: 121997.66541064327
Processing fold 6...
Training stopped after reaching max_epochs (1000).
MAE for fold 6: 100106.5953145125
Processing fold 7...
Training stopped after reaching max_epochs (1000).
MAE for fold 7: 91433.48965908716
Processing fold 8...
Training stopped after reaching max_epochs (1000).
MAE for fold 8: 83818.69309803071
Processing fold 

KeyboardInterrupt: 

In [None]:
neural_network.fit(train_features, train_labels)

In [None]:
test_features = pipeline.transform(test_df.drop(columns=['Id']))
test_id = test_df['Id']

In [None]:
housing_price_prediction = neural_network.predict(test_id,test_features)

In [None]:
filename = 'submission.csv'
housing_price_prediction.to_csv(filename, index=False)