In [1]:
'''
First Full model for neural network.
Need to do more preprocessing, maybe use ensemble techs?
  Average MSE: 1762373241.38
  Average RMSE: 40877.9508149
  Average Sale Price: $180615.06
  Average RMSE: $40877.95
  RMSE as Percentage of Average Sale Price: 22.63%
  Baseline RMSE: $79258.23
  Model RMSE is better than baseline.
  Logarithmic RMSE: 3.0168102385336213

2nd: enoded additional categorical 
  MSE scores for each fold: [  1.05978198e+09   1.48816898e+09   1.65048210e+09   3.81049748e+09
    1.25836784e+09]
  RMSE scores for each fold: [ 32554.29274558  38576.79329149  40626.12589008  61729.22710779
    35473.48081063]
  Average MSE: 1853459676.36
  Average RMSE: 41791.9839691
  Average Sale Price: $180615.06
  Average RMSE: $41791.98
  RMSE as Percentage of Average Sale Price: 23.14%
  Baseline RMSE: $79258.23
  Model RMSE is better than baseline.
  Logarithmic RMSE: 2.4033767712078378

  
3rd: added 0s for missing values instead of dropping and used MinMaxScaler()
Logarithmic RMSE: 0.5253631751233521

'''

'\nFirst Full model for neural network.\nNeed to do more preprocessing, maybe use ensemble techs?\n  Average MSE: 1762373241.38\n  Average RMSE: 40877.9508149\n  Average Sale Price: $180615.06\n  Average RMSE: $40877.95\n  RMSE as Percentage of Average Sale Price: 22.63%\n  Baseline RMSE: $79258.23\n  Model RMSE is better than baseline.\n  Logarithmic RMSE: 3.0168102385336213\n\n2nd: enoded additional categorical \n  MSE scores for each fold: [  1.05978198e+09   1.48816898e+09   1.65048210e+09   3.81049748e+09\n    1.25836784e+09]\n  RMSE scores for each fold: [ 32554.29274558  38576.79329149  40626.12589008  61729.22710779\n    35473.48081063]\n  Average MSE: 1853459676.36\n  Average RMSE: 41791.9839691\n  Average Sale Price: $180615.06\n  Average RMSE: $41791.98\n  RMSE as Percentage of Average Sale Price: 23.14%\n  Baseline RMSE: $79258.23\n  Model RMSE is better than baseline.\n  Logarithmic RMSE: 2.4033767712078378\n\n  \n3rd: added 0s for missing values instead of dropping and us

In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
import matplotlib.pyplot as plt

def load_and_initial_clean(filepath):
    """ Load the dataset and drop irrelevant columns. """
    data = pd.read_csv(filepath)
    return data

def fill_missing_values(data):
    """ Fill missing numerical values and drop rows for specific cases. """
    data['LotArea'].fillna(data['LotArea'].median(), inplace=True)
    
    if data['LotFrontage'].isnull().any():
        predict_missing_values(data, 'LotFrontage', 'LotArea')
    
    data['GarageYrBlt'].fillna(0, inplace=True)
    data['HasGarage'] = data['GarageYrBlt'].apply(lambda x: 1 if x > 0 else 0)
    
    columns_to_fill = [
        'TotalBsmtSF', 'BsmtFullBath', 'GarageCars', 'GarageArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtHalfBath', 'MasVnrArea'
    ]
    data[columns_to_fill] = data[columns_to_fill].fillna(0)
    return data

def predict_missing_values(data, target_column, predictor_column):
    """ Predict missing values in a target column using a predictor column. """
    non_na_data = data.dropna(subset=[target_column])
    model = LinearRegression()
    model.fit(non_na_data[[predictor_column]], non_na_data[target_column])
    missing_indices = data[target_column].isnull()
    data.loc[missing_indices, target_column] = model.predict(data.loc[missing_indices, [predictor_column]])

def get_season(month):
    """ Convert month number to season name. """
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

def engineer_features(data):
    """ Add new features and handle categorical variables. """
    data['TotalSF'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']
    data['HouseAge'] = data['YrSold'] - data['YearBuilt']
    data['RemodelAge'] = data['YrSold'] - data['YearRemodAdd']
    data['HasBasement'] = data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    data['TotalBath'] = data['FullBath'] + 0.5 * data['HalfBath'] + data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath']
    data['OverallScore'] = data['OverallQual'] + data['OverallCond']
    data['LotFrontageRatio'] = data['LotFrontage'] / data['LotArea']
    data['SaleSeason'] = data['MoSold'].apply(get_season)
    
    categorical_cols = [
        'MSSubClass', 'Alley', 'MSZoning', 'Street', 'LotShape',
        'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'Foundation', 'Heating', 'CentralAir',
        'Functional', 'GarageType', 'GarageFinish', 'PavedDrive',
        'MiscFeature', 'SaleType', 'SaleCondition', 'SaleSeason',
        'MasVnrType', 'Electrical', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
        'Fence'
    ]
    data = pd.get_dummies(data, columns=categorical_cols, dummy_na=True)
    return data

def add_missing_features(data):
    """ Add missing features to the DataFrame. """
    missing_features = [
        'PoolQC_Fa', 'Condition2_RRAe', 'Heating_OthW', 'RoofMatl_Metal',
        'Condition2_RRAn', 'RoofMatl_Roll', 'Electrical_Mix', 'HouseStyle_2.5Fin',
        'Heating_Floor', 'RoofMatl_Membran', 'Condition2_RRNn', 'MiscFeature_TenC',
        'Exterior2nd_Other', 'Exterior1st_Stone', 'Utilities_NoSeWa', 'RoofMatl_ClyTile',
        'GarageQual_Ex', 'Exterior1st_ImStucc'
    ]

    for feature in missing_features:
        data[feature] = 0

def save_preprocessed_data(data, output_file_path):
    """ Save the preprocessed data to a CSV file. """
    data.to_csv(output_file_path, index=False)

def train_gradient_boosting(X, y):
    model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )
    model.fit(X, y)
    return model

def evaluate_model(model, X, y):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = -cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)  # Convert MSE to RMSE

    print("MSE scores for each fold:", mse_scores)
    print("RMSE scores for each fold:", rmse_scores)
    print("Average MSE:", np.mean(mse_scores))
    print("Average RMSE:", np.mean(rmse_scores))

    average_price = y.mean()
    rmse_percentage = (rmse_scores.mean() / average_price) * 100
    print(f"Average Sale Price: ${average_price:.2f}")
    print(f"Average RMSE: ${rmse_scores.mean():.2f}")
    print(f"RMSE as Percentage of Average Sale Price: {rmse_percentage:.2f}%")

    baseline_rmse = np.sqrt(mean_squared_error(y, [y.mean()] * len(y)))
    print(f"Baseline RMSE: ${baseline_rmse:.2f}")
    if rmse_scores.mean() < baseline_rmse:
        print("Model RMSE is better than baseline.")
    else:
        print("Model RMSE is not better than baseline.")

def main():
    file_path = 'train.csv'
    output_file_path = 'train_with_features_Gus.csv'
    
    data = load_and_initial_clean(file_path)
    data = fill_missing_values(data)
    data = engineer_features(data)
    add_missing_features(data)
    save_preprocessed_data(data, output_file_path)
    
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']
    
    # Train and evaluate Gradient Boosting model
    gb_model = train_gradient_boosting(X, y)
    print("Evaluating Gradient Boosting Model:")
    evaluate_model(gb_model, X, y)
    
    # Fit and evaluate the Neural Network model for comparison
    pipeline = make_pipeline(
        MinMaxScaler(),
        MLPRegressor(
            hidden_layer_sizes=(128, 64, 50),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            learning_rate_init=0.001,
            max_iter=1000,
            random_state=42
        )
    )
    print("Evaluating Neural Network Model:")
    evaluate_model(pipeline, X, y)
    
    # Predict on the test data
    test_data = pd.read_csv('test_with_features_Gus.csv')
    X_test = test_data.drop('Id', axis=1)
    gb_model.fit(X, y)
    y_test_pred = gb_model.predict(X_test)

    # Create a DataFrame for submission that includes the Id and the predicted prices
    submission = pd.DataFrame({
        'Id': test_data['Id'],
        'SalePrice': y_test_pred
    })
    
    # Save the submission file
    submission.to_csv('predictions.csv', index=False)

if __name__ == "__main__":
    main()

Evaluating Gradient Boosting Model:
MSE scores for each fold: [  7.19610284e+08   4.27551637e+08   1.89561407e+09   6.97590355e+08
   4.55535660e+08]
RMSE scores for each fold: [ 26825.55282412  20677.32181077  43538.65024751  26411.93584985
  21343.28137981]
Average MSE: 839180400.471
Average RMSE: 27759.3484224
Average Sale Price: $180921.20
Average RMSE: $27759.35
RMSE as Percentage of Average Sale Price: 15.34%
Baseline RMSE: $79415.29
Model RMSE is better than baseline.
Evaluating Neural Network Model:
MSE scores for each fold: [  1.07022160e+09   1.16176009e+09   3.81937468e+09   9.60989793e+08
   6.31192626e+08]
RMSE scores for each fold: [ 32714.24157681  34084.60194124  61801.08962958  30999.83537388
  25123.54723588]
Average MSE: 1528707757.95
Average RMSE: 36944.6631515
Average Sale Price: $180921.20
Average RMSE: $36944.66
RMSE as Percentage of Average Sale Price: 20.42%
Baseline RMSE: $79415.29
Model RMSE is better than baseline.
