In [None]:
'''
First Full model for neural network.
Need to do more preprocessing, maybe use ensemble techs?
  Average MSE: 1762373241.38
  Average RMSE: 40877.9508149
  Average Sale Price: $180615.06
  Average RMSE: $40877.95
  RMSE as Percentage of Average Sale Price: 22.63%
  Baseline RMSE: $79258.23
  Model RMSE is better than baseline.
  Logarithmic RMSE: 3.0168102385336213

2nd: enoded additional categorical 
  MSE scores for each fold: [  1.05978198e+09   1.48816898e+09   1.65048210e+09   3.81049748e+09
    1.25836784e+09]
  RMSE scores for each fold: [ 32554.29274558  38576.79329149  40626.12589008  61729.22710779
    35473.48081063]
  Average MSE: 1853459676.36
  Average RMSE: 41791.9839691
  Average Sale Price: $180615.06
  Average RMSE: $41791.98
  RMSE as Percentage of Average Sale Price: 23.14%
  Baseline RMSE: $79258.23
  Model RMSE is better than baseline.
  Logarithmic RMSE: 2.4033767712078378

  
3rd: added 0s for missing values instead of dropping and used MinMaxScaler()
Logarithmic RMSE: 0.5253631751233521

'''

: 

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold

def load_and_initial_clean(filepath):
    """ Load the dataset and drop irrelevant columns. """
    data = pd.read_csv(filepath)
    return data

def fill_missing_values(data):
    """ Fill missing numerical values and drop rows for specific cases. """
    data['LotArea'].fillna(data['LotArea'].median(), inplace=True)
    
    if data['LotFrontage'].isnull().any():
        predict_missing_values(data, 'LotFrontage', 'LotArea')
    
    data['GarageYrBlt'].fillna(0, inplace=True)
    data['HasGarage'] = data['GarageYrBlt'].apply(lambda x: 1 if x > 0 else 0)
    
    columns_to_fill = [
        'TotalBsmtSF', 'BsmtFullBath', 'GarageCars', 'GarageArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtHalfBath', 'MasVnrArea'
    ]
    data[columns_to_fill] = data[columns_to_fill].fillna(0)
    return data

def predict_missing_values(data, target_column, predictor_column):
    """ Predict missing values in a target column using a predictor column. """
    non_na_data = data.dropna(subset=[target_column])
    model = LinearRegression()
    model.fit(non_na_data[[predictor_column]], non_na_data[target_column])
    missing_indices = data[target_column].isnull()
    data.loc[missing_indices, target_column] = model.predict(data.loc[missing_indices, [predictor_column]])

def get_season(month):
    """ Convert month number to season name. """
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

def engineer_features(data):
    """ Add new features and handle categorical variables. """
    data['TotalSF'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']
    data['HouseAge'] = data['YrSold'] - data['YearBuilt']
    data['RemodelAge'] = data['YrSold'] - data['YearRemodAdd']
    data['HasBasement'] = data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    data['TotalBath'] = data['FullBath'] + 0.5 * data['HalfBath'] + data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath']
    data['OverallScore'] = data['OverallQual'] + data['OverallCond']
    data['LotFrontageRatio'] = data['LotFrontage'] / data['LotArea']
    data['SaleSeason'] = data['MoSold'].apply(get_season)
    
    categorical_cols = [
        'MSSubClass', 'Alley', 'MSZoning', 'Street', 'LotShape',
        'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'Foundation', 'Heating', 'CentralAir',
        'Functional', 'GarageType', 'GarageFinish', 'PavedDrive',
        'MiscFeature', 'SaleType', 'SaleCondition', 'SaleSeason',
        'MasVnrType', 'Electrical', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
        'Fence'
    ]
    data = pd.get_dummies(data, columns=categorical_cols, dummy_na=True)
    return data

def add_missing_features_test(data):
    """ Add missing features to the DataFrame. """
    missing_features = [
        'PoolQC_Fa', 'Condition2_RRAe', 'Heating_OthW', 'RoofMatl_Metal',
        'Condition2_RRAn', 'RoofMatl_Roll', 'Electrical_Mix', 'HouseStyle_2.5Fin',
        'Heating_Floor', 'RoofMatl_Membran', 'Condition2_RRNn', 'MiscFeature_TenC',
        'Exterior2nd_Other', 'Exterior1st_Stone', 'Utilities_NoSeWa', 'RoofMatl_ClyTile',
        'GarageQual_Ex', 'Exterior1st_ImStucc'
    ]

    for feature in missing_features:
        data[feature] = 0

def add_missing_features_train(data):
    #missing NA features. i.e one hot encoded features that have no NA, are not here
    missing_features = [
    'MSSubClass_150.0']

    # Add each missing feature to the test set with all values set to 0
    for feature in missing_features:
        data[feature] = 0

def save_preprocessed_data(data, output_file_path):
    """ Save the preprocessed data to a CSV file. """
    data.to_csv(output_file_path, index=False)

def train_gradient_boosting(X, y):
    model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )
    model.fit(X, y)
    return model

def evaluate_model(model, X, y):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = -cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)  # Convert MSE to RMSE

    print("MSE scores for each fold:", mse_scores)
    print("RMSE scores for each fold:", rmse_scores)
    print("Average MSE:", np.mean(mse_scores))
    print("Average RMSE:", np.mean(rmse_scores))

    average_price = y.mean()
    rmse_percentage = (rmse_scores.mean() / average_price) * 100
    print(f"Average Sale Price: ${average_price:.2f}")
    print(f"Average RMSE: ${rmse_scores.mean():.2f}")
    print(f"RMSE as Percentage of Average Sale Price: {rmse_percentage:.2f}%")

    baseline_rmse = np.sqrt(mean_squared_error(y, [y.mean()] * len(y)))
    print(f"Baseline RMSE: ${baseline_rmse:.2f}")
    if rmse_scores.mean() < baseline_rmse:
        print("Model RMSE is better than baseline.")
    else:
        print("Model RMSE is not better than baseline.")

    

def get_logrmse(predictions_path, sample_path):
    # Load the predictions and actual values
    predictions_df = pd.read_csv(predictions_path)
    actual_df = pd.read_csv(sample_path)

    # Sort both DataFrames by 'Id' to ensure alignment
    predictions_df.sort_values('Id', inplace=True)
    actual_df.sort_values('Id', inplace=True)

    # Find mismatched IDs
    predictions_ids = set(predictions_df['Id'])
    actual_ids = set(actual_df['Id'])
    mismatched_ids = actual_ids - predictions_ids

    # Warn if there are mismatches and adjust the actual values DataFrame
    if mismatched_ids:
        warnings.warn(f"Mismatched IDs found: {mismatched_ids}. Please correct.")
        #actual_df = actual_df[actual_df['Id'].isin(predictions_ids)]
    else:
        # Ensure the adjusted DataFrames are aligned correctly
        actual_df.sort_values('Id', inplace=True)  # Re-sort to ensure order after adjustments

        # Calculate the logarithm of predictions and actual values to prevent scale bias
        log_predictions = np.log(predictions_df['SalePrice'] + 1)
        log_actual = np.log(actual_df['SalePrice'] + 1)

        # Calculate RMSE using the log-transformed values
        mse = mean_squared_error(log_actual, log_predictions)
        rmse = np.sqrt(mse)

        print(f'Logarithmic RMSE: {rmse}')

def main():
    file_path = 'train.csv'
    output_file_path = 'train_with_features_Gus.csv'
    
    data = load_and_initial_clean(file_path)
    data.drop('Id', axis=1, inplace=True)
    data = fill_missing_values(data)
    data = engineer_features(data)
    add_missing_features_train(data)
    save_preprocessed_data(data, output_file_path)

    file_path_test = 'test.csv'
    output_file_path_test = 'test_with_features_Gus.csv'
    test_data = load_and_initial_clean(file_path_test)
    test_data = fill_missing_values(test_data)
    test_data = engineer_features(test_data)
    add_missing_features_test(test_data)
    save_preprocessed_data(test_data, output_file_path_test)
    
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']
    
    # Train and evaluate Gradient Boosting model
    gb_model = train_gradient_boosting(X, y)
    print("#####################Evaluating Gradient Boosting Model#########################")
    evaluate_model(gb_model, X, y)
    # Predict on the test data
    test_data = pd.read_csv(output_file_path_test)
    X_test = test_data.drop('Id', axis=1)
    gb_model.fit(X, y)
    y_test_pred = gb_model.predict(X_test)

    # Create a DataFrame for submission that includes the Id and the predicted prices
    submission = pd.DataFrame({
        'Id': test_data['Id'],
        'SalePrice': y_test_pred
    })
    submission.to_csv('predictions_GB.csv', index=False)
    get_logrmse('predictions_GB.csv', 'sample_submission.csv')
    
    # Save the submission file

    #print("#####################Logarithmic RMSE For Gradient Boosting Model#########################")
    #get_logrmse('predictions.csv', 'sample_submission.csv')
    

    print("#####################Evaluating Neural Network Model#########################")
    # Fit and evaluate the Neural Network model for comparison
    pipeline = make_pipeline(
        MinMaxScaler(),
        MLPRegressor(
            hidden_layer_sizes=(128, 64, 50),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            learning_rate_init=0.001,
            max_iter=1000,
            random_state=42
        )
    )
    
    evaluate_model(pipeline, X, y)
    
    pipeline.fit(X, y)

    # Predict on the test data
    y_test_pred = pipeline.predict(X_test)

    # Create a DataFrame for submission that includes the Id and the predicted prices
    submission = pd.DataFrame({
        'Id': test_data['Id'],
        'SalePrice': y_test_pred
    })
    submission.to_csv('predictions_NN.csv', index=False)
    get_logrmse('predictions_NN.csv', 'sample_submission.csv')
    


if __name__ == "__main__":
    main()

#####################Evaluating Gradient Boosting Model#########################
MSE scores for each fold: [  6.97187557e+08   4.04300432e+08   1.93833611e+09   7.01262809e+08
   4.55017473e+08]
RMSE scores for each fold: [ 26404.30944522  20107.22336943  44026.53867699  26481.36720439
  21331.13857671]
Average MSE: 839220875.755
Average RMSE: 27670.1154545
Average Sale Price: $180921.20
Average RMSE: $27670.12
RMSE as Percentage of Average Sale Price: 15.29%
Baseline RMSE: $79415.29
Model RMSE is better than baseline.
Logarithmic RMSE: 0.3818543996864455
#####################Evaluating Neural Network Model#########################
MSE scores for each fold: [  1.03509743e+09   1.11971635e+09   3.77621560e+09   9.78745738e+08
   6.16715773e+08]
RMSE scores for each fold: [ 32172.9301224   33462.16289029  61450.92029345  31284.91229929
  24833.76276855]
Average MSE: 1505298178.74
Average RMSE: 36640.9376748
Average Sale Price: $180921.20
Average RMSE: $36640.94
RMSE as Percentage of Aver