## Preprocessing for Neural Network Model

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import matplotlib.pyplot as plt



# Use the preprocessed data files
hp_train_data_path = "../1_Initial_Preprocessed_Data/initial_preprocessed_hp_train.csv"

# Read the House Prices Train & Test data 
hp_train_df = pd.read_csv(hp_train_data_path)


In [2]:
#Display the House Prices Train DataFrame
hp_train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,Other,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,Other,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Review the columns name
hp_train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# Drop the non-beneficial ID columns.
hp_train_df = hp_train_df.drop(columns=['Id'], axis=1)
hp_train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,Other,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,Other,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Determine the number of unique values in each column.
hp_train_df.nunique()

MSSubClass         15
MSZoning            5
LotFrontage         6
LotArea          1073
Street              2
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 80, dtype: int64

In [6]:
# Convert categorical data to numeric with `pd.get_dummies`
hp_train_df = pd.get_dummies(hp_train_df)


# Display the encoded DataFrame
hp_train_df.head()

Unnamed: 0,MSSubClass,LotArea,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,8450,2003,2003,706,0,150,856,856,854,...,False,False,False,True,False,False,False,False,True,False
1,20,9600,1976,1976,978,0,284,1262,1262,0,...,False,False,False,True,False,False,False,False,True,False
2,60,11250,2001,2002,486,0,434,920,920,866,...,False,False,False,True,False,False,False,False,True,False
3,70,9550,1915,1970,216,0,540,756,961,756,...,False,False,False,True,True,False,False,False,False,False
4,60,14260,2000,2000,655,0,490,1145,1145,1053,...,False,False,False,True,False,False,False,False,True,False


## Split to Prepare the data 

In [7]:
# Split our preprocessed data into our features and target arrays
# As 'SalePrice' is your target variable
X = hp_train_df.drop('SalePrice', axis=1)
y = hp_train_df['SalePrice']

 
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [9]:
# Determine the number of input features
input_features = len( X_train_scaled[0])   #X_train.shape[1] 

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=80, input_dim=input_features, activation='relu'))

# Fist hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='linear'))

# Check the structure of the model
nn.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                18480     
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 20941 (81.80 KB)
Trainable params: 20941 (81.80 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Use Adam optimizer and mean squared error for loss
# Also include root mean squared error and mean absolute error as metrics

nn.compile(optimizer='adam', 
           loss='mean_squared_error', 
           metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

In [11]:
# Train the model
fit_model = nn.fit(X_train, y_train, epochs=50)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
# Evaluate the model using the test data
model_loss, root_mean_squared_error, mean_absoluter_error = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, root_mean_squared_error: {root_mean_squared_error} mean_absoluter_error :{mean_absoluter_error}")

In [None]:
nn.evaluate(X_test_scaled,y_test,verbose=2)

## Evaluate Predictions

In [None]:
# Make predictions on the test data
y_predict = nn.predict(X_test)
y_predict_list = [i[0] for i in y_predict.tolist()]

#y_test = hp_train_df['SalePrice'] #.values.reshape(-1, 1)
#y_test.reshape(-1, 1)

# Create a DataFrame to compare actual and predicted values
pred = pd.DataFrame({"Actual": y_test, "Predicted": y_predict_list, "Delta": abs(y_test - y_predict_list)})


In [None]:
# Create scatter plot
plt.scatter(pred['Actual'], pred['Predicted'])

# Plot diagonal dashed red line for perfect predictions
plt.plot([pred['Actual'].min(), pred['Actual'].max()],
         [pred['Actual'].min(), pred['Actual'].max()], '--r')

# Add labels and title
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.title('Actual vs. Predicted Sale Prices')

# Display the plot
plt.show()

In [None]:
# Evaluate the performance of your predictive model.
pred.head(10)

In [None]:
# Reset the index of the DataFrame and select the first 100 rows
pred1 = pred.reset_index().head(100)

# Plot the Actual values
plt.plot(pred1.index, pred1.Actual, label='Actual')

# Plot the Predicted values with a dashed line style
plt.plot(pred1.index, pred1.Predicted, label='Predicted', linestyle='--')

# Plot the Delta values (absolute difference between Actual and Predicted)
plt.plot(pred1.index, pred1.Delta, label='Delta')


## Save the Model

In [None]:
# # Export our model to HDF5 file
# file_path = "House_Price_Predictions.h5"

# # Save the model to an HDF5 file
# nn.save(file_path)

# # Print a success message
# print(f"Model successfully saved to {file_path}")
