In [47]:
#Import neccessary libraries
import tensorflow as tf
import sklearn
import pandas as pd
import numpy as np

# A. Process the data
- The dataset we're using in this notebook is from the Kaggle competition "House Prices - Advanced Regression Techniques"
- Credit: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview

- As is stated, this notebook will try to focus on the various techniques relating to NN (Neural Network) models that apply Regression as the learning strategy.
- First, let's prepare the data.

In [48]:
df_train = pd.read_csv('../datasets/House_Prices_AdvancedRegression/train.csv', delimiter=',')
df_test = pd.read_csv('../datasets/House_Prices_AdvancedRegression/test.csv', delimiter=',')

In [49]:
#Here is the dataset
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


- We'll separate the features and targets.

In [50]:
df_traintargets = df_train.SalePrice
df_train = df_train.drop("SalePrice", axis=1)

## 1. Categorical Encoding
- We can see that datasets contain **multiple categorical features** that is in text form, which a tensorflow model cannot process.
- So, we will use **sklearn's OrdinalEncoder** class to map and transform these features to numerical representations.
- As there are lots of features to encode, we'll create a class for a more uniform way to call.

In [51]:
from sklearn.preprocessing import OrdinalEncoder

class Preprocessor:
    def _category_encode(self, data):
        encoder = OrdinalEncoder()
        data = data.to_numpy().reshape(-1, 1) #Transform data to numpy array and reshape it to fit the encoder
        enc_data = encoder.fit_transform(data)
        return enc_data
    
    def preprocess(self, dataset, columns=None):
        if columns == None:
            columns = [2, 5, 2, 5, 6, 7, 8, 9, 10,
                       11, 12, 13, 14, 15, 16, 21, 22, 
                       23,24,25,27,28,29,30,31,32,33,35,
                       39,40,41,42,53,55,57,58,60,63,64,
                       65,72,73,74,78,79] #Number of categorical features in text form, as described in data_description.txt, which can be found on the competition page, data section.
        
        for feature in columns:
            enc_feature = self._category_encode(dataset.iloc[:, feature])
            dataset.iloc[:, feature] = enc_feature
        
        return dataset

- **OrdinalEncoder does not deal with NaN values**, so we'll use Pandas *fillna* function to replace all NaN values into a numeric value.

In [52]:
df_train = df_train.fillna('0', axis=1)
df_test = df_test.fillna('0', axis=1) #We'll do the same for test dataframe
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,0,Reg,Lvl,AllPub,...,0,0,0,0,0,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,0,Reg,Lvl,AllPub,...,0,0,0,0,0,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,0,12,2008,WD,Normal


- Let's insantiate and call our Preprocessor class now.

In [53]:
my_preprocessor = Preprocessor()

df_train = my_preprocessor.preprocess(df_train)

In [54]:
#Here is the processed dataframe
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,3.0,65.0,8450,1.0,0.0,3.0,3.0,0.0,...,0,0,0.0,0.0,0.0,0,2,2008,8.0,4.0
1,2,20,3.0,80.0,9600,1.0,0.0,3.0,3.0,0.0,...,0,0,0.0,0.0,0.0,0,5,2007,8.0,4.0
2,3,60,3.0,68.0,11250,1.0,0.0,0.0,3.0,0.0,...,0,0,0.0,0.0,0.0,0,9,2008,8.0,4.0
3,4,70,3.0,60.0,9550,1.0,0.0,0.0,3.0,0.0,...,0,0,0.0,0.0,0.0,0,2,2006,8.0,0.0
4,5,60,3.0,84.0,14260,1.0,0.0,0.0,3.0,0.0,...,0,0,0.0,0.0,0.0,0,12,2008,8.0,4.0


## 2. Anything else? *to be continued*
...

## We'll perform the same preprocessing on test data.

In [55]:
df_test = my_preprocessor.preprocess(df_test)

oh yeah forgot to remove id column, it does not matter.

In [56]:
df_train = df_train.drop("Id",axis=1)
test_ids = df_test.Id
df_test = df_test.drop("Id", axis=1)

## Finally, we'll convert dataframe to numpy array and set type as float64.

In [57]:
ds_train = df_train.to_numpy().astype(np.float64)
ds_traintargets = df_traintargets.to_numpy().astype(np.float64)
ds_test = df_test.to_numpy().astype(np.float64)

# B. Build the model
- Let's build our model using Tensorflow and Keras libraries.

In [58]:
#normalizer = tf.keras.layers.Normalization(axis=-1)

#normalizer.adapt(ds_train)

In [59]:
my_model = tf.keras.Sequential([
    tf.keras.layers.Dense(80, activation='relu'),
    tf.keras.layers.Dense(1, activation='relu')
])

In [60]:
my_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss=tf.keras.losses.MeanSquaredError()
)

my_model.fit(
    ds_train,
    ds_traintargets,
    epochs=150,
    validation_split=0.2,
    verbose=None
)

<keras.src.callbacks.History at 0x20562e6c490>

In [61]:
predictions = my_model.predict(ds_test)

predictions = pd.DataFrame(predictions)

predictions



Unnamed: 0,0
0,142783.984375
1,156167.453125
2,188260.390625
3,190187.828125
4,162083.390625
...,...
1454,83830.750000
1455,97220.179688
1456,192499.500000
1457,94808.296875


In [62]:
output = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions[0]})
output.to_csv('submission-Sequential-150epochs.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
