Building an MLP regressor using the Keras Sequential API

In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

Loading the data

In [2]:
# Read the raw housing table from a CSV file in the current working directory.
df=pd.read_csv('housing.csv')

In [3]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# Discard every row that has a missing value in any column; the raw frame
# `df` is left untouched and the original row labels are kept on `df1`.
df1 = df.dropna(axis=0, how='any')

In [5]:
# One-hot encode the categorical `ocean_proximity` column: it is replaced by
# one indicator column per category, all other columns are carried over.
df_encoded = pd.get_dummies(
    data=df1,
    columns=['ocean_proximity'],
)
print(df_encoded.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value  \
0       322.0       126.0         8.3252            452600.0   
1      2401.0      1138.0         8.3014            358500.0   
2       496.0       177.0         7.2574            352100.0   
3       558.0       219.0         5.6431            341300.0   
4       565.0       259.0         3.8462            342200.0   

   ocean_proximity_<1H OCEAN  ocean_proximity_INLAND  ocean_proximity_ISLAND  \
0                      False                   False                   False   
1   

In [8]:
# Collect the names of all boolean-typed columns in the encoded frame.
bool_columns = df_encoded.select_dtypes(include='bool').columns

# Cast exactly those columns to integers (True -> 1, False -> 0); every other
# column keeps its dtype. Then print the dtype summary to confirm the cast.
df_encoded = df_encoded.astype({name: int for name in bool_columns})
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20433 non-null  float64
 1   latitude                    20433 non-null  float64
 2   housing_median_age          20433 non-null  float64
 3   total_rooms                 20433 non-null  float64
 4   total_bedrooms              20433 non-null  float64
 5   population                  20433 non-null  float64
 6   households                  20433 non-null  float64
 7   median_income               20433 non-null  float64
 8   median_house_value          20433 non-null  float64
 9   ocean_proximity_<1H OCEAN   20433 non-null  int32  
 10  ocean_proximity_INLAND      20433 non-null  int32  
 11  ocean_proximity_ISLAND      20433 non-null  int32  
 12  ocean_proximity_NEAR BAY    20433 non-null  int32  
 13  ocean_proximity_NEAR OCEAN  20433 no

In [9]:
# Separate the regression target from the feature matrix.
y = df_encoded['median_house_value']
X = df_encoded.drop('median_house_value', axis=1)

# Print both shapes, one per line, as a sanity check.
print(X.shape, y.shape, sep='\n')

(20433, 13)
(20433,)


In [10]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=113,
)

In [11]:
# Carve a validation set out of the training split: the last N_VALID rows of
# `X_train` become validation data, everything before them stays for training.
#
# Bug fixed here: the previous version rebound `y_train` to `y_train[:-200]`
# and only afterwards took `y_valid = y_train[-200:]`. That sliced the
# already-truncated labels, so `y_valid` belonged to different rows than
# `x_valid` (and those labels were still part of the training labels).
# Re-running the cell also shrank `y_train` a little more each time.
#
# Labels are now looked up by row label in the full target `y`, so they always
# match their feature rows and the cell gives the same result on every run.
# Note the capitalisation: `X_train` is the full training split from
# train_test_split, `x_train` is that split minus the validation rows.
N_VALID = 200

x_train, x_valid = X_train.iloc[:-N_VALID], X_train.iloc[-N_VALID:]
y_train, y_valid = y.loc[x_train.index], y.loc[x_valid.index]

# Guard against any future misalignment between features and labels.
assert x_train.index.equals(y_train.index)
assert x_valid.index.equals(y_valid.index)

In [12]:
# Show the shapes of the train / validation / test features and labels.
tuple(
    part.shape
    for part in (x_train, y_train, x_valid, y_valid, X_test, y_test)
)

((16146, 13), (16146,), (200, 13), (200,), (4087, 13), (4087,))

In [13]:
# Seed TensorFlow's global random generator before any layer is created.
tf.random.set_seed(42)

# Normalization layer sized to one sample's feature shape; its statistics are
# computed later, when `adapt` is called on it.
norm_layer = tf.keras.layers.Normalization(input_shape=x_train.shape[1:])

# MLP regressor: normalization -> three 50-unit ReLU hidden layers -> a single
# linear output unit.
model = tf.keras.Sequential(
    [
        norm_layer,
        *[tf.keras.layers.Dense(50, activation="relu") for _ in range(3)],
        tf.keras.layers.Dense(1),
    ]
)

In [14]:
# Train with Adam on mean squared error and additionally report RMSE.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    loss="mse",
    optimizer=optimizer,
    metrics=["RootMeanSquaredError"],
)

# Fit the normalization statistics on the training features only, so that no
# information from the validation or test rows enters the preprocessing.
norm_layer.adapt(x_train)

In [15]:
# Train for 20 epochs, evaluating on the held-out validation rows after each
# epoch; the returned object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    validation_data=(x_valid, y_valid),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
# Score the trained model on the test split. The two returned values are
# unpacked as the compiled loss (MSE) and the compiled metric (RMSE).
mse_test, rmse_test = model.evaluate(x=X_test, y=y_test)



In [17]:
# Use the first three test rows as stand-ins for unseen data and predict
# their target values.
x_new = X_test.iloc[:3]
y_pred = model.predict(x=x_new)



In [24]:
# Put the three predictions next to the true targets of the same test rows.
# The labels are taken as a plain array so they are matched by position rather
# than by the test split's row labels.
dfp = pd.DataFrame(y_pred, columns=['Predicted']).assign(
    Actual=y_test.iloc[:3].to_numpy()
)

dfp

Unnamed: 0,Predicted,Actual
0,120012.875,151700.0
1,404686.625,260600.0
2,184703.78125,144800.0
