In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
data = pd.read_csv(r'/Users/leonardominski/Documents/Projetos Python/Pesquisa/data/housing.csv')

In [4]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
data["rooms_per_household"] = data["total_rooms"]/data["households"]
data["bedrooms_per_room"] = data["total_bedrooms"]/data["total_rooms"]
data["population_per_household"]=data["population"]/data["households"]
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [6]:
#Working with categorical variables and transforming it to numerical, on our way to make the data ready for the ML algorithm.
#Defining a variable related to the library
ocean = LabelEncoder()
#Using the function in the column that we want to transform. This line of code will transform data, for example: when the string data is inland, near ocean or near bay it will convert to 1,2,3.
data['ocean_proximity'] = ocean.fit_transform(data['ocean_proximity'])
#In this line we want to make a column for every variable that haves into the ocean_proximity label, witch is inland, near_ocean, near_bay and island. For example, if our row is inland, we want inland to be valued as 1 and the rest as 0. The value [9] is the position of the ocean proximity position in the column
ocean_hot = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [9])], remainder='passthrough')
#creating a new data with the categorical data transformed
new_data = ocean_hot.fit_transform(data)
#Creating a DataFrame
new_data = pd.DataFrame(new_data)
#Defining its columns
new_data.columns =['<1H_OCEAN', 
    'INLAND', 
    'NEAR_OCEAN',
    'NEAR_BAY', 
    'ISLAND', 
    'longitude', 
    'latitude', 
    'housing_median_age', 
    'total_rooms', 
    'total_bedrooms', 
    'population', 
    'households', 
    'median_income',
    'median_house_value',
    'rooms_per_household',
    'bedrooms_per_room',
    'population_per_household']

new_data.head(9999)

Unnamed: 0,<1H_OCEAN,INLAND,NEAR_OCEAN,NEAR_BAY,ISLAND,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
0,0.0,0.0,0.0,1.0,0.0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,0.146591,2.555556
1,0.0,0.0,0.0,1.0,0.0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.155797,2.109842
2,0.0,0.0,0.0,1.0,0.0,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,0.129516,2.802260
3,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945
4,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0.0,1.0,0.0,0.0,0.0,-121.05,39.13,10.0,3063.0,497.0,1168.0,507.0,4.4375,185100.0,6.041420,0.162259,2.303748
9995,0.0,1.0,0.0,0.0,0.0,-121.03,39.14,10.0,3138.0,524.0,1275.0,511.0,4.0775,164500.0,6.140900,0.166985,2.495108
9996,0.0,1.0,0.0,0.0,0.0,-121.07,39.13,8.0,4839.0,832.0,1977.0,762.0,4.0848,155900.0,6.350394,0.171936,2.594488
9997,0.0,1.0,0.0,0.0,0.0,-121.05,39.11,7.0,2767.0,423.0,1143.0,382.0,3.6333,170200.0,7.243455,0.152873,2.992147


In [7]:
new_data = new_data.dropna()
#new_data = new_data.drop(['<1H_OCEAN', 'INLAND', 'NEAR_OCEAN', 'NEAR_BAY', 'ISLAND'], axis = 1)
x = new_data.drop('median_house_value', axis =1)
y= new_data['median_house_value']
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
new_data.columns

Index(['<1H_OCEAN', 'INLAND', 'NEAR_OCEAN', 'NEAR_BAY', 'ISLAND', 'longitude',
       'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'median_house_value',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [10]:
from tensorflow.keras.regularizers import l2

model = tf.keras.Sequential([
                            tf.keras.layers.Dense(128, activation = 'relu', input_shape=(16,), kernel_regularizer=l2(0.01)), 
                            tf.keras.layers.Dense(64, activation = 'relu',kernel_regularizer=l2(0.01)),  
                            tf.keras.layers.Dense(32, activation = 'relu',kernel_regularizer=l2(0.01)),  
                            tf.keras.layers.Dense(1, activation = 'relu',kernel_regularizer=l2(0.01))
                             ])

In [11]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

In [12]:
history = model.fit(x_train, y_train, epochs=100, batch_size=64, validation_data=(x_test, y_test), verbose=2)


Epoch 1/100


2023-03-20 13:57:02.524620: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


256/256 - 1s - loss: 54964400128.0000 - mse: 54964400128.0000 - val_loss: 51412733952.0000 - val_mse: 51412733952.0000 - 573ms/epoch - 2ms/step
Epoch 2/100
256/256 - 0s - loss: 31439429632.0000 - mse: 31439429632.0000 - val_loss: 14095334400.0000 - val_mse: 14095334400.0000 - 132ms/epoch - 516us/step
Epoch 3/100
256/256 - 0s - loss: 11381289984.0000 - mse: 11381289984.0000 - val_loss: 10808506368.0000 - val_mse: 10808506368.0000 - 118ms/epoch - 461us/step
Epoch 4/100
256/256 - 0s - loss: 10035267584.0000 - mse: 10035267584.0000 - val_loss: 9869838336.0000 - val_mse: 9869838336.0000 - 114ms/epoch - 446us/step
Epoch 5/100
256/256 - 0s - loss: 9308094464.0000 - mse: 9308094464.0000 - val_loss: 9224348672.0000 - val_mse: 9224348672.0000 - 116ms/epoch - 452us/step
Epoch 6/100
256/256 - 0s - loss: 8782034944.0000 - mse: 8782034944.0000 - val_loss: 8735921152.0000 - val_mse: 8735921152.0000 - 118ms/epoch - 462us/step
Epoch 7/100
256/256 - 0s - loss: 8371771392.0000 - mse: 8371771392.0000 - va

In [21]:
print(np.sqrt(4334501888.0000))
new_data

65836.93407199335


Unnamed: 0,<1H_OCEAN,INLAND,NEAR_OCEAN,NEAR_BAY,ISLAND,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
0,0.0,0.0,0.0,1.0,0.0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,0.146591,2.555556
1,0.0,0.0,0.0,1.0,0.0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.155797,2.109842
2,0.0,0.0,0.0,1.0,0.0,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,0.129516,2.802260
3,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945
4,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.0,1.0,0.0,0.0,0.0,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,5.045455,0.224625,2.560606
20636,0.0,1.0,0.0,0.0,0.0,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,6.114035,0.215208,3.122807
20637,0.0,1.0,0.0,0.0,0.0,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,5.205543,0.215173,2.325635
20638,0.0,1.0,0.0,0.0,0.0,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,5.329513,0.219892,2.123209
