In [1]:
import pandas as pd
import numpy as np

#### LOAD THE DATA

In [2]:
data=pd.read_csv("realtor-data.zip.csv")
data.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,


### PREPROSESSING

In [3]:
df=data.drop(["brokered_by","status","prev_sold_date","zip_code"],axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   price       float64
 1   bed         float64
 2   bath        float64
 3   acre_lot    float64
 4   street      float64
 5   city        object 
 6   state       object 
 7   house_size  float64
dtypes: float64(6), object(2)
memory usage: 135.9+ MB


In [5]:
df.isna().sum()

price           1541
bed           481317
bath          511771
acre_lot      325589
street         10866
city            1407
state              8
house_size    568484
dtype: int64

In [6]:
df

Unnamed: 0,price,bed,bath,acre_lot,street,city,state,house_size
0,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,920.0
1,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,1527.0
2,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,748.0
3,145000.0,4.0,2.0,0.10,1947675.0,Ponce,Puerto Rico,1800.0
4,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,
...,...,...,...,...,...,...,...,...
2226377,359900.0,4.0,2.0,0.33,353094.0,Richland,Washington,3600.0
2226378,350000.0,3.0,2.0,0.10,1062149.0,Richland,Washington,1616.0
2226379,440000.0,6.0,3.0,0.50,405677.0,Richland,Washington,3200.0
2226380,179900.0,2.0,1.0,0.09,761379.0,Richland,Washington,933.0


In [7]:
categoric=df.select_dtypes(include="O")
numeric=df.select_dtypes(exclude="O")

In [8]:
numeric.columns

Index(['price', 'bed', 'bath', 'acre_lot', 'street', 'house_size'], dtype='object')

#### FILLING NAN VALUES

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
numeric_impute=SimpleImputer(strategy='mean')

In [11]:
numeric[numeric.columns]=numeric_impute.fit_transform(numeric[numeric.columns])

In [12]:
categoric_impute=SimpleImputer(strategy='most_frequent')

In [13]:
categoric[categoric.columns]=categoric_impute.fit_transform(categoric[categoric.columns])

In [14]:
numeric.isna().sum()

price         0
bed           0
bath          0
acre_lot      0
street        0
house_size    0
dtype: int64

In [15]:
df=numeric.join(categoric)

In [16]:
df.isna().sum()

price         0
bed           0
bath          0
acre_lot      0
street        0
house_size    0
city          0
state         0
dtype: int64

In [17]:
df

Unnamed: 0,price,bed,bath,acre_lot,street,house_size,city,state
0,105000.0,3.0,2.0,0.12,1962661.0,920.000000,Adjuntas,Puerto Rico
1,80000.0,4.0,2.0,0.08,1902874.0,1527.000000,Adjuntas,Puerto Rico
2,67000.0,2.0,1.0,0.15,1404990.0,748.000000,Juana Diaz,Puerto Rico
3,145000.0,4.0,2.0,0.10,1947675.0,1800.000000,Ponce,Puerto Rico
4,65000.0,6.0,2.0,0.05,331151.0,2714.471335,Mayaguez,Puerto Rico
...,...,...,...,...,...,...,...,...
2226377,359900.0,4.0,2.0,0.33,353094.0,3600.000000,Richland,Washington
2226378,350000.0,3.0,2.0,0.10,1062149.0,1616.000000,Richland,Washington
2226379,440000.0,6.0,3.0,0.50,405677.0,3200.000000,Richland,Washington
2226380,179900.0,2.0,1.0,0.09,761379.0,933.000000,Richland,Washington


#### LABEL ENCODING

In [18]:
from sklearn.preprocessing import LabelEncoder

label_enc_city=LabelEncoder()
label_enc_stat=LabelEncoder()


df['city']=label_enc_city.fit_transform(df['city'])
df['state']=label_enc_stat.fit_transform(df['state'])

In [19]:
df

Unnamed: 0,price,bed,bath,acre_lot,street,house_size,city,state
0,105000.0,3.0,2.0,0.12,1962661.0,920.000000,92,41
1,80000.0,4.0,2.0,0.08,1902874.0,1527.000000,92,41
2,67000.0,2.0,1.0,0.15,1404990.0,748.000000,8788,41
3,145000.0,4.0,2.0,0.10,1947675.0,1800.000000,14314,41
4,65000.0,6.0,2.0,0.05,331151.0,2714.471335,10936,41
...,...,...,...,...,...,...,...,...
2226377,359900.0,4.0,2.0,0.33,353094.0,3600.000000,15072,51
2226378,350000.0,3.0,2.0,0.10,1062149.0,1616.000000,15072,51
2226379,440000.0,6.0,3.0,0.50,405677.0,3200.000000,15072,51
2226380,179900.0,2.0,1.0,0.09,761379.0,933.000000,15072,51


In [20]:
import joblib

In [21]:
joblib.dump(label_enc_city,"city_label_ecoding.joblib")
joblib.dump(label_enc_stat,"state_label_ecoding.joblib")

['state_label_ecoding.joblib']

In [22]:
label_enc_stat.transform(["New Jersey"])

array([32])

In [23]:
label_enc_city.transform(["Adjuntas"])

array([92])

#### TRAIN TEST SPLIT

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = df.drop("price",axis=1)
y = df["price"]

In [26]:
X

Unnamed: 0,bed,bath,acre_lot,street,house_size,city,state
0,3.0,2.0,0.12,1962661.0,920.000000,92,41
1,4.0,2.0,0.08,1902874.0,1527.000000,92,41
2,2.0,1.0,0.15,1404990.0,748.000000,8788,41
3,4.0,2.0,0.10,1947675.0,1800.000000,14314,41
4,6.0,2.0,0.05,331151.0,2714.471335,10936,41
...,...,...,...,...,...,...,...
2226377,4.0,2.0,0.33,353094.0,3600.000000,15072,51
2226378,3.0,2.0,0.10,1062149.0,1616.000000,15072,51
2226379,6.0,3.0,0.50,405677.0,3200.000000,15072,51
2226380,2.0,1.0,0.09,761379.0,933.000000,15072,51


In [27]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [28]:
X_train

Unnamed: 0,bed,bath,acre_lot,street,house_size,city,state
10715,4.000000,3.00000,4.000000,601919.0,2359.000000,4829,31
2197075,3.000000,3.00000,0.130000,463962.0,2090.000000,1825,51
1243748,3.275841,2.49644,4.850000,1168730.0,2714.471335,16507,33
942203,2.000000,1.00000,0.200000,505575.0,1200.000000,10639,17
796441,3.000000,4.00000,2.000000,1329698.0,2866.000000,19880,14
...,...,...,...,...,...,...,...
2219731,3.000000,1.00000,4.330000,1033913.0,960.000000,16757,51
963395,2.000000,1.00000,0.280000,972520.0,1626.000000,13251,19
2215104,3.000000,1.00000,0.350000,153792.0,984.000000,13168,51
1484405,3.000000,2.00000,15.223027,1694253.0,1568.000000,10105,34


#### STANDARD SCALING

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
joblib.dump(scaler,"standard_scaling.joblib")

['standard_scaling.joblib']

In [31]:
X_train

array([[ 5.47790940e-01,  3.62227074e-01, -1.56425433e-02, ...,
        -6.59997694e-04, -9.11275299e-01,  3.53752316e-01],
       [-2.08162151e-01,  3.62227074e-01, -2.14283193e-02, ...,
        -9.82744834e-04, -1.43612362e+00,  1.54557345e+00],
       [ 3.60499976e-04,  2.66097818e-04, -1.43717656e-02, ...,
        -2.33501944e-04,  1.12906379e+00,  4.72934430e-01],
       ...,
       [-2.08162151e-01, -1.07538192e+00, -2.10994121e-02, ...,
        -2.30972750e-03,  5.45685456e-01,  1.54557345e+00],
       [-2.08162151e-01, -3.56577422e-01,  1.13624836e-03, ...,
        -1.60904226e-03,  1.05288674e-02,  5.32525487e-01],
       [ 5.47790940e-01,  3.62227074e-01, -2.12489154e-02, ...,
        -1.15071732e-03,  1.48548542e+00,  5.92116544e-01]])

#### SET UP ARTIFICIAL NEURAL NETWORK

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

In [33]:
def r2_keras(y_true, y_pred):
    ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
    ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
    return 1 - ss_res / (ss_tot + tf.keras.backend.epsilon())

In [34]:
model=Sequential(
    [Dense(256,activation="elu",input_shape=(X_train.shape[1],)),
     Dense(128,activation="elu"),
     Dense(64,activation="elu"),
    Dense(32,activation="elu"),
     Dense(16,activation="elu"),
     Dense(8,activation="elu"),
     Dense(4,activation="elu"),
     Dense(2,activation="elu"),
    Dense(1,activation="elu")]
)
model.compile(optimizer='adam', loss='mse', metrics=['mae',r2_keras])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
model.summary()

#### TRAIN

In [36]:
model.fit(X_train,y_train,batch_size=32,        
    epochs=100,           
    validation_split=0.2, 
    verbose=1,shuffle=True  )

Epoch 1/100
[1m38962/38962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - loss: 2076092006400.0000 - mae: 370593.0938 - r2_keras: -0.8404 - val_loss: 23618398453760.0000 - val_mae: 371069.1250 - val_r2_keras: -375.7728
Epoch 2/100
[1m38962/38962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2ms/step - loss: 2187180638208.0000 - mae: 346542.5625 - r2_keras: -0.3493 - val_loss: 23071482183680.0000 - val_mae: 354684.7500 - val_r2_keras: -364.8175
Epoch 3/100
[1m38962/38962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - loss: 3017093414912.0000 - mae: 328311.3125 - r2_keras: -0.1565 - val_loss: 4676202266624.0000 - val_mae: 315224.5312 - val_r2_keras: -0.0193
Epoch 4/100
[1m38962/38962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2ms/step - loss: 7862760243200.0000 - mae: 316861.4375 - r2_keras: -0.0608 - val_loss: 4643088760832.0000 - val_mae: 311713.3438 - val_r2_keras: -0.0663
Epoch 5/100
[1m38962/38962[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x178415dbc80>

#### EVALUATING

In [37]:
339401/df["price"].mean()*100

64.74702425142156

In [38]:
284252.46875/df["price"].mean()*100

54.226420922986975

In [39]:
model.evaluate(X_test,y_test)

[1m20873/20873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 854us/step - loss: 1085924442112.0000 - mae: 295279.2500 - r2_keras: 0.1475


[2202979532800.0, 297103.40625, 0.12908028066158295]

In [40]:
preidiction=model.predict(X_test)

[1m20873/20873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 609us/step


#### saving

In [41]:
model.save("ann.keras")
#joblib.dump(model,"model.joblib")





In [42]:
state_categoric=categoric['state'].unique()
city_categoric=categoric['city'].unique()

In [43]:
joblib.dump(state_categoric,"state_names.joblib")
joblib.dump(city_categoric,"city_names.joblib")

['city_names.joblib']

In [44]:
x=np.array([[1,2,3,4,5,6,7]])
model.predict(x)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step


array([[286799.88]], dtype=float32)