In [306]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model

df = pd.read_csv('final_data.csv')

scaler = MinMaxScaler()
inputs = scaler.fit_transform(df.drop(columns='House Value', axis=1))
print(inputs)
years = inputs[:, 0]
longitudes = inputs[:, 2]
latitudes = inputs[:, 3]
X = np.vstack((years, longitudes, latitudes)).T
print(X)

[[0.         0.         0.62797619 ... 0.639201   0.00281563 0.03720406]
 [0.         0.00104348 0.86607143 ... 0.32459426 0.01953852 0.03945885]
 [0.         0.0057971  0.81051587 ... 0.51810237 0.09617236 0.04735062]
 ...
 [1.         0.8284058  0.4156746  ... 0.405767   0.         0.02367531]
 [1.         0.82910145 0.39880952 ... 0.43196005 0.16895606 0.04960541]
 [1.         1.         0.39384921 ... 0.63420724 0.00147528 0.04171364]]
[[0.         0.62797619 0.54690832]
 [0.         0.86607143 0.34115139]
 [0.         0.81051587 0.39872068]
 ...
 [1.         0.4156746  0.6641791 ]
 [1.         0.39880952 0.7228145 ]
 [1.         0.39384921 1.        ]]


In [307]:
inc_model_ols = linear_model.LinearRegression()
incs = inputs[:, 4]
inc_model_ols.fit(X, incs)
print(inc_model_ols.coef_)
print(inc_model_ols.intercept_)

[ 0.06980302 -0.38360147 -0.32875502]
0.5292285408701546


In [308]:
age_model_ols = linear_model.LinearRegression()
ages = inputs[:, 5]
age_model_ols.fit(X, ages)
print(age_model_ols.coef_)
print(age_model_ols.intercept_)

[0.02694295 0.16858457 0.24819535]
0.21693590808510604


In [309]:
pop_model_ols = linear_model.LinearRegression()
pops = inputs[:, 6]
pop_model_ols.fit(X, pops)
print(pop_model_ols.coef_)
print(pop_model_ols.intercept_)

[ 0.01263346 -0.33613542 -0.53810437]
0.5564608659347152


In [310]:
room_model_ols = linear_model.LinearRegression()
rooms = inputs[:, 7]
room_model_ols.fit(X, rooms)
print(room_model_ols.coef_)
print(room_model_ols.intercept_)

[-0.0006191  -0.00452866 -0.00377596]
0.047386584133950686


In [311]:
from sklearn.model_selection import train_test_split
value_scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(inputs, value_scaler.fit_transform(df['House Value'].to_numpy().reshape(-1, 1)), test_size=0.2, random_state=42)


In [312]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input


model = Sequential([
    Dense(500, input_shape=(inputs.shape[1],), activation='LeakyReLU'), #rip relu
    Dense(1000, activation='LeakyReLU'),
    Dense(10, activation='LeakyReLU'),
    Dense(1, activation='LeakyReLU')
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_94 (Dense)            (None, 500)               4500      
                                                                 
 dense_95 (Dense)            (None, 1000)              501000    
                                                                 
 dense_96 (Dense)            (None, 10)                10010     
                                                                 
 dense_97 (Dense)            (None, 1)                 11        
                                                                 
Total params: 515,521
Trainable params: 515,521
Non-trainable params: 0
_________________________________________________________________


In [313]:
model.fit(X_train, y_train, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1850bf11460>

In [314]:
loss = model.evaluate(X_test, y_test)
print(loss)

0.007207685615867376


In [315]:
import statistics
data_cols = ["Year", "Zipcode", "Longitude", "Latitude", "Income", "Age", "Population", "Num Rooms", "House Value"]

error = []
for i in range(0, 17690):
    row = scaler.inverse_transform(pd.DataFrame(data=inputs[i].reshape(1, -1), columns=data_cols[:8]))
    prediction = value_scaler.inverse_transform(model(inputs[i].reshape(1, -1), training=False))
    err = prediction - df['House Value'].iloc[i]
    print(f'inputs: {row}, prediction: {prediction[0][0]}, error: {err[0][0]}')
    error.append(err[0][0])
print(f'error mean: {statistics.mean(map(abs, error))}')
print(f'error median: {statistics.median(map(abs, error))}')



inputs: [[ 2.0110e+03  8.9010e+04 -1.1793e+02  3.7680e+01  4.5893e+04  5.9100e+01
   3.1300e+02  4.6000e+00]], prediction: 256584.5076739788, error: 115284.5076739788
inputs: [[ 2.0110e+03  8.9019e+04 -1.1553e+02  3.5750e+01  3.5825e+04  3.3900e+01
   2.1720e+03  4.8000e+00]], prediction: 177528.89975905418, error: 61828.899759054184
inputs: [[ 2.0110e+03  8.9060e+04 -1.1609e+02  3.6290e+01  3.9211e+04  4.9400e+01
   1.0691e+04  5.5000e+00]], prediction: 162563.70410323143, error: 21363.70410323143
inputs: [[ 2.0110e+03  8.9061e+04 -1.1592e+02  3.6080e+01  4.4722e+04  4.5900e+01
   4.6580e+03  5.8000e+00]], prediction: 197137.0328962803, error: 34437.03289628029
inputs: [[ 2.0110e+03  8.9439e+04 -1.1999e+02  3.9510e+01  6.2008e+04  5.9100e+01
   1.4570e+03  5.4000e+00]], prediction: 342890.49834012985, error: 102890.49834012985
inputs: [[ 2.0110e+03  9.0001e+04 -1.1824e+02  3.3970e+01  3.6386e+04  2.6600e+01
   5.4760e+04  4.4000e+00]], prediction: 300256.0356259346, error: -9143.96437

In [316]:
from keras.models import load_model
model.save('housing_value_model')



INFO:tensorflow:Assets written to: housing_value_model\assets


INFO:tensorflow:Assets written to: housing_value_model\assets


In [317]:

pred_df = pd.DataFrame(columns = data_cols)
for index, row in df.iterrows():
    year = row['Year'] + 10
    zipcode = row['Zipcode']
    longitude = row['Longitude']
    latitude = row['Latitude']
    
    transform = scaler.transform(pd.DataFrame(data=[[year, zipcode, longitude, latitude, 0, 0, 0, 0]], columns=data_cols[:8]))

    reg_inputs = np.vstack((transform[:, 0], transform[:, 2], transform[:, 3])).T
    
    income_pred = inc_model_ols.predict(reg_inputs)[0]
    age_pred = age_model_ols.predict(reg_inputs)[0]
    pop_pred = pop_model_ols.predict(reg_inputs)[0]
    rooms_pred = room_model_ols.predict(reg_inputs)[0]

    transform = np.vstack((transform[:, 0], transform[:, 1], transform[:, 2], transform[:, 3], [income_pred], [age_pred], [pop_pred], [rooms_pred])).T

    housevalue_pred = value_scaler.inverse_transform(model(transform))[0][0]
    invtransform = scaler.inverse_transform(pd.DataFrame(data=[[0, 0, 0, 0, income_pred, age_pred, pop_pred, rooms_pred]], columns=data_cols[:8]))
    income_pred = invtransform[:, [4]][0][0]
    age_pred = invtransform[:, [5]][0][0]
    pop_pred = invtransform[:, [6]][0][0]
    rooms_pred = invtransform[:, [7]][0][0]
    print([year, zipcode, longitude, latitude, income_pred, age_pred, pop_pred, rooms_pred, housevalue_pred])

    pred_df.loc[len(pred_df.index)] = [year, zipcode, longitude, latitude, income_pred, age_pred, pop_pred, rooms_pred, housevalue_pred]

print(pred_df.head())
pred_df.to_csv('final_pred_data.csv', index=False)

[2021.0, 89010.0, -117.93, 37.68, 48558.7615193744, 47.02722476522248, 7238.981424080371, 5.006747113417678, 218681.37896060944]
[2021.0, 89019.0, -115.53, 35.75, 42695.472144438034, 46.15183335050964, 10650.241804023666, 4.980019825384848, 172479.5340001583]
[2021.0, 89060.0, -116.09, 36.29, 43285.758942098655, 46.54613542945965, 9282.456521808164, 4.983054506504583, 175983.32472145557]
[2021.0, 89061.0, -115.92, 36.08, 43506.211628738325, 46.328790461115254, 9991.484929518063, 4.983778315158327, 177592.95091032982]
[2021.0, 89439.0, -119.99, 39.51, 52087.114281599264, 48.14615047706186, 3205.0657933130883, 5.023495927143096, 273378.83815169334]
[2021.0, 90001.0, -118.24, 33.97, 83661.00319837792, 38.748772540425165, 32047.654910249894, 5.151571979726414, 790605.8514118196]
[2021.0, 90002.0, -118.24, 33.94, 83921.23838456458, 38.68518901850497, 32238.971666460267, 5.152643175815312, 797062.0888471605]
[2021.0, 90003.0, -118.27, 33.96, 84030.31183985184, 38.687388676924876, 32222.63696