In [40]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense


In [41]:
# Import Data
data_import = pd.read_csv(".//house_predictions.csv")
print(data_import)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

## Prepare Data

In [42]:
# Create pandas dataframe
house_info = pd.DataFrame(data_import)
house_info.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [43]:
# Check for nulls
house_info.isnull().sum

<bound method DataFrame.sum of      price   area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0    False  False     False      False    False     False      False   
1    False  False     False      False    False     False      False   
2    False  False     False      False    False     False      False   
3    False  False     False      False    False     False      False   
4    False  False     False      False    False     False      False   
..     ...    ...       ...        ...      ...       ...        ...   
540  False  False     False      False    False     False      False   
541  False  False     False      False    False     False      False   
542  False  False     False      False    False     False      False   
543  False  False     False      False    False     False      False   
544  False  False     False      False    False     False      False   

     basement  hotwaterheating  airconditioning  parking  prefarea  \
0       False            False    

In [44]:
# Drop unnecessary columns
house_df = house_info.drop(columns=["guestroom", "prefarea", "furnishingstatus"], axis=1)
house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking
0,13300000,7420,4,2,3,yes,no,no,yes,2
1,12250000,8960,4,4,4,yes,no,no,yes,3
2,12250000,9960,3,2,2,yes,yes,no,no,2
3,12215000,7500,4,2,2,yes,yes,no,yes,3
4,11410000,7420,4,1,2,yes,yes,no,yes,2


In [45]:
# Convert Yes/No to 1/0 for scaling
converted_house_df = house_df.copy()
converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]] = converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]].replace({"yes": 1, "no": 0})
converted_house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking
0,13300000,7420,4,2,3,1,0,0,1,2
1,12250000,8960,4,4,4,1,0,0,1,3
2,12250000,9960,3,2,2,1,1,0,0,2
3,12215000,7500,4,2,2,1,1,0,1,3
4,11410000,7420,4,1,2,1,1,0,1,2


In [46]:
# Split X and Y
y = converted_house_df["price"]

x = converted_house_df.copy()
x = x.drop(columns="price")

In [47]:
# Train Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4)

In [48]:
# Scale numeric data
scaler = StandardScaler().fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

## Create Model

In [49]:
# Initial Model - Linear Regression
linear_regression_model = LinearRegression()

linear_regression_model.fit(x_train, y_train)
lr_model_predictions = linear_regression_model.predict(x_test)


In [50]:
# Test model
lr_model_r2_score = metrics.r2_score(y_test, lr_model_predictions)
lr_model_rmse = metrics.root_mean_squared_error(y_test, lr_model_predictions)
lr_model_mse = lr_model_rmse ** 2

print(f"R2 score: {lr_model_r2_score}  Mean Square Error: {lr_model_mse}  Root Mean Square Error: {lr_model_rmse}")

AttributeError: module 'sklearn.metrics' has no attribute 'root_mean_squared_error'

In [24]:
# Alternate model - Random Forrest

In [75]:
# Neural network model



# Load csv into  DataFrame
data_import = pd.read_csv("./house_predictions.csv")

# Drop unnecessary columns
house_df = data_import.drop(columns=["guestroom", "prefarea", "furnishingstatus"], axis=1)

# Convert yes/no to 1/0 for scaling
house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]] = house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]].replace({"yes": 1, "no": 0})

# missing values
house_df = house_df.fillna(house_df.mean())

# Split X and y
y = house_df["price"]
X = house_df.drop("price", axis=1)

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build the neural network model
model = Sequential([
    Dense(512, activation='linear', input_shape=(X_train.shape[1],)),
    Dense(256,activation='linear'),
    Dense(256, activation='linear'),
    Dense(128, activation='linear'),
    Dense(64, activation='linear'),
    Dense(1, activation='linear')  # Single output neuron for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=200, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print(f"Mean Squared Error on test set: {loss}")

# Make predictions
y_pred = model.predict(X_test)

# Calculate r2 value

r2 = r2_score(y_test, y_pred)

print(f"R-squared: {r2}")


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 24915927367680.0000 - val_loss: 24781451689984.0000
Epoch 2/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 26140615901184.0000 - val_loss: 24779365023744.0000
Epoch 3/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 25238729392128.0000 - val_loss: 24758911500288.0000
Epoch 4/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 24742885064704.0000 - val_loss: 24642353889280.0000
Epoch 5/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 24228103454720.0000 - val_loss: 24088261165056.0000
Epoch 6/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 24463240331264.0000 - val_loss: 22267685765120.0000
Epoch 7/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 21384851881984.0000 - val_loss: 19414520954880.0

## Visualization

In [None]:
# Visualization