In [1]:
import pandas as pd

In [2]:
# Reading data
df = pd.read_csv('Data/archive/Melbourne_Housing_Data_Cleaned.csv')

In [3]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,LandArea,Council,Latitude,Longitude,Region,State
0,Abbotsford,85 Turner St,2,h,1480000.0,3/12/2016,2.5,3067,1,1,202.0,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,VIC
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,4/02/2016,2.5,3067,1,0,156.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,VIC
2,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC
3,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC
4,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC


In [4]:
# Dropping irrelevant columns
df = df.drop(columns=['Address', 'Date', 'Postcode', 'Council', 'Latitude', 'Longitude', 'State', 'Region'])

In [5]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bathroom,Car,LandArea
0,Abbotsford,2,h,1480000.0,2.5,1,1,202.0
1,Abbotsford,2,h,1035000.0,2.5,1,0,156.0
2,Abbotsford,3,h,1465000.0,2.5,2,0,134.0
3,Abbotsford,3,h,1465000.0,2.5,2,0,134.0
4,Abbotsford,3,h,1465000.0,2.5,2,0,134.0


In [6]:
# One-Hot Encoding of categorical values
df = pd.get_dummies(df, columns=['Suburb', 'Type'], drop_first=True)

In [7]:
df.head()

Unnamed: 0,Rooms,Price,Distance,Bathroom,Car,LandArea,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,...,Suburb_Williamstown North,Suburb_Windsor,Suburb_Wollert,Suburb_Wonga Park,Suburb_Wyndham Vale,Suburb_Yallambie,Suburb_Yarra Glen,Suburb_Yarraville,Type_t,Type_u
0,2,1480000.0,2.5,1,1,202.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1035000.0,2.5,1,0,156.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1465000.0,2.5,2,0,134.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1465000.0,2.5,2,0,134.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,1465000.0,2.5,2,0,134.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Splitting features from label
from sklearn.model_selection import train_test_split

X = df.drop('Price', axis=1)
y = df['Price']

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Scaling features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Training a linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [11]:
# Evaluating model
y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 5.686650875811283e+16


In [12]:
# Entering property preferences

In [13]:
suburb = input()

Caulfield


In [14]:
property_type = input()

h


In [15]:
bedrooms = input()

2


In [16]:
bathrooms = input()

2


In [17]:
car_spaces = input()

1


In [19]:
# Creating a new data
new_data = pd.DataFrame({
    'Suburb': [suburb],
    'Rooms': [bedrooms],
    'Type': [property_type],
    'Bathroom': [bathrooms],
    'Car': [car_spaces]
})

new_data

Unnamed: 0,Suburb,Rooms,Type,Bathroom,Car
0,Caulfield,2,h,2,1


In [20]:
# Preprocessing the new data in the same way as the training data
new_data = pd.get_dummies(new_data, columns=['Suburb', 'Type'], drop_first=True).reindex(columns = X.columns, fill_value=0)
new_data_scaled = scaler.transform(new_data)

# Making predictions
predicted_price = model.predict(new_data_scaled)

print(f"Expected Price: ${predicted_price[0]}")

Expected Price: $1395059.9453856745
