# **Import File**

### Import File

In [5]:
# Import libraries
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'Data/Melbourne_housing_FULL.csv'
df = pd.read_csv(file_path)

# Printing how many rows the file has
row_count = len(df)

print(f'The DataFrame has {row_count} rows.')

# Preview the data
df


The DataFrame has 34857 rows.


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


# **Data Processing**

Removing null values and dropping unecessarry columns

In [8]:
# Remove rows where critical columns have missing values
df = df.dropna(subset=['Price', 'Distance', 'Landsize', 
'BuildingArea', 'Bedroom2', 'Bathroom', 'Car'])

# Drop unnecessary columns to simplify the dataset
df = df.drop(columns=['Rooms','Postcode', 'YearBuilt','Type','Propertycount', 'Longtitude', 
'Lattitude',  'CouncilArea', 'Regionname', 'Suburb', 'Address', 'Method', 'SellerG', 'Date'])
row_count = len(df)

print(f'The DataFrame has {row_count} rows.')
df

# Drop rows where Price is less than 100k or more than 10M
df = df[(df['Price'] >= 100000) & (df['Price'] <= 10000000)]

row_count = len(df)
print(f'The DataFrame has {row_count} rows.')
df


The DataFrame has 9244 rows.
The DataFrame has 9244 rows.


Unnamed: 0,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea
2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0
4,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0
6,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0
11,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0
14,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0
...,...,...,...,...,...,...,...
34847,500000.0,25.5,3.0,2.0,2.0,383.0,118.0
34849,570000.0,25.5,3.0,2.0,2.0,404.0,158.0
34853,888000.0,6.3,2.0,2.0,1.0,98.0,104.0
34854,705000.0,6.3,2.0,1.0,2.0,220.0,120.0


Removing decimal places and converting feilds to integers

Checking for duplicates and removing them (None in this dataset

In [11]:
# Check for duplicates
duplicates = df.duplicated()
print(f"Number of duplicate rows is {duplicates.sum()}")

# Remove duplicate rows
df = df.drop_duplicates()

row_count = len(df)
print(f'The DataFrame has {row_count} rows.')


Number of duplicate rows is 16
The DataFrame has 9228 rows.


Data Normalisation

In [13]:
pd.options.mode.copy_on_write = True
# Define price bins 
bins = np.arange(100000, 10000000, 100000)  
labels = range(len(bins) - 1)  # Create labels for the bins

# Create a new column for price ranges
df.loc[:, 'PriceRange'] = pd.cut(df['Price'], bins=bins, labels=labels, right=False)




# **Model Building**

### Random Forest Clustering

Split Data

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans


# Split data into X and y
X = df.drop(columns=['PriceRange'])
y = df['PriceRange']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Training the Model

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_classifier.predict(X_test)



Evaulate performance

In [21]:
import warnings
warnings.filterwarnings('ignore') 
# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Accuracy
print(f'Accuracy: {accuracy_score(y_test, y_pred_rf)}')

Random Forest Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.74      0.85        19
           2       0.92      1.00      0.96        54
           3       1.00      1.00      1.00       120
           4       0.99      1.00      1.00       187
           5       1.00      0.99      1.00       197
           6       1.00      1.00      1.00       177
           7       0.99      1.00      1.00       156
           8       1.00      1.00      1.00       134
           9       1.00      0.99      0.99        96
          10       0.95      0.98      0.96        96
          11       0.89      0.98      0.94       103
          12       0.88      0.95      0.91        91
          13       0.78      0.88      0.83        65
          14       0.75      0.73      0.74        45
          15       0.77      0.67      0.71        45
          16       0.77      0.73      0.75        41
          17       0.69      0.56      0.62 

Training the Model

In [23]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=2, random_state=100)
rf.fit(X_train, y_train)

Applying the Model

In [25]:
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)

Evaluating Performance

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)


rf_results = pd.DataFrame(['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']

rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,5.616114,0.8817,5.014194,0.887547
