In [None]:
import pandas as pd

# Load the raw CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../data/melb_data.csv")

# Drop the columns that the rest of this notebook does not use.
df = df.drop(
    columns=[
        "Address",
        "SellerG",
        "Date",
        "Postcode",
        "BuildingArea",
        "CouncilArea",
        "YearBuilt",
        "Lattitude",
        "Longtitude",
        "Regionname",
        "Propertycount",
        "Unnamed: 0",
    ]
)

# Report the per-column missing-value counts before rows are removed.
# This is printed explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded, so the counts were never actually shown.
print(df.isnull().sum())

# Keep only rows with no missing values in the remaining columns.
df = df.dropna()

# One-hot encode the remaining categorical/object columns; drop_first=True
# omits the first level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram with a KDE overlay for each column of interest. Every
# (column, title) pair is drawn and shown as its own figure, in this order.
hist_specs = [
    ("Price", "Price Distribution"),
    ("Distance", "Distance Distribution"),
    ("Rooms", "Rooms Distribution"),
    ("Car", "Cars Distribution"),
    ("Landsize", "Landsize Distribution"),
]

for column, title in hist_specs:
    sns.histplot(df[column], kde=True)
    plt.title(title)
    plt.show()


In [None]:
# Scatter plot of Price against each selected feature, one figure per feature.
scatter_specs = [
    ("Rooms", "Price vs Rooms"),
    ("Distance", "Price vs Distance"),
    ("Landsize", "Price vs Land Size"),
]

for feature, title in scatter_specs:
    sns.scatterplot(data=df, x=feature, y="Price")
    plt.title(title)
    plt.show()

In [None]:
# Correlation of every numeric column with Price, sorted from the largest
# (most positive) coefficient down to the smallest.
corr = df.corr(numeric_only=True)["Price"].sort_values(ascending=False)

# Remove Price's self-correlation by label rather than assuming it is the
# first entry after sorting (a feature tied at 1.0 could take that slot),
# then keep the ten largest coefficients. Ranking is by signed value, so
# strongly negative correlations are not included here.
top_corr = corr.drop("Price").head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_corr.values, y=top_corr.index)
plt.title("Top 10 Features Correlated with Price")
plt.xlabel("Correlation Coefficient")
plt.ylabel("Features")
plt.show()

# Leave the Suburb dummy columns out of the heatmap so it stays readable.
filtered_cols = [col for col in df.columns if not col.startswith("Suburb_")]

plt.figure(figsize=(12, 8))
sns.heatmap(df[filtered_cols].corr(), annot=True, cmap="coolwarm")
plt.title("Heatmap (excluding Suburb dummies)")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Features are every column except the target; the target is Price.
X = df.drop(columns=["Price"])
y = df["Price"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares linear model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
# mean_squared_error returns the MSE (squared units); take the square root so
# the value printed under the "RMSE" label really is the root-mean-squared error.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

# Predicted vs. actual prices on the held-out rows.
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train a 100-tree random forest (fixed seed) on the training split and
# predict prices for the held-out rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation metrics on the held-out rows, printed one per line.
for label, value in (
    ("MAE:", mean_absolute_error(y_test, y_pred_rf)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf))),
    ("R²:", r2_score(y_test, y_pred_rf)),
):
    print(label, value)

# Predicted vs. actual prices for the forest.
plt.scatter(y_test, y_pred_rf, alpha=0.3)
plt.title("Random Forest: Actual vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

# Feature importances reported by the fitted forest, labelled with the
# feature column names; plot the ten largest as horizontal bars.
importances = pd.Series(rf.feature_importances_, index=X.columns)
ranked = importances.sort_values(ascending=False)
ranked.head(10).plot.barh()
plt.title("Top 10 Important Features")
plt.xlabel("Feature Importance")
plt.show()