# 🏠 Melbourne Housing Market Analysis
Exploratory analysis and linear-regression price modeling using real estate data from Melbourne, Australia.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the housing CSV (path is relative to the working directory)
# into the DataFrame used by every later cell.
DATA_PATH = "melb_data.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

## 🧹 Data Cleaning

In [None]:
# Columns removed before analysis. The notebook describes them as having
# many missing values.
# NOTE(review): confirm that rationale holds for every entry (e.g. Postcode,
# Propertycount); it cannot be verified from this notebook alone.
dropped_columns = [
    "BuildingArea",
    "YearBuilt",
    "CouncilArea",
    "Car",
    "Propertycount",
    "Postcode",
]

# Cleaning pipeline, in order:
#   1. drop the columns listed above,
#   2. discard every row that still contains a missing value,
#   3. parse 'Date' as datetime, reading ambiguous dates day-first.
df = (
    df.drop(columns=dropped_columns)
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
)

# Print column dtypes and non-null counts for the cleaned frame.
df.info()

## 📊 Exploratory Analysis

In [None]:
# Mean sale price per suburb, then the ten highest means (descending)
# shown as the cell output.
mean_price_by_suburb = df.groupby("Suburb")["Price"].mean()
ranked_suburbs = mean_price_by_suburb.sort_values(ascending=False)
ranked_suburbs.head(10)

In [None]:
# Bar chart counting how many rows fall under each value of 'Type'.
type_ax = sns.countplot(x="Type", data=df)
type_ax.set_title("Distribution of Property Types")
plt.show()

In [None]:
# Box plot of the 'Price' distribution for each distinct 'Rooms' value.
rooms_ax = sns.boxplot(x="Rooms", y="Price", data=df)
rooms_ax.set_title("Price by Number of Rooms")
plt.show()

## 🤖 Simple Price Prediction Model

In [None]:
# Predictors for the regression; the target is the 'Price' column.
feature_columns = ["Rooms", "Distance", "Landsize", "Bathroom"]
X = df[feature_columns]
y = df["Price"]

# Hold out a test set (default split proportions; random_state fixed so the
# split is reproducible) and fit ordinary least squares on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows. MSE is in squared price units, so large values
# are expected.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
print(f"Mean Squared Error: {mse:.2f}")

### 📈 Additional Evaluation

In [None]:
# Coefficient of determination (R²) of the fitted model on the held-out
# test split, as reported by the estimator's own score() method.
r2 = model.score(X=X_test, y=y_test)
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices for the test split on a square figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, preds, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Housing Prices")

# Dashed red diagonal (predicted == actual) spanning the observed range of
# actual prices; points near this line are well predicted.
price_bounds = [y_test.min(), y_test.max()]
ax.plot(price_bounds, price_bounds, 'r--')

fig.tight_layout()
plt.show()

In [None]:
# One-column table of the fitted coefficients, one row per feature in X,
# shown as the cell output.
coef_df = pd.DataFrame({"Coefficient": model.coef_}, index=X.columns)
coef_df