In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
[70]
# Load the dataset
df = pd.read_csv("house_prices.csv")

# Show the first 5 rows.
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script), so print it explicitly.
print(df.head())

# --- Data Wrangling ---

# 1. Remove duplicates (fully identical rows)
df = df.drop_duplicates()

# 2. Convert 'date' column to datetime format.
#    Skipped when the column is absent; values that cannot be parsed become
#    NaT because of errors='coerce' instead of raising.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# 3. Drop rows with missing values in key columns
important_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors']
df = df.dropna(subset=important_columns)

# 4. Remove rows with incorrect data (e.g. 0 bedrooms or bathrooms)
df = df[(df['bedrooms'] > 0) & (df['bathrooms'] > 0)]

[72]
# Model inputs. Any row lacking one of them, or lacking the target 'price',
# is discarded before modelling.
features = ['sqft_living', 'bedrooms', 'bathrooms', 'floors']
required_columns = [*features, 'price']
df = df.dropna(subset=required_columns)

[74]
# Split the frame into the target column (y) and the feature columns (X)
y = df['price']
X = df[features]
[76]
# Train the regression model
# Fits a linear regression of the target (y, price) on the selected feature
# columns (X). `fit` returns the estimator itself, so construction and
# training are chained; `model` is afterwards used for prediction.
model = LinearRegression().fit(X, y)

[78]
# Estimate the price of one hypothetical house.
# The values are listed in the same order as the names in `features`.
house = [2000, 3, 2, 1]
example = pd.DataFrame(data=[house], columns=features)

# predict() returns one value per input row; take the single result
prediction = model.predict(example)
predicted_price = prediction[0]

print(f"Predicted price: £{predicted_price:,.2f}")

Predicted price: £535,819.78

[80]
# Evaluate the model
# NOTE: predictions are made on X, the same data passed to model.fit, so
# these are in-sample scores.
y_pred = model.predict(X)

mse = mean_squared_error(y_true=y, y_pred=y_pred)
r2 = r2_score(y_true=y, y_pred=y_pred)

for report_line in (
    f"R² Score: {r2:.2f}",
    f"Mean Squared Error: {mse:,.0f}",
):
    print(report_line)
R² Score: 0.51
Mean Squared Error: 66,455,438,852

[82]
# Side-by-side table of the first ten actual prices and their predictions
first_ten = slice(None, 10)
comparison = pd.DataFrame({
    "Actual": y.iloc[first_ten],
    "Predicted": y_pred[first_ten],
})

# Render both columns as pound amounts with thousands separators and two
# decimals; kept as the final expression so the notebook displays it
currency = "£{:,.2f}"
comparison.style.format({"Actual": currency, "Predicted": currency})


[84]
#  9. Visualize Actual vs Predicted Prices
plt.figure(figsize=(8, 6))
plt.scatter(y, y_pred, alpha=0.3, color='blue')
# Dashed red diagonal: a point on this line has predicted == actual
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')

# Add green dot for a specific prediction
# (recomputed here with the same inputs as the earlier single-house example)
example = pd.DataFrame([[2000, 3, 2, 1]], columns=features)
predicted_price = model.predict(example)[0]

# Place it on the graph. The prediction is used for both coordinates, so
# the marker always sits on the diagonal.
plt.scatter(predicted_price, predicted_price, color='green', s=150,
            label=f"Predicted Example (£{predicted_price:,.0f})")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted House Prices")
plt.grid(True)
# Without a legend the label given to the green marker is never drawn
plt.legend()
plt.tight_layout()
plt.show()
# Printed once (the original repeated this line verbatim)
print(f"Predicted price: {predicted_price:,.2f}")

SyntaxError: invalid character '£' (U+00A3) (2925547350.py, line 54)