In [1]:
# House Price Prediction – Custom Regression Project
# Author: Koussay Kraiem
#
# In this project, I create my own synthetic "house price" dataset.
# Kaggle blocks internet downloads, so instead of using a real-estate dataset,
# I generate realistic features myself (area, rooms, age, distance to center,
# location score).
#
# This project shows:
# - how a dataset is created
# - how regression can be used on real-estate style data
# - how prices depend on multiple features
# - how to evaluate a model using RMSE and R²

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import random

# 1. Build the synthetic housing dataset
# Seeding the global NumPy generator makes every draw below repeatable.
np.random.seed(42)

num_samples = 500

# Feature draws. The order of these calls determines which random numbers each
# feature receives, so it must not be rearranged.
area = np.random.normal(loc=120, scale=30, size=num_samples)               # m²
rooms = np.random.randint(low=2, high=7, size=num_samples)                 # integers 2..6
age = np.random.randint(low=1, high=50, size=num_samples)                  # integers 1..49 (upper bound is exclusive)
distance_to_center = np.random.uniform(low=1, high=20, size=num_samples)   # km
location_score = np.random.uniform(low=3, high=10, size=num_samples)       # score between 3 and 10

# Random disturbance added on top of the deterministic part of the price.
noise = np.random.normal(loc=0, scale=20000, size=num_samples)

# Price is a linear combination of the features plus noise. The terms are
# accumulated one at a time, in a fixed order, so the floating-point result
# is reproducible.
price = area * 1200
price = price + rooms * 15000
price = price - age * 800
price = price - distance_to_center * 2000
price = price + location_score * 10000
price = price + noise

# Assemble everything into one table; the target column goes last.
df = pd.DataFrame(
    dict(
        area_m2=area,
        rooms=rooms,
        house_age=age,
        distance_to_center_km=distance_to_center,
        location_score=location_score,
        price_usd=price,
    )
)

# Show a caption followed by the first five rows.
print("First rows of my custom dataset:", df.head(), sep="\n")

# 2. Separate the features from the target and hold out a test set
X = df.drop(columns="price_usd")
y = df["price_usd"]

# 20% of the rows are held out; random_state pins the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 3. Standardize features
# The scaler is fitted on the training rows only, then that same fitted
# transform is applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Fit an ordinary linear regression on the standardized training data
model = LinearRegression().fit(X_train_scaled, y_train)

# 5. Score the model on the held-out rows
y_pred = model.predict(X_test_scaled)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
for label, value, digits in (("RMSE:", rmse, 2), ("R² Score:", r2, 3)):
    print(label, round(value, digits))

# 6. Predict the price of one example house
# Take the first test row as a one-row DataFrame so it can go through the
# fitted scaler exactly like the rest of the data.
example_house = X_test.head(1)
example_scaled = scaler.transform(example_house)
(predicted_price,) = model.predict(example_scaled)

print("\nExample house features:", example_house, sep="\n")

print(f"\nPredicted price: ${predicted_price:,.0f}")

First rows of my custom dataset:
      area_m2  rooms  house_age  distance_to_center_km  location_score  \
0  134.901425      3         10               9.622001        9.942670   
1  115.852071      3         30               5.484959        3.694247   
2  139.430656      4         25               2.396438        5.810460   
3  165.690896      4         39               4.225400        8.600497   
4  112.975399      6         20              10.875705        4.428249   

       price_usd  
0  259328.682615  
1  215635.484394  
2  255563.414883  
3  280894.056631  
4  212813.477797  

Model Performance:
RMSE: 17248.5
R² Score: 0.898

Example house features:
        area_m2  rooms  house_age  distance_to_center_km  location_score
361  165.982167      6         46              11.170892        3.657001

Predicted price: $263,607
