In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from datetime import datetime 

In [2]:
# Read the raw listings from disk and preview the first rows.
DATA_PATH = 'uk_real_estate.csv'
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,Price,Bedrooms,Bathrooms,SqFt,City,Postcode,Year_Built,Type,Garage,Lot_Area
0,310506,5,3,1136,Birmingham,B,1834,Bungalow,0,4067
1,316926,1,1,2718,Manchester,SW,1983,House,1,3601
2,65337,3,1,2882,Glasgow,SW,1899,Bungalow,0,3693
3,206705,1,2,2660,Glasgow,SW,1971,House,1,2540
4,634260,1,1,1475,Edinburgh,B,1978,House,0,1582


In [3]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Price       5000 non-null   int64 
 1   Bedrooms    5000 non-null   int64 
 2   Bathrooms   5000 non-null   int64 
 3   SqFt        5000 non-null   int64 
 4   City        5000 non-null   object
 5   Postcode    5000 non-null   object
 6   Year_Built  5000 non-null   int64 
 7   Type        5000 non-null   object
 8   Garage      5000 non-null   int64 
 9   Lot_Area    5000 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 390.8+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Price,Bedrooms,Bathrooms,SqFt,Year_Built,Garage,Lot_Area
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,517988.8596,3.0292,1.9982,1730.879,1911.69,0.5,2761.5972
std,271061.387684,1.402976,0.817759,722.729818,64.294739,0.50005,1308.415414
min,50148.0,1.0,1.0,500.0,1800.0,0.0,500.0
25%,278808.75,2.0,1.0,1103.75,1855.75,0.0,1617.75
50%,520051.5,3.0,2.0,1715.5,1912.0,0.5,2746.0
75%,749902.5,4.0,3.0,2352.0,1967.0,1.0,3893.5
max,999981.0,5.0,3.0,2998.0,2022.0,1.0,4999.0


In [5]:
# Combine the bedroom and bathroom counts into a single room-count column.
df = df.assign(Total_rooms=df['Bedrooms'] + df['Bathrooms'])

In [6]:
# Preview the first rows again to confirm the Total_rooms column is present.
df.head()

Unnamed: 0,Price,Bedrooms,Bathrooms,SqFt,City,Postcode,Year_Built,Type,Garage,Lot_Area,Total_rooms
0,310506,5,3,1136,Birmingham,B,1834,Bungalow,0,4067,8
1,316926,1,1,2718,Manchester,SW,1983,House,1,3601,2
2,65337,3,1,2882,Glasgow,SW,1899,Bungalow,0,3693,4
3,206705,1,2,2660,Glasgow,SW,1971,House,1,2540,3
4,634260,1,1,1475,Edinburgh,B,1978,House,0,1582,2


In [7]:
# Remove the raw room counts (Total_rooms is their sum) and the two
# location columns; df is rebound to the narrower frame.
df = df.drop(columns=['Bedrooms', 'Bathrooms', 'City', 'Postcode'])

In [8]:
# Spot-check five random rows; no random_state is passed, so the rows shown
# differ from run to run.
df.sample(5)

Unnamed: 0,Price,SqFt,Year_Built,Type,Garage,Lot_Area,Total_rooms
4161,241779,1250,1821,House,0,3810,4
45,195053,2682,1988,House,1,4705,7
1435,152724,2532,1802,Flat,0,3183,3
1206,924206,1070,1832,House,1,3088,5
399,567549,1351,2022,Flat,1,1995,3


In [9]:
# Price is the target (it is the first column of df); every other column is
# a feature.
Y = df['Price']
X = df.drop(columns='Price')

In [10]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive the model's input features from the listing columns.

    ``transform`` adds two columns to a copy of its input:

    * ``House_Age``    -- reference year minus ``Year_Built``
    * ``Lot_per_SqFt`` -- ``Lot_Area`` divided by ``SqFt``

    and returns only the columns named in ``OUTPUT_COLUMNS``.

    Parameters
    ----------
    reference_year : int or None, default=None
        Year that ``House_Age`` is measured from. When ``None``, the calendar
        year at ``fit`` time is used. Either way the value is frozen in
        ``reference_year_`` during ``fit`` so that a persisted (e.g. pickled)
        transformer keeps producing the same ages it was trained with, rather
        than drifting whenever the wall-clock year changes.

    Attributes
    ----------
    reference_year_ : int
        The year actually used for ``House_Age``; set by ``fit``.
    """

    # Columns handed on to the next pipeline step, in this order.
    OUTPUT_COLUMNS = ["House_Age", "Lot_per_SqFt", "Type", "Garage", "Total_rooms"]

    def __init__(self, reference_year=None):
        # Stored unmodified so BaseEstimator.get_params / clone can round-trip it.
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Freeze the reference year; nothing is learned from ``X`` or ``y``.

        Returns
        -------
        self
        """
        if self.reference_year is None:
            self.reference_year_ = datetime.now().year
        else:
            self.reference_year_ = self.reference_year
        return self

    def transform(self, X):
        """Return a new frame holding the engineered feature columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Year_Built``, ``Lot_Area``, ``SqFt``, ``Type``,
            ``Garage`` and ``Total_rooms``; a missing column raises
            ``KeyError``. The input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            The columns listed in ``OUTPUT_COLUMNS``.
        """
        X = X.copy()

        # Prefer the year frozen by fit(). The fallbacks keep transform()
        # usable on an instance that was never fitted (or was unpickled from
        # before reference_year existed), which previously worked.
        reference_year = getattr(self, "reference_year_", None)
        if reference_year is None:
            reference_year = getattr(self, "reference_year", None)
        if reference_year is None:
            reference_year = datetime.now().year

        X["House_Age"] = reference_year - X["Year_Built"]
        X["Lot_per_SqFt"] = X["Lot_Area"] / X["SqFt"]

        return X[self.OUTPUT_COLUMNS]

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# X_train already is a DataFrame (it exposes .columns), so rebuilding it from
# itself changes nothing. Check the invariant the pipeline relies on instead:
# the training split is a DataFrame carrying every feature column of X.
assert isinstance(X_train, pd.DataFrame)
assert list(X_train.columns) == list(X.columns)

In [13]:
# List the columns left in df after the feature/drop steps above.
df.columns

Index(['Price', 'SqFt', 'Year_Built', 'Type', 'Garage', 'Lot_Area',
       'Total_rooms'],
      dtype='object')

In [14]:
# Columns emitted by FeatureEngineer, grouped by how they are preprocessed.
num_feature = ["House_Age", "Lot_per_SqFt", "Garage", "Total_rooms"]
cat_feature = ["Type"]

preprocessor = ColumnTransformer([
    # log1p is applied to every numeric column, including the 0/1 Garage flag.
    ("log", FunctionTransformer(np.log1p), num_feature),
    # Categories unseen during fit are encoded as all zeros instead of raising.
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_feature)
])

pipeline = Pipeline([
    ("feature", FeatureEngineer()),
    ("preprocess", preprocessor),
    # Seeded (same seed as the train/test split) so the fitted forest and the
    # metrics below are reproducible across runs.
    ("model", RandomForestRegressor(random_state=42))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(mean_squared_error(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

78737580471.77199
280602.1747452646


In [15]:
import pickle

file = 'houseprice.pkl'

# Serialise the fitted pipeline to disk. pickle records the custom
# FeatureEngineer class by reference, so that class definition has to be
# available again wherever this file is loaded.
with open(file, mode='wb') as model_file:
    pickle.dump(pipeline, model_file)