# load data

In [1]:
import pandas as pd

# First look at the raw CSV exactly as stored on disk (no cleaning applied yet).
data = pd.read_csv("house_prices.csv")
data

Unnamed: 0.1,Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,Area_in_Marla
0,0,Flat,10000000,G-10,Islamabad,2,For Sale,2,4.0
1,1,Flat,6900000,E-11,Islamabad,3,For Sale,3,5.6
2,2,House,16500000,G-15,Islamabad,6,For Sale,5,8.0
3,3,House,43500000,Bani Gala,Islamabad,4,For Sale,4,40.0
4,4,House,7000000,DHA Defence,Islamabad,3,For Sale,3,8.0
...,...,...,...,...,...,...,...,...,...
99494,168435,Flat,7500000,Bahria Town Karachi,Karachi,3,For Sale,3,8.0
99495,168436,House,8800000,Bahria Town Karachi,Karachi,4,For Sale,3,8.0
99496,168438,House,14000000,Bahria Town Karachi,Karachi,3,For Sale,3,8.0
99497,168439,House,14000000,Bahria Town Karachi,Karachi,4,For Sale,4,14.0


# data engineering

In [2]:
def load_data(path="house_prices.csv"):
    """Load the house listings CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. use the first CSV column as the index,
      2. rename ``Area_in_Marla`` to ``area``,
      3. keep only rows whose ``area`` is strictly positive,
      4. drop rows that contain any missing value.

    Parameters
    ----------
    path : str or path-like, default "house_prices.csv"
        Location of the CSV file. The default keeps existing
        ``load_data()`` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The cleaned listings, indexed by the first column of the file.
    """
    raw = pd.read_csv(path)
    return (
        raw.set_index(raw.columns[0])
        .rename(columns={"Area_in_Marla": "area"})
        # Rows with a missing area compare False here and are removed as well.
        .loc[lambda frame: frame["area"] > 0]
        .dropna()
    )

# Replace the raw frame with the cleaned one and display it.
data = load_data()
data

Unnamed: 0_level_0,property_type,price,location,city,baths,purpose,bedrooms,area
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Flat,10000000,G-10,Islamabad,2,For Sale,2,4.0
1,Flat,6900000,E-11,Islamabad,3,For Sale,3,5.6
2,House,16500000,G-15,Islamabad,6,For Sale,5,8.0
3,House,43500000,Bani Gala,Islamabad,4,For Sale,4,40.0
4,House,7000000,DHA Defence,Islamabad,3,For Sale,3,8.0
...,...,...,...,...,...,...,...,...
168435,Flat,7500000,Bahria Town Karachi,Karachi,3,For Sale,3,8.0
168436,House,8800000,Bahria Town Karachi,Karachi,4,For Sale,3,8.0
168438,House,14000000,Bahria Town Karachi,Karachi,3,For Sale,3,8.0
168439,House,14000000,Bahria Town Karachi,Karachi,4,For Sale,4,14.0


# training data

In [3]:
# Keep a seeded 20% subsample and renumber the rows from 0, discarding the old index.
data = data.sample(frac=0.2, random_state=1234).reset_index(drop=True)
data

Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,area
0,House,10500000,Korangi,Karachi,1,For Sale,6,5.1
1,Flat,17000,Gulistan-e-Jauhar,Karachi,2,For Rent,2,3.1
2,House,10500000,Taramrri,Islamabad,5,For Sale,5,5.0
3,Flat,20000,Scheme 33,Karachi,2,For Rent,2,4.0
4,Flat,22000000,DHA Defence,Karachi,4,For Sale,4,8.0
...,...,...,...,...,...,...,...,...
19893,House,5800000,Farooq-e-Azam Road,Rawalpindi,3,For Sale,4,3.0
19894,House,17000000,Anda Mor Road,Karachi,3,For Sale,4,5.0
19895,House,32500000,Nazimabad,Karachi,4,For Sale,5,8.6
19896,Flat,3500000,Korangi,Karachi,2,For Sale,2,3.4


In [4]:
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is then split in half into test and validation.
# Both splits are seeded so the partition is reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=1234)
test, valid = train_test_split(test, test_size=0.5, random_state=1234)
len(train), len(test), len(valid)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Input columns passed to the baseline and the model pipeline below.
features = ["location", "city", "baths", "bedrooms", "area"]
train[features]

In [None]:
# Sorted list of the distinct city names found in the training split.
cities = sorted(train["city"].unique())
cities

In [None]:
# Name of the target column.
label = "price"
train[label]

In [None]:
# Histogram of the raw target values in the training split.
train[label].hist()

In [None]:
# Summary statistics of the raw target values in the training split.
train[label].describe()

In [None]:
# from sklearn.preprocessing import FunctionTransformer
# import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

# Turn the continuous price into 5 ordinal quantile bins. The binning is fitted
# on the training split only and then applied to every split.
# NOTE: this overwrites the price column in place, so re-running the cell would
# fit the bins on already-binned values. Re-create the splits before re-running.
label_pipeline = make_pipeline(
    # FunctionTransformer(func=np.log2, inverse_func=np.exp2),
    KBinsDiscretizer(
        n_bins=5,
        encode="ordinal",
        strategy="quantile",
        subsample=10000,
        random_state=1234,
    )
).fit(train[[label]])

for split in (train, test, valid):
    split[label] = label_pipeline.transform(split[[label]])

# Distribution of the binned training labels.
train[[label]].hist()

In [None]:
# Human-readable "low - high" label for each price bin, built from consecutive bin edges.
edges = label_pipeline.named_steps["kbinsdiscretizer"].bin_edges_[0]
classes = [f"{int(lo):,} - {int(hi):,}" for lo, hi in zip(edges[:-1], edges[1:])]
classes

## baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: a dummy estimator that ignores the features. Only one line is active
# at a time; the commented lines are the alternative strategies.
# baseline = DummyClassifier(strategy="uniform", random_state=1234) # Uniform random
# baseline = DummyClassifier(strategy="stratified", random_state=1234) # Stratified random
baseline = DummyClassifier(strategy="most_frequent", random_state=1234) # Zero-rule (most common)
# baseline = DummyClassifier(strategy="constant", constant=4, random_state=1234) # Zero-rule (constant)

# Regression counterparts, for use if the label is left continuous.
# from sklearn.dummy import DummyRegressor

# baseline = DummyRegressor(strategy="mean")
# baseline = DummyRegressor(strategy="median")
# baseline = DummyRegressor(strategy="quantile", quantile=0.3)
# baseline = DummyRegressor(strategy="constant", constant=1000.0)

baseline.fit(train[features], train[label])

# Reference scores that the real model has to beat.
train_score = baseline.score(train[features], train[label])
valid_score = baseline.score(valid[features], valid[label])

print(f"train score: {train_score:.3f}")
print(f"valid score: {valid_score:.3f}")

# feature engineering

In [None]:
# Histogram of the area column in the training split.
train["area"].hist()

In [None]:
# Summary statistics of area, with extra upper percentiles to inspect the right tail.
train["area"].describe(percentiles=[0.9, 0.99, 0.999])

In [None]:
import numpy as np

# The same area distribution on a log2 scale.
np.log2(train["area"]).hist()

In [None]:
# Histogram of the number of baths in the training split.
train["baths"].hist()

In [None]:
# Histogram of the number of bedrooms in the training split.
train["bedrooms"].hist()

# 3.5 bathrooms?, 1.5 bedrooms?

In [None]:
# Counts per property type. This column is inspected here but is not part of `features`.
train["property_type"].hist()

# model development

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

def create_pipeline():
    """Build a fresh, unfitted model pipeline.

    Steps, in order:
      1. a column transformer: TF-IDF on ``location``, a 5-feature hashing
         vectorizer on ``city``, one-hot encoding of ``baths`` and
         ``bedrooms``, standard scaling of ``area``; any other column is
         passed through unchanged,
      2. random under-sampling (seeded),
      3. a 10-tree random forest classifier (seeded).

    Returns
    -------
    imblearn.pipeline.Pipeline
        A new pipeline on every call, so it can be fitted or tuned without
        affecting earlier instances.
    """
    # Alternatives tried earlier for the numeric columns: KBinsDiscretizer on
    # baths/bedrooms/area and a log2 FunctionTransformer on area.
    preprocessor = make_column_transformer(
        # Text vectorizers take a single column name (1-D input), not a list.
        (TfidfVectorizer(decode_error="ignore"), "location"),
        (HashingVectorizer(n_features=5, decode_error="ignore"), "city"),
        (OneHotEncoder(handle_unknown="infrequent_if_exist"), ["baths"]),
        (OneHotEncoder(handle_unknown="infrequent_if_exist"), ["bedrooms"]),
        (StandardScaler(), ["area"]),
        remainder="passthrough",
    )
    balancer = RandomUnderSampler(random_state=1234)
    # Swap in RandomForestRegressor(n_estimators=10, random_state=1234) for a regression variant.
    classifier = RandomForestClassifier(n_estimators=10, random_state=1234)
    return make_pipeline(preprocessor, balancer, classifier)

# Fit a fresh pipeline on the training split (timed) and compare train vs validation score.
pipeline = create_pipeline()
%time pipeline.fit(train[features], train[label])

# Scores are rounded to 3 decimals before being formatted.
train_score = round(pipeline.score(train[features], train[label]), 3)
valid_score = round(pipeline.score(valid[features], valid[label]), 3)

print(f"train score: {train_score:.3f}")
print(f"valid score: {valid_score:.3f}")

In [None]:
# Number of input features seen by the final pipeline step (the classifier) during fit.
pipeline.steps[-1][1].n_features_in_

## hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

pipeline = create_pipeline()

params = dict(
    randomforestclassifier__criterion=["gini", "entropy", "log_loss"],
    # randomforestregressor__criterion=["squared_error", "absolute_error", "friedman_mse", "poisson"],
    randomforestclassifier__max_features=[None, "sqrt", "log2"],
)

search = RandomizedSearchCV(estimator=pipeline, param_distributions=params, n_iter=10, cv=5, random_state=1234)
%time search.fit(train[features], train[label])

print(f"best params: {search.best_params_}")
print(f"best score: {search.best_score_:.3f}")

## classification report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the best estimator found by the search, on the validation split.
y_pred = search.best_estimator_.predict(valid[features])
print(classification_report(valid[label], y_pred))

# model evaluation

In [None]:
def predict(pipeline, label_pipeline, sample):
    """Return the predicted probability of each price range for one listing.

    Parameters
    ----------
    pipeline : fitted classifier exposing ``predict_proba``
        The model used for the prediction.
    label_pipeline : fitted label pipeline
        Must expose ``named_steps["kbinsdiscretizer"].bin_edges_``. The
        price-range labels are derived from these edges, so the function
        does not depend on a notebook-global ``classes`` list.
    sample : dict
        Feature name -> value for a single listing.

    Returns
    -------
    dict
        ``"low - high"`` price-range label -> predicted probability, in
        the column order returned by ``predict_proba``.
    """
    frame = pd.DataFrame([sample])
    probabilities = pipeline.predict_proba(frame)[0]

    # Same label format as the notebook's `classes` list: consecutive bin
    # edges rendered as integers with thousands separators.
    edges = label_pipeline.named_steps["kbinsdiscretizer"].bin_edges_[0]
    labels = [f"{int(lo):,} - {int(hi):,}" for lo, hi in zip(edges[:-1], edges[1:])]

    return dict(zip(labels, probabilities))

# Smoke test: a very small listing (1 bath, 1 bedroom, area 1.0).
# NOTE(review): the data preview lists "Bani Gala" under Islamabad, not Lahore;
# confirm whether this mismatched pairing is intentional.
sample = {
    "city": "Lahore",
    "location": "Bani Gala",
    "baths": 1,
    "bedrooms": 1,
    "area": 1.0,
}

%time y_pred = predict(search.best_estimator_, label_pipeline, sample)
print(y_pred)

In [None]:
# Same listing with the area nudged from 1.0 to 1.1, to see how a small input change moves the prediction.
sample = {
    "city": "Lahore",
    "location": "Bani Gala",
    "baths": 1,
    "bedrooms": 1,
    "area": 1.1,
}

%time y_pred = predict(search.best_estimator_, label_pipeline, sample)
print(y_pred)

In [None]:
# A much larger listing (5 baths, 5 bedrooms, area 10.0) in the same location, for contrast.
sample = {
    "city": "Lahore",
    "location": "Bani Gala",
    "baths": 5,
    "bedrooms": 5,
    "area": 10.0,
}

%time y_pred = predict(search.best_estimator_, label_pipeline, sample)
print(y_pred)

In [None]:
from sklearn.model_selection import cross_val_score

# Re-evaluate the tuned configuration with 5-fold cross-validation on the training split.
# A fresh pipeline is built and given the best parameters found by the search.
pipeline = create_pipeline()
pipeline.set_params(**search.best_params_)

%time cv_scores = cross_val_score(pipeline, train[features], train[label], cv=5)
cv_scores

In [None]:
# Average of the per-fold scores.
print(f"cv mean score: {cv_scores.mean():.3f}")

## model calibration (classification only)

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Build a fresh pipeline with the tuned parameters and wrap it in a 5-fold
# probability calibrator.
pipeline = create_pipeline()
pipeline.set_params(**search.best_params_)

# From here on `pipeline` refers to the calibrated wrapper, not the bare pipeline.
pipeline = CalibratedClassifierCV(estimator=pipeline, cv=5)
pipeline.fit(train[features], train[label])

train_score = round(pipeline.score(train[features], train[label]), 3)
valid_score = round(pipeline.score(valid[features], valid[label]), 3)

print(f"train score: {train_score:.3f}")
print(f"valid score: {valid_score:.3f}")

In [None]:
# Final check on the held-out test split, which is not used for fitting or tuning in this notebook.
print(f"test score: {pipeline.score(test[features], test[label]):.3f}")

# export "artifacts"

In [None]:
import joblib

# Persist the fitted model and the label binning for the app.
# When cells are run in order, `pipeline` is the calibrated classifier from the calibration section.
joblib.dump(pipeline, "pipeline.joblib")
joblib.dump(label_pipeline, "label_pipeline.joblib")

In [None]:
# City names for the app's dropdown: the sorted distinct values from the training split.
cities = sorted(train["city"].unique())
cities

In [None]:
# Persist the city list for the app.
joblib.dump(cities, "cities.joblib")

In [None]:
# Bin edges learned by the price discretiser (first and only feature).
edges = label_pipeline.named_steps["kbinsdiscretizer"].bin_edges_[0]
edges

In [None]:
# One "low - high" label per bin, pairing each edge with the next one.
classes = [f"{int(lo):,} - {int(hi):,}" for lo, hi in zip(edges[:-1], edges[1:])]
classes

In [None]:
# Persist the price-range labels for the app.
joblib.dump(classes, "classes.joblib")

In [None]:
# !pip install gradio

In [None]:
%%writefile app.py
# !pip install gradio ipywidgets
import pandas as pd
import gradio as gr
import joblib
import numpy as np

# "Artifacts"
# Load the objects exported by the notebook. The files are read from the current
# working directory, so the app must be started from the folder that contains them.
# NOTE: joblib files are pickle-based; only load artifacts you produced yourself.
pipeline = joblib.load("pipeline.joblib")
label_pipeline = joblib.load("label_pipeline.joblib")  # loaded but not referenced elsewhere in this file
cities = joblib.load("cities.joblib")
classes = joblib.load("classes.joblib")

def predict(city, location, area, bedrooms, baths):
    """Predict price-range probabilities for a single listing.

    The arguments arrive in the order of the Gradio input components.

    Returns
    -------
    dict
        Price-range label (from ``classes``) -> predicted probability,
        paired in the column order returned by ``predict_proba``.
    """
    # Keys must match the feature names the pipeline was trained with.
    row = {
        "city": city,
        "location": location,
        "area": area,
        "bedrooms": bedrooms,
        "baths": baths,
    }
    probabilities = pipeline.predict_proba(pd.DataFrame([row]))[0]
    return dict(zip(classes, probabilities))

# https://www.gradio.app/guides
# Random initial values so the form opens pre-filled with an example.
# These names were previously used below without being defined, which raised a
# NameError as soon as the app was started.
# np.random.choice returns NumPy scalars; they are cast to plain Python types
# before being handed to the components.
city_init = str(np.random.choice(cities))
# Start at 0.5 (not 0) so the initial value respects the Number widget's minimum.
area_init = float(np.random.choice(np.arange(0.5, 50, 0.5)))
bedrooms_init = int(np.random.choice(np.arange(0, 10, 1)))
baths_init = int(np.random.choice(np.arange(0, 10, 1)))

with gr.Blocks() as demo:
    # Input widgets, one per model feature.
    city = gr.Dropdown(cities, value=city_init, label="City")
    location = gr.Textbox(label="Location", placeholder="E.g. Bangkhen")
    area = gr.Number(label="Area", value=area_init, minimum=0.5, step=0.5)
    bedrooms = gr.Slider(value=bedrooms_init, label="Bedrooms", minimum=0, maximum=10, step=1)
    baths = gr.Slider(value=baths_init, label="Baths", minimum=0, maximum=10, step=1)

    predict_btn = gr.Button("Predict", variant="primary")
    # gr.Label is given the {class label: probability} dict returned by predict().
    price = gr.Label(label="Price")

    # Order must match the positional parameters of predict().
    inputs = [city, location, area, bedrooms, baths]
    outputs = [price]

    predict_btn.click(predict, inputs=inputs, outputs=outputs)

# Start the web UI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch() # Local machine only
    # demo.launch(server_name="0.0.0.0") # LAN access to local machine
    # demo.launch(share=True) # Public access to local machine

In [None]:
# Execute the generated app.py as a script in this kernel, which launches the Gradio app.
%run app.py

In [None]:
%%writefile requirements.txt
pandas
scikit-learn
imbalanced-learn
gradio