In [1]:
# import libraries
import pandas as pd
import warnings
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# insert libraries for the required regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score

## Defining a function that builds the models to evaluate

In [2]:
# create a dict of standard models to evaluate {name:object}
def get_models(models=None):
    """Build the dictionary of regression models to evaluate.

    Args:
        models: Optional dict to add the models to, keyed by short name.
            When omitted, a new dict is created on every call.

    Returns:
        The dict with the keys 'lr', 'svr', 'dt' and 'rf' mapped to newly
        constructed estimators. Any of those keys already present in
        ``models`` is overwritten.
    """
    # A `dict()` default is evaluated once at definition time and would be
    # shared by every call, so entries added by one caller would leak into
    # the next call. Create the dict per call instead.
    if models is None:
        models = dict()
    # linear baseline
    models['lr'] = LinearRegression()
    # kernel-based model
    models['svr'] = SVR(kernel='rbf')
    # tree-based models
    models['dt'] = DecisionTreeRegressor()
    models['rf'] = RandomForestRegressor()
    return models

Creating a pipeline that standardizes and normalizes the features before fitting the model

In [3]:
# create a feature preparation pipeline for a model
def make_pipeline(model):
    """Wrap ``model`` in a pipeline that rescales the features first.

    The steps run in this order: 'standardize' (a StandardScaler),
    'normalize' (a MinMaxScaler), then 'model' (the estimator passed in).

    NOTE(review): min-max scaling after standardization appears to give the
    same result as min-max scaling alone, so the first step looks redundant
    - confirm before removing it.
    """
    return Pipeline(steps=[
        ('standardize', StandardScaler()),
        ('normalize', MinMaxScaler()),
        ('model', model),
    ])

In [4]:
# evaluate a single model
def evaluate_model(X, y, model, folds, metric):
    """Cross-validate ``model`` wrapped in the preparation pipeline.

    The estimator is first wrapped by ``make_pipeline`` and then passed to
    ``cross_val_score`` with ``metric`` as the scoring, ``folds`` as ``cv``
    and ``n_jobs=-1``.

    Returns:
        Whatever ``cross_val_score`` returns for the pipeline.
    """
    return cross_val_score(
        make_pipeline(model), X, y, scoring=metric, cv=folds, n_jobs=-1
    )

In [5]:
# evaluate a model, trapping errors and hiding warnings
def robust_evaluate_model(X, y, model, folds, metric):
    """Evaluate a model, returning None instead of raising when it fails.

    Warnings raised during the evaluation are suppressed in this process.

    Args:
        X, y, model, folds, metric: Passed unchanged to ``evaluate_model``.

    Returns:
        The scores from ``evaluate_model``, or None if it raised an
        ordinary exception.
    """
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            return evaluate_model(X, y, model, folds, metric)
    except Exception as exc:
        # Only ordinary errors are trapped: a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit, making a long run impossible
        # to interrupt. The reason is printed so the failure is not silent.
        print('evaluation failed: %s: %s' % (type(exc).__name__, exc))
        return None

In [None]:
# evaluate a dict of models {name:object}, returns {name:score}
def evaluate_models(X, y, models, folds=10, metric='accuracy'):
    """Evaluate every model in ``models`` and collect the scores.

    One progress line is printed per model: the mean and standard deviation
    of its scores, or 'error' when the evaluation returned None.

    NOTE(review): the default metric 'accuracy' looks like a classification
    scorer, while the cells in this notebook evaluate regressors and pass a
    regression metric explicitly - confirm whether the default should change.

    Returns:
        Dict mapping model name to its scores, for the models that
        produced scores.
    """
    collected = {}
    for label, estimator in models.items():
        outcome = robust_evaluate_model(X, y, estimator, folds, metric)
        # a None outcome means the evaluation failed for this model
        if outcome is None:
            print('>%s: error' % label)
            continue
        collected[label] = outcome
        print('>%s: %.3f (+/-%.3f)' % (label, mean(outcome), std(outcome)))
    return collected

In [7]:
# print the top n results
def summarize_results(results, maximize=True, top_n=10):
    """Print a ranking of the models by mean score.

    Args:
        results: Dict mapping model name to its collection of scores.
        maximize: When True (default) the highest mean score is ranked
            first; otherwise the lowest mean score is ranked first.
        top_n: Maximum number of models to list.

    Returns:
        None. The ranking is only printed; nothing is plotted.
    """
    # check for no results
    if not results:
        print('no results')
        return
    # (name, mean score) pairs in ascending order of mean score
    ranked = sorted(
        ((name, mean(scores)) for name, scores in results.items()),
        key=lambda pair: pair[1],
    )
    # reverse for descending order (e.g. when a larger score is better)
    if maximize:
        ranked.reverse()
    # clamp at zero so a negative top_n lists nothing rather than slicing
    # from the end of the ranking
    n = max(0, min(top_n, len(results)))
    print()
    for rank, (name, mean_score) in enumerate(ranked[:n], start=1):
        std_score = std(results[name])
        print('Rank=%d, Name=%s, Score=%.3f (+/- %.3f)' % (rank, name, mean_score, std_score))

## Load data and run the ML algorithms

In [8]:
# load dataset
path = "cleaned.csv"  # location of the cleaned dataset
df = pd.read_csv(path)
df.info()

# Features: every column except the target, the longitude and the
# transaction date.
# NOTE(review): X still contains 'Unnamed: 0', which looks like a row index
# written out with the CSV rather than a real feature - confirm whether it
# should be dropped as well (the scores would need to be regenerated).
X = df.drop(columns=['Y house price of unit area', 'X6 longitude', 'X1 transaction date'])
# Target: the house price per unit area
y = df['Y house price of unit area'].values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Unnamed: 0                              413 non-null    int64  
 1   X1 transaction date                     413 non-null    object 
 2   X2 house age                            413 non-null    float64
 3   X3 distance to the nearest MRT station  413 non-null    float64
 4   X4 number of convenience stores         413 non-null    int64  
 5   X5 latitude                             413 non-null    float64
 6   X6 longitude                            413 non-null    float64
 7   Y house price of unit area              413 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 25.9+ KB


In [13]:
# build the candidate regressors
models = get_models()

# cross-validate each model, scored by negative RMSE
# (the error is negated, so a larger score is still the better one)
results = evaluate_models(X, y, models=models, metric='neg_root_mean_squared_error')

# rank the models, best score first
summarize_results(results, maximize=True)

>lr: -8.069 (+/-1.148)
>svr: -8.070 (+/-1.322)
>dt: -8.379 (+/-1.377)
>rf: -6.519 (+/-0.918)

Rank=1, Name=rf, Score=-6.519 (+/- 0.918)
Rank=2, Name=lr, Score=-8.069 (+/- 1.148)
Rank=3, Name=svr, Score=-8.070 (+/- 1.322)
Rank=4, Name=dt, Score=-8.379 (+/- 1.377)


In [10]:
# build the candidate regressors
models = get_models()

# cross-validate each model, scored by R2
results = evaluate_models(X, y, models=models, metric='r2')

# rank the models, best score first
summarize_results(results, maximize=True)

>lr: 0.601 (+/-0.098)
>svr: 0.600 (+/-0.104)
>dt: 0.557 (+/-0.134)
>rf: 0.738 (+/-0.056)

Rank=1, Name=rf, Score=0.738 (+/- 0.056)
Rank=2, Name=lr, Score=0.601 (+/- 0.098)
Rank=3, Name=svr, Score=0.600 (+/- 0.104)
Rank=4, Name=dt, Score=0.557 (+/- 0.134)


The Random Forest Regressor algorithm has the best performance, based on the root mean square error and R2 metrics.
- The typical prediction error (RMSE) of the Random Forest is around 6.519 (in units of 10,000 TWD/ping), i.e. around 65,190 TWD/ping
- An R2 of 0.738 is fairly high: the closer R2 is to 1, the more of the variation in price is explained by the features