In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.utils import resample

Loading the data and setting hyperparameters

In [2]:
data = pd.read_csv('Melbourne_housing_FULL.csv')

In [3]:
target_column = 'Price'

In [4]:
# hyperparameters
lof_n_neighbors = 20
pca_n_components = 3
n_random_subset = 5

In [5]:
X = data.drop([target_column], axis=1)
y = data[target_column]

# transform categorical variables to numerical variables
le = LabelEncoder()
X['Suburb'] = le.fit_transform(X['Suburb'])
X['Address'] = le.fit_transform(X['Address'])
X['Regionname'] = le.fit_transform([str(rn) for rn in X['Regionname']])
X['CouncilArea'] = le.fit_transform([str(ca) for ca in X['CouncilArea']])
X['SellerG'] = le.fit_transform(X['SellerG'])
X['Type'] = le.fit_transform(X['Type'])
X['Method'] = le.fit_transform(X['Method'])
X['Date'] = [d.split('/')[2] for d in X['Date']]

Choosing of the methods for datapreprocessing automatically by MCPS

In [6]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
data_train = pd.concat([X_train, y_train], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

In [7]:
# predict the accuracy of the preprocessed training dataset with a 10-fold cross-validation
def predict_accuracy(dataset_train):
    X, y = getXy(dataset_train)
    cv_results = cross_validate(DecisionTreeRegressor(random_state=0), X, y, cv=10, scoring=('r2'))
    return cv_results['test_score'].mean()

In [8]:
#variables
best_score = 0
best_method = ''
best_train_data = data_train.copy()
mcps = []
all_scores = {}

In [9]:
def init_score():
    global best_score, best_method
    best_score = 0
    best_method = ""

In [10]:
def update_score(dataset_train, method):
    global best_score, best_method, best_train_data, all_scores
    score = predict_accuracy(dataset_train)
    all_scores[method] = score
    if score > best_score:
        best_score = score
        best_method = method
        best_train_data = dataset_train.copy()

In [11]:
def getXy(dataset):
    X_data = dataset.drop([target_column], axis=1)
    y_data = dataset[target_column]
    return X_data, y_data

In [12]:
# handling missing values
init_score()
data_train = best_train_data.copy()

# drop rows with missing values
update_score(data_train.dropna(), 'drop rows')

# replace missing values with zero
update_score(data_train.fillna(0), 'replace with zero')

# replace missing values with mean
update_score(data_train.fillna(data_train.mean()), 'replace with mean')

# replace missing values with median
update_score(data_train.fillna(data_train.median()), 'replace with median')

# replace missing values with min
update_score(data_train.fillna(data_train.min()), 'replace with min')

# replace missing values with max
update_score(data_train.fillna(data_train.max()), 'replace with max')

filled_train_data = best_train_data.copy()
mcps.append(best_method)

In [13]:
# Outliers
init_score()
data_train = best_train_data.copy()

# no outlier handling
update_score(data_train, 'no outlier handling')

# remove outliers with Interquartile Range
q25_train = np.percentile(data_train[target_column], 25)
q75_train = np.percentile(data_train[target_column], 75)
iqr_train = data_train.drop(data_train[(data_train[target_column] < q25_train) | 
                                       (data_train[target_column] > q75_train)].index, axis=0)
update_score(iqr_train, 'Interquartile Range')

# remove outliers with BaggedLOF
lof = LocalOutlierFactor(n_neighbors=lof_n_neighbors, contamination=0.1)
outlier_train_pred = lof.fit_predict(data_train)
lof_train = data_train.drop(data_train[outlier_train_pred == -1].index, axis=0)
update_score(lof_train, 'BaggedLOF')

outliers_handled_train_data = best_train_data.copy()
mcps.append(best_method)

In [14]:
# Transformation
init_score()
data_train = best_train_data.copy()

# no transformation
update_score(data_train, 'no transformation')

# normalization
normalizer = preprocessing.Normalizer(norm='l2')
X_train = data_train.drop([target_column], axis=1)
y_train = data_train[target_column]
normalized_X_train = pd.DataFrame(data=normalizer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
normalized_train = pd.concat([normalized_X_train, y_train], axis=1)
update_score(normalized_train, 'normalization')

# standardize
standardizer = preprocessing.StandardScaler()
X_train = data_train.drop([target_column], axis=1)
y_train = data_train[target_column]
standardized_X_train = pd.DataFrame(data=standardizer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
standardized_train = pd.concat([standardized_X_train, y_train], axis=1)
update_score(standardized_train, 'standardization')

transformed_train_data = best_train_data.copy()
mcps.append(best_method)

In [15]:
# Dimensionality Reduction
init_score()
data_train = best_train_data.copy()

# no Dimensionality Reduction
update_score(data_train, 'no reduction')

# PCA
if 'standardization' in mcps or 'normalization' in mcps:
    pca = PCA(n_components=pca_n_components)
    X_train = data_train.drop([target_column], axis=1)
    y_train = data_train[target_column]
    pca_X_train = pd.DataFrame(data=pca.fit_transform(X_train), index=X_train.index)
    pca_train = pd.concat([pca_X_train, y_train], axis=1)
    update_score(pca_train, 'pca')
else:
    normalizer = preprocessing.Normalizer(norm='l2')
    X_train = data_train.drop([target_column], axis=1)
    y_train = data_train[target_column]
    normalized_X_train = pd.DataFrame(data=normalizer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    pca = PCA(n_components=pca_n_components)
    pca_X_train = pd.DataFrame(data=pca.fit_transform(normalized_X_train), index=normalized_X_train.index)
    pca_train = pd.concat([pca_X_train, y_train], axis=1)
    update_score(pca_train, 'pca')
    
    
# RandomSubset
X_train = data_train.drop([target_column], axis=1)
y_train = data_train[target_column]
random_X_train = X_train.sample(n_random_subset, axis=1)
random_train = pd.concat([random_X_train, y_train], axis=1)
update_score(random_train, 'random subset')

reduced_train_data = best_train_data.copy()
mcps.append(best_method)

In [16]:
# Sampling
init_score()
data_train = best_train_data.copy()

# no sampling
update_score(data_train, 'no sampling')

# resampling
resampled_train = resample(data_train, random_state=0)
#update_score(resampled_train, 'resampling')

sampled_train_data = best_train_data.copy()
mcps.append(best_method)

In [17]:
mcps

['drop rows', 'BaggedLOF', 'standardization', 'no reduction', 'no sampling']

In [18]:
training_score = best_score
training_score

0.65844556082184735

In [19]:
all_scores

{'BaggedLOF': 0.65654237400252369,
 'Interquartile Range': 0.20179587793229831,
 'drop rows': 0.55010413872021025,
 'no outlier handling': 0.55010413872021025,
 'no reduction': 0.65844556082184735,
 'no sampling': 0.65844556082184735,
 'no transformation': 0.65654237400252369,
 'normalization': 0.39209854369079211,
 'pca': 0.2670175646113101,
 'random subset': -0.17293456787528497,
 'replace with max': -0.28301151717654333,
 'replace with mean': 0.082972890387454201,
 'replace with median': 0.012429201119353939,
 'replace with min': -0.26632788118824335,
 'replace with zero': -0.30557042870737516,
 'standardization': 0.65844556082184735}

Application of the preprocessed dataset

In [20]:
def make_prediction(dataset_train, dataset_test):
    X_train, y_train = getXy(dataset_train)
    X_test, y_test = getXy(dataset_test)
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    pred = regressor.predict(X_test)
    return metrics.r2_score(y_test, pred)

In [21]:
data_test.dropna(inplace=True)
if 'standardization' in mcps:
    standardizer = preprocessing.StandardScaler()
    X_test = data_test.drop([target_column], axis=1)
    y_test = data_test[target_column]
    standardized_X_test = pd.DataFrame(standardizer.fit_transform(X_test), columns=X_test.columns, index=X_test.index)
    data_test = pd.concat([standardized_X_test, y_test], axis=1)
if 'normalization' in mcps:
    normalizer = preprocessing.Normalizer(norm='l2')
    X_test = data_test.drop([target_column], axis=1)
    y_test = data_test[target_column]
    normalized_X_test = pd.DataFrame(normalizer.fit_transform(X_test), columns=X_test.columns, index=X_test.index)
    data_test = pd.concat([normalized_X_test, y_test], axis=1)
if 'random subset' in mcps:
    X_test = data_test.drop([target_column], axis=1)
    y_test = data_test[target_column]
    random_X_test = X_test[reduced_train_data.columns.drop([target_column])]
    data_test = pd.concat([random_X_test, y_test], axis=1)
if 'pca' in mcps:
    if 'standardization' in mcps or 'normalization' in mcps:
        pca = PCA(n_components=pca_n_components)
        X_test = data_test.drop([target_column], axis=1)
        y_test = data_test[target_column]
        principal_components_test = pca.fit_transform(X_test)
        pca_X_test = pd.DataFrame(data=principal_components_test)
        data_test = pd.concat([pca_X_test, y_test], axis=1)
    else:
        normalizer = preprocessing.Normalizer(norm='l2')
        normalized_test = pd.DataFrame(normalizer.fit_transform(data_test), columns=data_test.columns)
        X_normalized_test = normalized_test.drop([target_column], axis=1)
        y_normalized_test = normalized_test[target_column]
        pca = PCA(n_components=pca_n_components)
        principal_components_test = pca.fit_transform(X_normalized_test)
        pca_X_test = pd.DataFrame(data=principal_components_test)
        data_test = pd.concat([pca_X_test, y_normalized_test], axis=1)

In [22]:
test_score = make_prediction(best_train_data, data_test)
test_score

0.60501118978640744