In [1]:
import sys
sys.path.append("../src")

In [2]:
import pandas as pd

In [None]:
from feature_engineering import FeatureSelection

In [4]:
df = pd.read_csv("../data/final/final_dataset.csv")

# import Algorithms / K-Fold

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost

In [6]:
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# Feature Selection

In [7]:
fs = FeatureSelection(df, target="price")
fs.filter_by_correlation()
selected_features = fs.get_selected_features()

In [28]:
selected_features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'floors',
 'view',
 'grade',
 'sqft_above',
 'sqft_basement',
 'lat',
 'sqft_living15',
 'bath_per_bed']

In [8]:
x = df[selected_features]
y = df['price']

# Splits

In [9]:
from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# LinearRegression

In [11]:
lr = LinearRegression()

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)
lr_scores = cross_val_score(lr, x, y, cv=kf, scoring='r2')

print(f'linear regression score: {lr_score}')
print(f'linear regression mae: {lr_mae}')
print('--------------------------------------')
print("K-Fold mean:", lr_scores.mean())
print("K-Fold std:", lr_scores.std())
print("Selected features:", list(selected_features))

linear regression score: 0.7301693342555966
linear regression mae: 0.21268748393159562
--------------------------------------
K-Fold mean: 0.728613798934763
K-Fold std: 0.004136340661863084
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# Lasso

In [12]:
lasso = Lasso(alpha=0.001)

lasso.fit(x_train, y_train)

y_pred = lasso.predict(x_test)

lasso_score = r2_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)
lasso_scores = cross_val_score(lasso, x, y, cv=kf, scoring='r2')

print(f'Lasso regression score: {lasso_score}')
print(f'Lasso regression mae: {lasso_mae}')
print('--------------------------------------')
print("K-Fold mean:", lasso_scores.mean())
print("K-Fold std:", lasso_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7302675797069346
Lasso regression mae: 0.21321093423425927
--------------------------------------
K-Fold mean: 0.7277262389294289
K-Fold std: 0.00392180139710801
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# Ridge

In [13]:
ridge = Ridge(alpha=0.1)

ridge.fit(x_train, y_train)

y_pred = ridge.predict(x_test)

ridge_score = r2_score(y_test, y_pred)
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_scores = cross_val_score(ridge, x, y, cv=kf, scoring='r2')

print(f'Ridge regression score: {ridge_score}')
print(f'Ridge regression mae: {ridge_mae}')
print('--------------------------------------')
print("K-Fold mean:", ridge_scores.mean())
print("K-Fold std:", ridge_scores.std())
print("Selected features:", list(selected_features))

Ridge regression score: 0.7301555721863293
Ridge regression mae: 0.21269432496695376
--------------------------------------
K-Fold mean: 0.728618000584331
K-Fold std: 0.004132989292379794
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# ElasticNet

In [14]:
elastic = ElasticNet(alpha=0.0001)

elastic.fit(x_train, y_train)

y_pred = elastic.predict(x_test)

elastic_score = r2_score(y_test, y_pred)
elastic_mae = mean_absolute_error(y_test, y_pred)
elastic_scores = cross_val_score(elastic, x, y, cv=kf, scoring='r2')

print(f'ElasticNet regression score: {elastic_score}')
print(f'ElasticNet regression mae: {elastic_mae}')
print('--------------------------------------')
print("K-Fold mean:", elastic_scores.mean())
print("K-Fold std:", elastic_scores.std())
print("Selected features:", list(selected_features))

ElasticNet regression score: 0.7299917815889386
ElasticNet regression mae: 0.21279327957484673
--------------------------------------
K-Fold mean: 0.7286025134243854
K-Fold std: 0.0040747242799721315
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# Decision Tree

In [15]:
dt = DecisionTreeRegressor(random_state=42)

dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)
dt_scores = cross_val_score(dt, x, y, cv=kf, scoring='r2')

print(f'DecisionTree regression score: {dt_score}')
print(f'DecisionTree regression mae: {dt_mae}')
print('--------------------------------------')
print("K-Fold mean:", dt_scores.mean())
print("K-Fold std:", dt_scores.std())

DecisionTree regression score: 0.7005376992110801
DecisionTree regression mae: 0.21376610028292217
--------------------------------------
K-Fold mean: 0.6879585748201944
K-Fold std: 0.012111751163723589


# Random Forest

In [16]:
rf = RandomForestRegressor(random_state=42, n_estimators=200)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)
rf_scores = cross_val_score(rf, x, y, cv=kf, scoring='r2')

print(f'RandomForest regression score: {rf_score}')
print(f'RandomForest regression mae: {rf_mae}')
print('--------------------------------------')
print("K-Fold mean:", rf_scores.mean())
print("K-Fold std:", rf_scores.std())

RandomForest regression score: 0.8395371569651913
RandomForest regression mae: 0.15495356568369276
--------------------------------------
K-Fold mean: 0.8364134771748066
K-Fold std: 0.004347968327639967


# Gradient Boosting

In [17]:
gb = GradientBoostingRegressor(random_state=42, learning_rate=0.1, max_depth=5)

gb.fit(x_train, y_train)

y_pred = gb.predict(x_test)

gb_score = r2_score(y_test, y_pred)
gb_mae = mean_absolute_error(y_test, y_pred)
gb_scores = cross_val_score(gb, x, y, cv=kf, scoring='r2')

print(f'Gradient Boosting regression score: {gb_score}')
print(f'Gradient Boosting regression mae: {gb_mae}')
print('--------------------------------------')
print("K-Fold mean:", gb_scores.mean())
print("K-Fold std:", gb_scores.std())

Gradient Boosting regression score: 0.8431809155480193
Gradient Boosting regression mae: 0.15562328972915015
--------------------------------------
K-Fold mean: 0.8442008997676991
K-Fold std: 0.0026080919372961377


# Extra trees

In [18]:
et = ExtraTreesRegressor(random_state=42)

et.fit(x_train, y_train)

y_pred = et.predict(x_test)

et_score = r2_score(y_test, y_pred)
et_mae = mean_absolute_error(y_test, y_pred)
et_scores = cross_val_score(et, x, y, cv=kf, scoring='r2')

print(f'Extra Trees regression score: {et_score}')
print(f'Extra Trees regression mae: {et_mae}')
print('--------------------------------------')
print("K-Fold mean:", et_scores.mean())
print("K-Fold std:", et_scores.std())

Extra Trees regression score: 0.8350939833751782
Extra Trees regression mae: 0.15538686090139103
--------------------------------------
K-Fold mean: 0.8324133089550003
K-Fold std: 0.003920504702615041


# Hist Gradient Boosting

In [19]:
hgb = HistGradientBoostingRegressor(random_state=42) # 200 ta daraht

hgb.fit(x_train, y_train)
y_pred = hgb.predict(x_test)

hgb_score = r2_score(y_test, y_pred)
hgb_mae = mean_absolute_error(y_test, y_pred)
hgb_scores = cross_val_score(hgb, x, y, cv=kf, scoring='r2')

print(f'Hist Gradient Boosting score: {hgb_score}')
print(f'Hist Gradient Boosting mae: {hgb_mae}')
print('--------------------------------------')
print("K-Fold mean:", hgb_scores.mean())
print("K-Fold std:", hgb_scores.std())

Hist Gradient Boosting score: 0.8460028492674364
Hist Gradient Boosting mae: 0.15387987211575135
--------------------------------------
K-Fold mean: 0.8463043736434495
K-Fold std: 0.002766600725330245


# SVM

In [20]:
svm = SVR(kernel='rbf', C=20.0)

svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

svm_score = r2_score(y_test, y_pred)
svm_mae = mean_absolute_error(y_test, y_pred)
svm_scores = cross_val_score(svm, x, y, cv=kf, scoring='r2')

print(f'CVM score: {svm_score}')
print(f'CVM mae: {svm_mae}')
print('--------------------------------------')
print("K-Fold mean:", svm_scores.mean())
print("K-Fold std:", svm_scores.std())

CVM score: 0.8296088436511094
CVM mae: 0.16095221480095925
--------------------------------------
K-Fold mean: 0.8312504419510625
K-Fold std: 0.002960753579104105


In [21]:
# kernel='linear' bulganda result:

'''
CVM score: 0.7289856972461891
CVM mae: 0.21224108471243086
--------------------------------------
K-Fold mean: 0.7276188699009725
K-Fold std: 0.003959560032352466

'''

'\nCVM score: 0.7289856972461891\nCVM mae: 0.21224108471243086\n--------------------------------------\nK-Fold mean: 0.7276188699009725\nK-Fold std: 0.003959560032352466\n\n'

# KNN

In [22]:
knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)
knn_scores = cross_val_score(knn, x, y, cv=kf, scoring='r2')

print(f'KNN score: {knn_score}')
print(f'KNN mae: {knn_mae}')
print('--------------------------------------')
print("K-Fold mean:", knn_scores.mean())
print("K-Fold std:", knn_scores.std())

KNN score: 0.8158230831988531
KNN mae: 0.1662507092814746
--------------------------------------
K-Fold mean: 0.8128531249455697
K-Fold std: 0.00339300547690263


# XGBoost

In [23]:
xgb = xgboost.XGBRegressor()

xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)

xgb_score = r2_score(y_test, y_pred)
xgb_mae = mean_absolute_error(y_test, y_pred)
xgb_scores = cross_val_score(xgb, x, y, cv=kf, scoring='r2')

print(f'Lasso regression score: {xgb_score}')
print(f'Lasso regression mae: {xgb_mae}')
print('--------------------------------------')
print("K-Fold mean:", xgb_scores.mean())
print("K-Fold std:", xgb_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.8376814809620308
Lasso regression mae: 0.15694966700300658
--------------------------------------
K-Fold mean: 0.8385877943204555
K-Fold std: 0.0023073048182251812
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# Adaboost

In [24]:
ab = AdaBoostRegressor()

ab.fit(x_train, y_train)
y_pred = ab.predict(x_test)

ab_score = r2_score(y_test, y_pred)
ab_mae = mean_absolute_error(y_test, y_pred)
ab_scores = cross_val_score(ab, x, y, cv=kf, scoring='r2')

print(f'Lasso regression score: {ab_score}')
print(f'Lasso regression mae: {ab_mae}')
print('--------------------------------------')
print("K-Fold mean:", ab_scores.mean())
print("K-Fold std:", ab_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7422633256159186
Lasso regression mae: 0.213167934410372
--------------------------------------
K-Fold mean: 0.7378818622124161
K-Fold std: 0.007441284887468434
Selected features: ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'sqft_living15', 'bath_per_bed']


# Tabulate

In [None]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Lasso', lasso_score, lasso_mae],
    ['Ridge', ridge_score, ridge_mae],
    ['ElasticNet', elastic_score, elastic_score],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['Gradient Boosting', gb_score, gb_mae],
    ['Extra Trees', et_score, et_mae],
    ['Hist Gradient Boosting', hgb_score, hgb_mae],
    ['SVM', svm_score, svm_mae],
    ['KNN', knn_score, knn_mae],
    ['XGBoost', xgb_score, xgb_mae],
    ['AdoBoost', ab_score, ab_mae],
]

headers = ['Algorithm', 'r2_score', 'mean_absolute_error']

best_model = max(result, key=lambda x: x[1])
worst_model = min(result, key=lambda x: x[1])

green = "\033[92m"
red = "\x1b[31m"
reset = "\033[0m"

for row in result:
    if row == best_model:
        row[:] = [green + str(i) + reset for i in row]
    elif row == worst_model:
        row[:] = [red + str(i) + reset for i in row]

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.10f')

print(table)

+------------------------+--------------+-----------------------+
| Algorithm              |     r2_score |   mean_absolute_error |
| Linear Regression      | 0.7301693343 |          0.2126874839 |
+------------------------+--------------+-----------------------+
| Lasso                  | 0.7302675797 |          0.2132109342 |
+------------------------+--------------+-----------------------+
| Ridge                  | 0.7301555722 |          0.2126943250 |
+------------------------+--------------+-----------------------+
| ElasticNet             | 0.7299917816 |          0.7299917816 |
+------------------------+--------------+-----------------------+
| [31mDecision Tree[0m          | [31m0.7005376992[0m |          [31m0.2137661003[0m |
+------------------------+--------------+-----------------------+
| Random Forest          | 0.8395371570 |          0.1549535657 |
+------------------------+--------------+-----------------------+
| Gradient Boosting      | 0.8431809155 |        

In [None]:
'''
without feature selection
+-------------------------------+------------+-----------------------+---------------+--------------+
| Algorithm                     |   r2_score |   mean_absolute_error |   K-Fold Mean |   K-Fold Std |
+===============================+============+=======================+===============+==============+
| LinearRegression              |   0.778616 |              0.192870 |      0.777548 |     0.004691 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| Lasso                         |   0.771071 |              0.195671 |      0.770952 |     0.003623 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| Ridge                         |   0.778267 |              0.193038 |      0.777024 |     0.004473 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| ElasticNet                    |   0.777794 |              0.193305 |      0.776612 |     0.004442 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| DecisionTreeRegressor         |   0.779006 |              0.179849 |      0.769882 |     0.012785 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| RandomForestRegressor         |   0.890724 |              0.125577 |      0.889131 |     0.003517 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| GradientBoostingRegressor     |   0.903773 |              0.120866 |      0.899562 |     0.002871 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| ExtraTreesRegressor           |   0.893124 |              0.123205 |      0.890864 |     0.002751 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| HistGradientBoostingRegressor |   0.906147 |              0.118156 |      0.903152 |     0.002346 |   -> best model
+-------------------------------+------------+-----------------------+---------------+--------------+
| SVR                           |   0.881330 |              0.132474 |      0.876161 |     0.004658 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| KNeighborsRegressor           |   0.787709 |              0.178295 |      0.787801 |     0.002358 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| XGBRegressor                  |   0.904740 |              0.118965 |      0.899465 |     0.003641 |
+-------------------------------+------------+-----------------------+---------------+--------------+
| AdaBoostRegressor             |   0.765273 |              0.202334 |      0.765347 |     0.007268 |   -> worst model
+-------------------------------+------------+-----------------------+---------------+--------------+

'''

