In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

# Shared seed handed to the `random_state` arguments further down
# (e.g. PCA and train_test_split) so that reruns use the same projection and split.
RANDOM_SEED = 42

In [2]:
# The data folder is a sibling of the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

d:\machine-learning\ames\data


In [3]:
# Location of the pickled dataset (`ames_clean.pkl`) inside <DATA_DIR>/processed.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')

In [4]:
# Deserialize the pickled object into `data`.
# Unpickling can execute arbitrary code, so only point this at a trusted file.
with clean_data_path.open(mode='rb') as file:
    data = pickle.load(file)

In [74]:
# Target column first, then every remaining column as the feature matrix.
y = data['SalePrice']
X = data.drop(columns='SalePrice')

In [81]:
# Building blocks for a preprocessing pipeline that scales the data and adds
# polynomial features for the numerical variables only.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

# Numerical columns, selected by dtype.
num_attribs = X.select_dtypes(include=[np.number]).columns.tolist()
# Every column that is not numerical is treated as categorical.
cat_attribs = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Cast the categorical columns to strings
# (presumably so the encoder sees one uniform type per column).
X[cat_attribs] = X[cat_attribs].astype(str)

In [82]:

# Numerical branch: standardize, then add degree-2 polynomial terms (no bias column).
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
])

# Categorical branch: one-hot encode, dropping the first level of every feature.
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first')),
])

# sparse_threshold=0 forces a dense ndarray. With the default (0.3) the result type
# depends on the density of the stacked output and may be a scipy sparse matrix,
# which not every downstream estimator accepts.
full_pipeline = ColumnTransformer(
    [
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ],
    sparse_threshold=0,
)

# NOTE(review): the transformer is fitted on every row of X, while the train/test
# split only happens in a later cell. Scaler statistics and encoder categories are
# therefore learned from rows that end up in the test set. Consider splitting first
# and fitting on the training rows only.
X_trans = full_pipeline.fit_transform(X)

In [83]:
# One-hot encode the categorical variables with pandas, dropping the first level of each.
# NOTE(review): this rebinds X, so rerunning earlier cells afterwards operates on the
# encoded frame instead of the original feature matrix.
X = pd.get_dummies(data=X, drop_first=True)

In [84]:
# Apply PCA to the preprocessed matrix, keeping enough components for 90% of the variance.
from sklearn.decomposition import PCA

pca = PCA(random_state=RANDOM_SEED, n_components=0.9)

X_pca = pca.fit_transform(X_trans)

In [85]:
# (rows, retained components) after the PCA projection
X_pca.shape

(2877, 54)

In [86]:
# Train/test split on the PCA features: 20% of the rows are held out for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [87]:
# 5-fold cross-validated RMSE of a gradient boosting regressor on the training split.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Seeded with the notebook-wide seed so repeated runs reproduce the same scores.
model = GradientBoostingRegressor(random_state=RANDOM_SEED)

# The scorer returns the negated MSE, hence the sign flip before the square root.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Per-fold RMSE
np.sqrt(-scores)

array([0.08520233, 0.0745052 , 0.08010973, 0.07285055, 0.07563516])

In [88]:
# Fit the model on the whole training split.
model.fit(X_train, y_train)

In [90]:
# Predictions for the held-out test rows.
y_pred = model.predict(X_test)

In [91]:
# Root mean squared error on the held-out test split.
from sklearn.metrics import mean_squared_error

RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

RMSE

0.07777646330022847

In [93]:
# Express the RMSE as a percentage via 10**RMSE.
# NOTE(review): this reading presumes SalePrice is stored as log10(price) — confirm.
error_percent = (10**RMSE - 1) * 100
print(f'Average error is {error_percent:.2f}%')

Average error is 19.61%


# Now without PCA (one-hot encoded features, random forest instead of gradient boosting)

In [58]:
# Same split settings as before, applied to X directly instead of the PCA projection
# (X is the get_dummies-encoded frame when the cells are run top to bottom).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [59]:
# 5-fold cross-validated RMSE of a random forest on the training split.
# The forest draws bootstrap samples, so it is seeded with the notebook-wide seed
# to keep the scores (and the test error reported below) reproducible.
model = RandomForestRegressor(random_state=RANDOM_SEED)

# The scorer returns the negated MSE, hence the sign flip before the square root.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Per-fold RMSE
np.sqrt(-scores)

array([0.06760563, 0.05845945, 0.06349053, 0.05321552, 0.06113474])

In [60]:
# Fit the model on the whole training split.
model.fit(X_train, y_train)

In [61]:
# Predictions for the held-out test rows.
y_pred = model.predict(X_test)

In [63]:
# Test-set RMSE, then the same value expressed as a percentage via 10**RMSE.
# NOTE(review): the percentage presumes SalePrice is stored as log10(price) — confirm.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

error_percent = (10**RMSE - 1) * 100
print(f'Average error is {error_percent:.2f}%')

Average error is 14.47%
