In [3]:
# load and summarize the dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True)
# Convert the combined frame into a plain NumPy array.
data = df.frame.to_numpy()

# Split into input and output elements: every column except the last is
# used as an input, the last column is used as the output.
X = data[:, :-1]
y = data[:, -1]
# Summarize the shape of the dataset.
print(X.shape, y.shape)
# Hold out 33% of the rows as the test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)
# Summarize the shape of the train and test sets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(20640, 8) (20640,)
(13828, 8) (6812, 8) (13828,) (6812,)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

data = df.frame.to_numpy()

# Split into input and output elements (the last column is the output).
X = data[:, :-1]
y = data[:, -1]
# Same 67/33 split and seed as used elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
# Baseline: fit a linear regression on the full, unfiltered training set.
model = LinearRegression().fit(X_train, y_train)
# Predict on the held-out rows and score with mean absolute error.
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 0.534


In [15]:
# Removing outliers from the training dataset.

from sklearn.neighbors import LocalOutlierFactor

# Split into input and output elements (the last column is the output).
X = data[:, :-1]
y = data[:, -1]
# Rebuild the train/test split so this cell starts from the full,
# unfiltered training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
# Summarize the shape of the training dataset.
print(X_train.shape, y_train.shape)
# Label each training row with LocalOutlierFactor (default settings).
# The following cell treats rows labelled -1 as outliers.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
yhat

(13828, 8) (13828,)


array([1, 1, 1, ..., 1, 1, 1])

In [16]:

# Boolean mask over the labels in yhat: True for every row whose label is
# not the outlier marker (-1), i.e. the rows to keep.
mask = ~(yhat == -1)
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [17]:
# Keep only the training rows flagged as inliers by `mask`.
#
# The filtered arrays get their own names so that X_train / y_train are left
# untouched. Previously this cell reassigned X_train and y_train, which made
# it non-idempotent: running it a second time applied the full-length mask to
# the already-shortened arrays and failed.
if mask.shape[0] != X_train.shape[0]:
    # This happens when `mask` is stale, e.g. it was recomputed after `yhat`
    # had been overwritten with test-set predictions further down this cell.
    raise ValueError(
        'mask has %d entries but X_train has %d rows; '
        're-run the outlier detection cells before this one'
        % (mask.shape[0], X_train.shape[0])
    )
X_train_inliers, y_train_inliers = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train_inliers.shape, y_train_inliers.shape)
# fit the model on the inlier rows only
model = LinearRegression()
model.fit(X_train_inliers, y_train_inliers)
# evaluate the model on the (unfiltered) test set
# NOTE: `yhat` is reused here for predictions; before this line it held the
# outlier labels that `mask` was derived from.
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(13733, 8) (13733,)
MAE: 0.524
