#### Lab1：Regression

In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

# import sklearn's metrics
from sklearn.metrics import mean_squared_error, r2_score
# define a function for adjusted r2_score
def adj_r2_score(r2, n, k):
    return 1 - (1 - r2) * (n - 1) / (n - k)

from IPython.display import display
import pandas as pd
from sklearn.preprocessing import StandardScaler
def load_data(train_file, test_file, is_normalize=True):
    FILE_DIR = "./dataset/"
    # load the data
    train_file = pd.read_csv(FILE_DIR + train_file)
    test_file = pd.read_csv(FILE_DIR + test_file)
    # extract the house_price column for y
    train_y = train_file["house_value"]
    test_y = test_file["house_value"]
    # drop the house_price column for X
    train_file.drop("house_value", axis=1, inplace=True)
    test_file.drop("house_value", axis=1, inplace=True)
    # in any case, return it as numpy array
    train_X = train_file.to_numpy()
    test_X = test_file.to_numpy()
    train_y = train_y.to_numpy()
    test_y = test_y.to_numpy()
    if is_normalize:
        # normalize the data with sklearn scaler
        scaler = StandardScaler()
        train_file = scaler.fit_transform(train_X)
        test_file = scaler.transform(test_X)

    return train_X, train_y, test_X, test_y

X, y, test_X, test_y = load_data("train_set.csv", "test_set.csv", True)
# print the size of X, y, test_X, test_y
print("X shape: ", X.shape)
print("y shape: ", y.shape)
print("test_X shape: ", test_X.shape)
print("test_y shape: ", test_y.shape)
dataset_whole = pd.read_csv("./dataset/data_set.csv")
dataset_whole.describe()

X shape:  (15419, 12)
y shape:  (15419,)
test_X shape:  (3949, 12)
test_y shape:  (3949,)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Unnamed: 0,median_income,house_age,avg_rooms,avg_bedrooms,block_population,avg_occupancy,latitude,longitude,house_value,diag_coord,bed_per_rooms,rooms_per_occup,pop_per_occup
count,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0,19368.0
mean,3.673811,28.504337,5.231582,1.068762,1434.512443,2.947061,35.640001,-119.56748,1.924389,-83.927479,0.214865,1.882022,501.111266
std,1.551153,12.47344,1.324958,0.173759,1055.291048,0.975075,2.143433,2.003215,0.970605,0.79897,0.056603,0.62185,366.403133
min,0.4999,1.0,0.846154,0.333333,5.0,1.089286,32.54,-124.35,0.14999,-85.87,0.1,0.10441,2.0
25%,2.52925,18.0,4.408579,1.005401,805.0,2.45,33.93,-121.77,1.167,-84.41,0.177625,1.496966,284.0
50%,3.45115,29.0,5.170473,1.047675,1185.0,2.840974,34.26,-118.49,1.741,-84.17,0.204537,1.907405,414.0
75%,4.5833,37.0,5.945098,1.096985,1749.0,3.304772,37.72,-118.0,2.484,-83.51,0.240879,2.237978,607.0
max,12.5,52.0,37.063492,8.207547,28566.0,63.75,41.95,-114.55,5.0,-78.29,1.0,17.353982,6082.0


In [2]:
def train_test(model_name, model, X, y, test_X, test_y):
    # train the model
    model.fit(X, y)
    # calculate train set metrics
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    adj_r2 = adj_r2_score(r2, X.shape[0], X.shape[1])
    # round the metrics to 4 decimal places
    mse = round(mse, 4)
    r2 = round(r2, 4)
    adj_r2 = round(adj_r2, 4)
    train_set_metrics = [mse, r2, adj_r2]
    # predict the model
    y_pred = model.predict(test_X)
    # calculate the metrics
    mse = mean_squared_error(test_y, y_pred)
    r2 = r2_score(test_y, y_pred)
    adj_r2 = adj_r2_score(r2, test_X.shape[0], test_X.shape[1])
        # round the metrics to 4 decimal places
    mse = round(mse, 4)
    r2 = round(r2, 4)
    adj_r2 = round(adj_r2, 4)
    test_set_metrics = [mse, r2, adj_r2]
    metrics = {
        "train_set": train_set_metrics,
        "test_set": test_set_metrics
    }
    return metrics

### Linear Models

- Linear regression
- Lasso regression (cross validation)
- Ridge regression (cross validation)

In [3]:
# import sklearn's linear models
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, Lasso

LinearRegression_model = LinearRegression()
LinearRegression_metrics = train_test("LinearRegression", LinearRegression_model, X, y, test_X, test_y)
display("Linear Regression", LinearRegression_metrics)

#SGDRegression_model = SGDRegressor(penalty=None, random_state=42)
#SGDRegression_metrics = train_test("SGDRegression", SGDRegression_model, X, y, test_X, test_y)
#display("SGDRegression_metrics: ", SGDRegression_metrics)

Lasso_model = Lasso(random_state=42)
Lasso_metrics = train_test("Lasso", Lasso_model, X, y, test_X, test_y)
display("Lasso_metrics: ", Lasso_metrics)

LassoCV_model = LassoCV(random_state=42)
LassoCV_metrics = train_test("LassoCV", LassoCV_model, X, y, test_X, test_y)
display("LassoCV_metrics: ", LassoCV_metrics)

RidgeCV_model = RidgeCV()
RidgeCV_metrics = train_test("RidgeCV", RidgeCV_model, X, y, test_X, test_y)
display("RidgeCV_metrics: ", RidgeCV_metrics)


'Linear Regression'

{'train_set': [0.3271, 0.6517, 0.6514], 'test_set': [0.3588, 0.6238, 0.6227]}

'Lasso_metrics: '

{'train_set': [0.8904, 0.0519, 0.0512], 'test_set': [0.9154, 0.04, 0.0373]}

'LassoCV_metrics: '

{'train_set': [0.3571, 0.6198, 0.6195], 'test_set': [0.3872, 0.5939, 0.5928]}

'RidgeCV_metrics: '

{'train_set': [0.3271, 0.6517, 0.6514], 'test_set': [0.3587, 0.6238, 0.6228]}

### Feature Selection Linear Model

- Lars (cross validation)

In [4]:
# import sklearn's linear model with feature selection
from sklearn.linear_model import LarsCV

LarsCV_model = LarsCV()
LarsCV_metrics = train_test("LarsCV", LarsCV_model, X, y, test_X, test_y)
display("LarsCV_metrics: ", LarsCV_metrics)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LarsCV())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




'LarsCV_metrics: '

{'train_set': [0.3271, 0.6517, 0.6514], 'test_set': [0.3588, 0.6238, 0.6227]}

### Bayesian regression

- ARD Regression

In [5]:
# import sklearn's Bayesian regression
from sklearn.linear_model import ARDRegression

ARDRegression_model = ARDRegression()
ARDRegression_metrics = train_test("ARDRegression", ARDRegression_model, X, y, test_X, test_y)
display("ARDRegression_metrics: ", ARDRegression_metrics)

'ARDRegression_metrics: '

{'train_set': [0.3406, 0.6373, 0.637], 'test_set': [0.3674, 0.6147, 0.6136]}

### Non-linear models

- SVR
- KNN
- Decision Tree

In [6]:
# import sklearn's support vector regression
from sklearn.svm import SVR, LinearSVR

SVR_model = SVR()
SVR_metrics = train_test("SVR", SVR_model, X, y, test_X, test_y)
display("SVR_metrics: ", SVR_metrics)


'SVR_metrics: '

{'train_set': [0.8613, 0.0828, 0.0821], 'test_set': [0.8816, 0.0755, 0.0729]}

In [9]:
# import sklearn's support vector regression
from sklearn.svm import SVR, LinearSVR

SVR_model = LinearSVR(max_iter=1000000, random_state=42)
SVR_metrics = train_test("SVR", SVR_model, X, y, test_X, test_y)
display("SVR_metrics: ", SVR_metrics)



'SVR_metrics: '

{'train_set': [0.4224, 0.5502, 0.5499], 'test_set': [0.4758, 0.501, 0.4996]}

In [12]:
# import sklearn's nearest neighbor regression
from sklearn.neighbors import KNeighborsRegressor

KNeighborsRegressor_model = KNeighborsRegressor(n_neighbors=100)
KNeighborsRegressor_metrics = train_test("KNeighborsRegressor", KNeighborsRegressor_model, X, y, test_X, test_y)
display("KNeighborsRegressor_metrics: ", KNeighborsRegressor_metrics)

'KNeighborsRegressor_metrics: '

{'train_set': [0.8386, 0.107, 0.1063], 'test_set': [0.8744, 0.083, 0.0805]}

In [13]:
# import sklearn's decision tree regression
from sklearn.tree import DecisionTreeRegressor

DecisionTreeRegressor_model = DecisionTreeRegressor(random_state=42)
DecisionTreeRegressor_metrics = train_test("DecisionTreeRegressor", DecisionTreeRegressor_model, X, y, test_X, test_y)
display("DecisionTreeRegressor_metrics: ", DecisionTreeRegressor_metrics)

'DecisionTreeRegressor_metrics: '

{'train_set': [0.0, 1.0, 1.0], 'test_set': [0.3764, 0.6053, 0.6042]}

### Ensemble model

- GBDT

In [14]:
# import sklearn's ensemble regression
from sklearn.ensemble import GradientBoostingRegressor

GradientBoostingRegressor_model = GradientBoostingRegressor(random_state=42)
GradientBoostingRegressor_metrics = train_test("GradientBoostingRegressor", GradientBoostingRegressor_model, X, y, test_X, test_y)
display("GradientBoostingRegressor_metrics: ", GradientBoostingRegressor_metrics)

'GradientBoostingRegressor_metrics: '

{'train_set': [0.1907, 0.7969, 0.7968], 'test_set': [0.2135, 0.7761, 0.7754]}

### Neural Network

- MLP

In [40]:
# import sklearn's neural network regression
from sklearn.neural_network import MLPRegressor

MLPRegressor_model = MLPRegressor(learning_rate_init=0.08, random_state=42, hidden_layer_sizes=(16,), max_iter=50000, learning_rate='adaptive')
MLPRegressor_metrics = train_test("MLPRegressor", MLPRegressor_model, X, y, test_X, test_y)
display("MLPRegressor_metrics: ", MLPRegressor_metrics)


'MLPRegressor_metrics: '

{'train_set': [0.4148, 0.5583, 0.558], 'test_set': [0.4732, 0.5038, 0.5024]}