In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

import warnings     # filter warning messages
warnings.simplefilter(action="ignore")


In [None]:
# Read and preprocess the data.
# Each file has its index-like / identifier columns dropped; the drop lists
# differ because they are applied to different files.
house_data_train = pd.read_csv('train.csv').drop(columns=['Unnamed: 0', 'zipcode'])
house_data_test = pd.read_csv('test.csv').drop(columns=['Unnamed: 0', 'id', 'date', 'zipcode'])

# Work on float64 copies of the original data.
# Casting up front matters: the assignments below write floating-point values
# back through .iloc, and doing that into integer-typed columns relies on
# pandas silently upcasting the column (deprecated since pandas 2.1; the
# warning is hidden here by the global warnings filter).
house_data_train_nz = house_data_train.astype('float64')
house_data_test_nz = house_data_test.astype('float64')

# Normalize all columns except the first one.
# The scaler is fitted on the training features only and then re-used for the
# test features, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
house_data_train_nz.iloc[:, 1:] = scaler.fit_transform(house_data_train_nz.iloc[:, 1:])
house_data_test_nz.iloc[:, 1:] = scaler.transform(house_data_test_nz.iloc[:, 1:])

# Divide values of the first column by 1000
house_data_train_nz.iloc[:, 0] /= 1000
house_data_test_nz.iloc[:, 0] /= 1000

In [None]:
# Assign response variable (y) and features (X) for training and test data.
# The first column is the response; every remaining column is a feature.
y_train = house_data_train_nz.iloc[:, 0]
X_train = house_data_train_nz.iloc[:, 1:]

y_test = house_data_test_nz.iloc[:, 0]
X_test = house_data_test_nz.iloc[:, 1:]

# Add a column of ones to X_train for the intercept term
X_train_with_intercept = np.hstack([np.ones((X_train.shape[0], 1)), X_train])

# Fit ordinary least squares with a least-squares solver instead of forming
# and inverting X^T X explicitly.  Explicit inversion squares the condition
# number of the problem and breaks down when columns are linearly dependent;
# lstsq copes with rank deficiency and returns the minimum-norm solution.
# NOTE(review): sqft_living, sqft_above and sqft_basement look linearly
# related -- confirm against the data; if so, their individual coefficients
# are not uniquely determined and only their combined effect is meaningful.
coefficients = np.linalg.lstsq(
    X_train_with_intercept,
    y_train.to_numpy(dtype=float),
    rcond=None,
)[0]

In [None]:
# Split the fitted parameter vector: position 0 is the intercept (it pairs
# with the column of ones), the remainder are the per-feature weights.
intercept, coef_values = coefficients[0], coefficients[1:]

# Display model intercept
print(f'Model Intercept: {intercept:.3f}\n')

# Round the coefficients to three decimal places
rounded_coefs = [round(weight, 3) for weight in coef_values]

# Pair every feature name with its rounded coefficient and show the table
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': rounded_coefs})
print(coef_df)

# ---- Training set ----------------------------------------------------------
y_train_pred = X_train_with_intercept @ coefficients

# Squared residuals and squared deviations from the mean, computed once
train_sq_resid = (y_train - y_train_pred) ** 2
train_sq_dev = (y_train - np.mean(y_train)) ** 2

# MSE is the mean squared residual; R^2 is 1 - SS_res / SS_tot
mse_train = np.mean(train_sq_resid)
r2_train = 1 - np.sum(train_sq_resid) / np.sum(train_sq_dev)
print('\nMean Squared Error (MSE) on Training Set:', f'{mse_train:.3f}')
print('R-squared (R^2) Score on Training Set:', f'{r2_train:.3f}')

# ---- Testing set -----------------------------------------------------------
# Prepend a column of ones so the intercept is applied to the test rows too
test_ones = np.ones((X_test.shape[0], 1))
X_test_with_intercept = np.hstack([test_ones, X_test])

y_test_pred = X_test_with_intercept @ coefficients

test_sq_resid = (y_test - y_test_pred) ** 2
test_sq_dev = (y_test - np.mean(y_test)) ** 2

mse_test = np.mean(test_sq_resid)
r2_test = 1 - np.sum(test_sq_resid) / np.sum(test_sq_dev)
print('\nMean Squared Error (MSE) on Testing Set:', f'{mse_test:.3f}')
print('R-squared (R^2) Score on Testing Set:', f'{r2_test:.3f}')

Model Intercept: 520.415

          Feature  Coefficient
0        bedrooms      -70.282
1       bathrooms       60.187
2     sqft_living       49.634
3        sqft_lot       10.130
4          floors        4.368
5      waterfront       62.201
6            view       52.341
7       condition       13.744
8           grade       85.879
9      sqft_above       81.564
10  sqft_basement       36.016
11       yr_built      -67.643
12   yr_renovated       17.271
13            lat       78.376
14           long       -1.035
15  sqft_living15       45.578
16     sqft_lot15      -12.930

Mean Squared Error (MSE) on Training Set: 34534.472
R-squared (R^2) Score on Training Set: 0.700

Mean Squared Error (MSE) on Testing Set: 56733.009
R-squared (R^2) Score on Testing Set: 0.660


In [None]:
# Fitted values for the training rows (design matrix already carries the
# intercept column)
y_train_pred = X_train_with_intercept @ coefficients

# Squared residuals and squared deviations from the mean response
train_sq_resid = (y_train - y_train_pred) ** 2
train_sq_dev = (y_train - np.mean(y_train)) ** 2

# MSE is the mean squared residual; R^2 is 1 - SS_res / SS_tot
mse_train = np.mean(train_sq_resid)
r2_train = 1 - np.sum(train_sq_resid) / np.sum(train_sq_dev)
print('Mean Squared Error (MSE) on Training Set:', f'{mse_train:.3f}')
print('R-squared (R^2) Score on Training Set:', f'{r2_train:.3f}')

Mean Squared Error (MSE) on Training Set: 34534.472
R-squared (R^2) Score on Training Set: 0.700


In [None]:
# Prepend a column of ones so the intercept is applied to the test rows
test_ones = np.ones((X_test.shape[0], 1))
X_test_with_intercept = np.hstack([test_ones, X_test])

# Predictions for the test rows
y_test_pred = X_test_with_intercept @ coefficients

# Squared residuals and squared deviations from the mean response
test_sq_resid = (y_test - y_test_pred) ** 2
test_sq_dev = (y_test - np.mean(y_test)) ** 2

# MSE is the mean squared residual; R^2 is 1 - SS_res / SS_tot
mse_test = np.mean(test_sq_resid)
r2_test = 1 - np.sum(test_sq_resid) / np.sum(test_sq_dev)
print('Mean Squared Error (MSE) on Testing Set:', f'{mse_test:.3f}')
print('R-squared (R^2) Score on Testing Set:', f'{r2_test:.3f}')


Mean Squared Error (MSE) on Testing Set: 56733.009
R-squared (R^2) Score on Testing Set: 0.660


In [None]:
# Predict the response on a new testing point

def predict_response(X_new, coefficients):
    """Return the linear-model prediction for ``X_new``.

    Parameters
    ----------
    X_new : array-like
        Left operand handed to ``np.dot``. No intercept column is added
        here, so the caller must already have included any intercept term.
    coefficients : array-like
        Right operand handed to ``np.dot``.

    Returns
    -------
    The value of ``np.dot(X_new, coefficients)``.
    """
    return np.dot(X_new, coefficients)

# A single hand-written observation: a leading 1 for the intercept followed by
# 17 feature values.
# NOTE(review): the per-position labels follow the feature order printed in
# the coefficient table; the values are presumably already standardized --
# confirm both before relying on the result.
X_new = np.array([
    1,       # intercept term
    -0.410,  # bedrooms
    -0.063,  # bathrooms
    -0.553,  # sqft_living
    -0.202,  # sqft_lot
    -0.863,  # floors
    -0.090,  # waterfront
    -0.310,  # view
    -0.673,  # condition
    -0.523,  # grade
    -0.241,  # sqft_above
    -0.668,  # sqft_basement
    -0.073,  # yr_built
    -0.207,  # yr_renovated
    -0.835,  # lat
    0.262,   # long
    -0.563,  # sqft_living15
    -0.189,  # sqft_lot15
])

# Score the observation with the fitted parameter vector and show the result
predicted_response = predict_response(X_new, coefficients)
print('Predicted Response:', predicted_response)

Predicted Response: 304.9029643995305
