# In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

import warnings     # filter warning messages
warnings.simplefilter(action="ignore")


def calc_metrics(y_true, y_pred):
    """
    Calculate Mean Squared Error (MSE) and R-squared (R^2) score.

    Both inputs may be any array-like (list, NumPy array, pandas Series).
    They are converted to flat float arrays and compared position by
    position, so a pandas index never triggers label alignment and an
    ``(n, 1)`` column vector is never broadcast against an ``(n,)`` vector.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        Tuple ``(mse, r2)``: ``mse`` is the mean of the squared residuals
        and ``r2`` is ``1 - SS_res / SS_tot``. When ``y_true`` is constant,
        ``SS_tot`` is zero and ``r2`` comes out as ``nan`` or ``-inf``.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    squared_errors = (actual - predicted) ** 2
    mse = np.mean(squared_errors)

    # Total sum of squares around the mean of the observed values.
    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    r2 = 1 - np.sum(squared_errors) / total_ss
    return mse, r2

def gradient_descent(X, y, alpha, num_iterations):
    """
    Perform gradient descent to minimize the cost function.

    Fits a linear model with an intercept by batch gradient descent on the
    mean squared error, starting from all-zero parameters.

    Args:
        X: Feature matrix, array-like of shape (num_samples, num_features),
            without an intercept column (one is prepended here).
        y: Target values, array-like holding num_samples entries; a column
            vector of shape (num_samples, 1) is flattened.
        alpha: Learning rate applied to every update.
        num_iterations: Number of gradient updates to perform.

    Returns:
        1-D array of length num_features + 1; element 0 is the intercept,
        the remaining elements are the feature coefficients.

    Raises:
        ValueError: If X is not two-dimensional or y does not contain one
            value per row of X.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (num_samples, num_features)")

    # Flatten so an (n, 1) target cannot broadcast against (n,) predictions.
    y = np.asarray(y, dtype=float).ravel()

    num_samples, num_features = X.shape
    if y.size != num_samples:
        raise ValueError("y must contain exactly one value per row of X")

    theta = np.zeros(num_features + 1)  # intercept first, then one weight per feature

    # Insert a column of ones at the beginning of X (intercept term)
    X_with_intercept = np.hstack([np.ones((num_samples, 1)), X])

    for _ in range(num_iterations):
        # Residuals of the current fit
        residuals = y - X_with_intercept @ theta

        # Gradient of the MSE; the matrix product is taken first so only the
        # resulting vector is scaled, not the whole design matrix.
        gradients = (-2.0 / num_samples) * (X_with_intercept.T @ residuals)

        # Update parameters
        theta -= alpha * gradients

    return theta

# Read and preprocess the data
# NOTE(review): only the test file drops 'id' and 'date'. Everything below
# slices by position and treats column 0 as the response, so both frames must
# end up with the same column order — confirm against the CSV headers.
house_data_train = pd.read_csv('train.csv').drop(columns=['Unnamed: 0', 'zipcode'])
house_data_test = pd.read_csv('test.csv').drop(columns=['Unnamed: 0', 'id', 'date', 'zipcode'])

# Make float copies of the original data. Casting up front means the scaled
# values and the /1000 result below are written into float columns; assigning
# floats into integer columns through .iloc is a deprecated upcast in recent
# pandas, and the warning would be hidden by the global warnings filter.
house_data_train_nz = house_data_train.astype(float)
house_data_test_nz = house_data_test.astype(float)

# Normalize all columns except the first one; the scaler is fitted on the
# training features only and reused unchanged on the test features.
scaler = StandardScaler()
house_data_train_nz.iloc[:, 1:] = scaler.fit_transform(house_data_train_nz.iloc[:, 1:])
house_data_test_nz.iloc[:, 1:] = scaler.transform(house_data_test_nz.iloc[:, 1:])

# Divide values of the first column by 1000
house_data_train_nz.iloc[:, 0] /= 1000
house_data_test_nz.iloc[:, 0] /= 1000

# Assign response variable (y) and features (X) for training and test data
y_train = house_data_train_nz.iloc[:, 0]
X_train = house_data_train_nz.iloc[:, 1:]

y_test = house_data_test_nz.iloc[:, 0]
X_test = house_data_test_nz.iloc[:, 1:]

# Design matrices (leading column of ones for the intercept) do not depend on
# the hyper-parameters, so build them once instead of on every grid point.
X_train_design = np.hstack([np.ones((len(X_train), 1)), X_train.values])
X_test_design = np.hstack([np.ones((len(X_test), 1)), X_test.values])

# Define learning rates and number of iterations
learning_rates = [0.01, 0.1, 0.5]
num_iterations_list = [10, 50, 100]

# Initialize results dictionary
results = {'Learning Rate': [], 'Num Iterations': [],
           'MSE Train': [], 'R2 Train': [],
           'MSE Test': [], 'R2 Test': []}

# Iterate over learning rates and number of iterations
for alpha in learning_rates:
    for num_iterations in num_iterations_list:
        # Perform gradient descent
        theta = gradient_descent(X_train.values,
                                 y_train.values, alpha, num_iterations)

        # Predict on the training and test sets with the fitted parameters
        y_train_pred = X_train_design @ theta
        y_test_pred = X_test_design @ theta

        # Calculate evaluation metrics for both sets
        mse_train, r2_train = calc_metrics(y_train, y_train_pred)
        mse_test, r2_test = calc_metrics(y_test, y_test_pred)

        # Append results to the dictionary
        results['Learning Rate'].append(alpha)
        results['Num Iterations'].append(num_iterations)
        results['MSE Train'].append(mse_train)
        results['R2 Train'].append(r2_train)
        results['MSE Test'].append(mse_test)
        results['R2 Test'].append(r2_test)

# Create a DataFrame from the results dictionary
results_df = pd.DataFrame(results)

# Display the results
print(results_df)
