In [48]:
#packages
import numpy as np
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.linalg import qr
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from autograd import grad, hessian

In [49]:
def sketch_matrix(m, n_columns, non_zero_entries):
    """Generates a sketching matrix S with random ±1 entries."""
    S = np.zeros((m, n_columns))
    scaling_factor = 1 / np.sqrt(non_zero_entries)
    
    for col in range(n_columns):
        nz_positions = np.random.choice(m, non_zero_entries, replace=False)
        values = np.random.choice([1, -1], non_zero_entries) * scaling_factor
        for idx, value in zip(nz_positions, values):
            S[idx, col] = value
    
    return S

def unconstrained_newton_sketch(f, x0, m, non_zero_entries,tolerance = 1e-6, a = 0.1, b = 0.5 , max_iter=1000):
    grad_f = grad(f)  # Gradient of f
    hess_f = hessian(f)  # Hessian of f
    
    xt = x0  # Starting point
    n = len(x0)
    for t in range(max_iter):
        # Generate the sketching matrix at each iteration
        St = sketch_matrix(m, n, non_zero_entries)

        # Compute the gradient and Hessian at the current point using autograd
        grad_value = grad_f(xt)  # Gradient at xt
        hess_value = hess_f(xt)  # Hessian at xt

        sketched_hessian = St @ hess_value @ St.T
        sketched_grad = St @ grad_value
        delta_xt_sketched = np.linalg.solve(sketched_hessian, sketched_grad)
        delta_xt = St.T @ delta_xt_sketched

        # Compute the approximate Newton decrement
        lambda_t = np.dot(grad_value, delta_xt)

        # Check stopping condition
        if lambda_t**2 / 2 <= tolerance:
            break
        
        # Backtracking line search
        step_size = 1.0
        while f(xt - step_size * delta_xt) > f(xt) - a * step_size * lambda_t:
            step_size *= b
        
        # Update
        xt = xt - step_size * delta_xt
    
    return xt, np.abs(lambda_t)

#function to be used
def least_squares_loss(x, A, b):
    """
    Computes the least squares loss: f(x) = ||Ax - b||_2^2
    """
    return np.linalg.norm(A @ x - b)**2

In [50]:
df = pd.read_csv("../Dataset/Housing.csv")

In [51]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [52]:
X = df.drop(columns=['price'])
y = df['price']
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                    'airconditioning', 'prefarea', 'furnishingstatus']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Standardize numerical features
        ('cat', OneHotEncoder(), categorical_cols)   # OneHotEncode categorical features
    ])

X_transformed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)


In [53]:
m = X_train.shape[0]
non_zero_entries = 10
x0 = np.zeros((X_train.shape[1], 1))
x_optimized, _ = unconstrained_newton_sketch(lambda x: least_squares_loss(x, X_train, y_train), x0, m, non_zero_entries)
x_optimized_flat = x_optimized.flatten()
y_pred = X_test @ x_optimized_flat
y_pred = y_pred.flatten()
# Compute performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

TypeError: loop of ufunc does not support argument 0 of type ArrayBox which has no callable sqrt method