# 03. Modeling and Evaluation

## Objective
Implement and compare two models using **NumPy exclusively**:
1.  **Model 1 (OLS)**: Ordinary Least Squares Linear Regression.
2.  **Model 2 (Lasso)**: Lasso Regression (L1 Regularization) using **Coordinate Descent**.

**Evaluation Strategy**:
- **Hold-out**: 20% of data reserved for final testing.
- **Cross-Validation**: 5-Fold CV on the remaining 80% training data.
- **Metrics**: Mean Squared Error (MSE) and $R^2$ Score.

In [39]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
PROCESSED_DATA_PATH = '../data/processed/airbnb_processed.csv'

# Structured dtype for one row of the processed CSV. Fields are listed in the
# order the columns are expected to appear in the file. Every field is float64
# (scaled by preprocessing) except the raw 'price', which stays int32.
processed_dtype = np.dtype([
    (column, np.int32 if column == 'price' else np.float64)
    for column in (
        'latitude',
        'longitude',
        'price',                              # original (untransformed) value
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
        'price_log',                          # target variable Y
        'neighbourhood_group_Brooklyn',
        'neighbourhood_group_Manhattan',
        'neighbourhood_group_Queens',
        'neighbourhood_group_Staten Island',
        'room_type_Private room',
        'room_type_Shared room',
        'review_activity',
        'review_quality_score',
        'is_high_value_core',
        'interaction_nights_reviews',         # engineered interaction feature
    )
])

# --- 2. Data Loading Function (Using csv and NumPy) ---

def load_processed_data(file_path, target_dtype):
    """Load a headered CSV file into a NumPy structured array.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded CSV file whose first row is the header.
    target_dtype : np.dtype
        Structured dtype naming the fields to load and their element types.

    Returns
    -------
    header : list of str
        The header row exactly as read from the file.
    structured_data : np.ndarray
        1-D structured array of ``target_dtype`` with one element per data row
        (length 0 when the file contains only a header).

    Raises
    ------
    ValueError
        If the file has no header row, if data rows have differing numbers of
        fields, or if a value cannot be converted to its field type.

    Notes
    -----
    Columns are matched to fields by header name when every field name occurs
    in the header, so a reordered CSV still loads correctly. Otherwise the
    i-th field is read from the i-th column (positional matching).
    """
    # newline='' is the csv-module recommended mode; it lets the reader handle
    # quoted fields containing line breaks itself.
    with open(file_path, mode='r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            raise ValueError(f"No header row found in {file_path}")
        # csv.reader yields [] for completely blank lines; they carry no data.
        raw_data = [row for row in reader if row]

    names = target_dtype.names
    structured_data = np.zeros(len(raw_data), dtype=target_dtype)
    if not raw_data:
        # Header-only file: nothing to convert.
        return header, structured_data

    if len({len(row) for row in raw_data}) != 1:
        raise ValueError(f"Rows in {file_path} have inconsistent numbers of fields")

    raw_array = np.array(raw_data, dtype=object)

    # Prefer matching by header name; fall back to position when the header
    # does not contain every requested field.
    header_names = [column.strip() for column in header]
    if all(name in header_names for name in names):
        column_of = {name: header_names.index(name) for name in names}
    else:
        column_of = {name: position for position, name in enumerate(names)}

    for name in names:
        column_data = raw_array[:, column_of[name]]
        target_type = target_dtype[name].type
        try:
            structured_data[name] = column_data.astype(target_type)
        except ValueError:
            # Fallback for empty strings if any remain: treat them as zero.
            column_data[column_data == ''] = '0'
            structured_data[name] = column_data.astype(target_type)

    return header, structured_data

# Read the processed CSV into a structured array (one element per listing row).
header, data = load_processed_data(PROCESSED_DATA_PATH, processed_dtype)

print("--- Data Loading Complete ---")
print(f"Data Shape: {data.shape}")
print(f"Column Names: {data.dtype.names}")

# --- Prepare X (Features) and Y (Target) ---
target_col = 'price_log'
# Keep both the target and the raw 'price' out of the feature set
# (presumably because 'price_log' is derived from 'price' -- confirm in preprocessing).
exclude_cols = {'price', 'price_log'}
# Feature order follows the field order of the structured dtype.
feature_names = [name for name in data.dtype.names if name not in exclude_cols]

Y_all = data[target_col]
# One 1-D array per feature; column_stack turns them into the columns of a 2-D matrix.
X_list = [data[name] for name in feature_names]
X_all = np.column_stack(X_list)

print(f"\nFeature Matrix X Shape: {X_all.shape}")
print(f"Target Vector Y Shape: {Y_all.shape}")
print(f"Features used: {feature_names}")

--- Data Loading Complete ---
Data Shape: (48258,)
Column Names: ('latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'price_log', 'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Staten Island', 'room_type_Private room', 'room_type_Shared room', 'review_activity', 'review_quality_score', 'is_high_value_core', 'interaction_nights_reviews')

Feature Matrix X Shape: (48258, 17)
Target Vector Y Shape: (48258,)
Features used: ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Staten Island', 'room_type_Private room', 'room_type_Shared room', 'review_activity', 'review_quality_score', 'is_high_value_core', 'interaction_nights_reviews']

## 1. Setup and Splitting
We split the data into **Train (80%)** and **Test (20%)** sets using random shuffling.  
The 20% independent test set will strictly be used ONLY for the final evaluation.

In [40]:
# Fix the global NumPy RNG so the shuffle (and therefore the split) is reproducible.
np.random.seed(77)
indices = np.arange(len(X_all))
np.random.shuffle(indices)  # in-place permutation of the row indices

test_size = 0.2
# Number of rows going to the training split; int() truncates toward zero.
split_idx = int(len(X_all) * (1 - test_size))

# First part of the shuffled indices -> train, remainder -> test (disjoint by construction).
train_indices = indices[:split_idx]
test_indices = indices[split_idx:]

# "_full" = feature matrix without the intercept column (added later where needed).
X_train_full = X_all[train_indices]
Y_train = Y_all[train_indices]

X_test_full = X_all[test_indices]
Y_test = Y_all[test_indices]

print(f"Training Set: {X_train_full.shape[0]} samples")
print(f"Test Set: {X_test_full.shape[0]} samples")

Training Set: 38606 samples
Test Set: 9652 samples


## 2. Models Implementation: Theory & Code

### OLS

The goal of OLS is to find the parameter vector $W \in \mathbb{R}^d$ that minimizes the **Sum of Squared Errors (SSE)** between the true values $Y \in \mathbb{R}^n$ and the predicted values $\hat{Y} = XW$:

$$ J(W) = ||Y - XW||^2_2 = \sum_{i=1}^n (y_i - x_i^T W)^2 $$

To find the minimum, we take the gradient with respect to $W$ and set it to zero:

$$ \nabla_W J(W) = \nabla_W (Y^TY - 2Y^TXW + W^TX^TXW) = -2X^T(Y - XW) = 0 $$

Rearranging the terms gives us the **Normal Equation**:

$$ X^T X W = X^T Y \implies W = (X^T X)^{-1} X^T Y $$

**NumPy Implementation Note**:
Directly computing the inverse $(X^T X)^{-1}$ can be numerically unstable if features are multicollinear. We use `np.linalg.lstsq`, which employs **Singular Value Decomposition (SVD)** to solve the system, offering higher numerical precision and stability.



In [41]:
# --- Feature Preparation ---
def add_intercept(X):
    """Return ``X`` with a column of ones prepended as the bias term.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)

    Returns
    -------
    np.ndarray of shape (n_samples, n_features + 1)
        Column 0 is all ones; the remaining columns are ``X`` unchanged.
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, X), axis=1)

# --- Linear Models ---

def train_linear_regression(X, Y):
    """Fit ordinary least squares weights for ``Y ~ X @ W``.

    The least-squares problem is handed to ``np.linalg.lstsq`` rather than
    inverting ``X.T @ X`` explicitly. Only the solution vector is returned;
    the residuals, rank and singular values it also reports are discarded.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_columns)
        Design matrix (include the intercept column if one is wanted).
    Y : np.ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    np.ndarray
        Least-squares weight vector ``W``.
    """
    solution = np.linalg.lstsq(X, Y, rcond=None)
    return solution[0]

def predict_linear(X, W):
    """Return the linear-model predictions ``X @ W``.

    ``X`` must already contain every column ``W`` has a weight for
    (including the intercept column, if the model was trained with one).
    """
    predictions = np.matmul(X, W)
    return predictions



# --- Metrics ---
def mse(y_true, y_pred):
    """Mean squared error: the average of the squared residuals ``y_true - y_pred``."""
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    A small constant (1e-8) is added to ``SS_tot`` so a constant ``y_true``
    does not cause a division by zero.
    """
    residual_ss = np.square(y_true - y_pred).sum()
    centered = y_true - np.mean(y_true)
    total_ss = np.square(centered).sum()
    return 1 - residual_ss / (total_ss + 1e-8)

## 3. K-Fold Cross Validation: Logic & Implementation

**Objective**: To robustly estimate model performance and tune hyperparameters ($\lambda$) without touching the Test Set.

**Logic**:
1.  Divide the `X_train_full` data into $k=5$ equal folds.
2.  Iterate $i$ from $0$ to $k-1$:
    -   **Validation Fold**: Fold $i$.
    -   **Training Folds**: All folds except $i$.
3.  Train model on Training Folds $\rightarrow$ Predict on Validation Fold.
4.  Average the MSE/R2 scores across all $k$ iterations.

**NumPy Implementation Details**:
-   We use `np.arange` to generate indices.
-   We slice indices `indices[val_start:val_end]` for validation.
-   We use `np.concatenate` to merge the remaining indices for training.
-   **Note**: We manually add the intercept column inside the loop to ensure clean separation.

In [42]:
def k_fold_cross_validation(X, Y, model_type='linear', k=5, **kwargs):
    """
    Perform K-Fold CV and return the mean validation MSE and R2.

    X should be passed WITHOUT intercept (Intercept added inside).

    Rows are split into ``k`` contiguous folds in their existing order; no
    shuffling happens here, so the caller is expected to pass rows that are
    already shuffled. When ``len(X)`` is not a multiple of ``k`` the first
    ``len(X) % k`` folds receive one extra row, so every row is used for
    validation exactly once.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Feature matrix without the intercept column.
    Y : np.ndarray of shape (n_samples,)
        Target values aligned with the rows of ``X``.
    model_type : str
        Model to fit on each fold. Only ``'linear'`` (OLS) is supported.
    k : int
        Number of folds, ``2 <= k <= len(X)``.
    **kwargs
        Accepted for forward compatibility; currently unused.

    Returns
    -------
    (float, float)
        Mean MSE and mean R2 across the ``k`` validation folds.

    Raises
    ------
    ValueError
        If ``model_type`` is unknown, ``X`` and ``Y`` differ in length, or
        ``k`` is outside ``[2, len(X)]``.
    """
    # Fail fast, before any fold is built.
    if model_type != 'linear':
        raise ValueError(f"Unknown model type: {model_type}")

    n_samples = len(X)
    if len(Y) != n_samples:
        raise ValueError(
            f"X and Y must have the same number of rows, got {n_samples} and {len(Y)}"
        )
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 2 <= k <= number of samples ({n_samples}), got {k}"
        )

    indices = np.arange(n_samples)
    # The first `remainder` folds take one extra row so no row is left out.
    base_size, remainder = divmod(n_samples, k)

    mse_scores = []
    r2_scores = []

    val_start = 0
    for i in range(k):
        val_end = val_start + base_size + (1 if i < remainder else 0)

        val_idx = indices[val_start:val_end]
        train_idx = np.concatenate([indices[:val_start], indices[val_end:]])

        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Linear models need intercept added explicitly here
        X_train_int = add_intercept(X_train_fold)
        X_val_int = add_intercept(X_val_fold)

        W = train_linear_regression(X_train_int, Y_train_fold)
        Y_pred = predict_linear(X_val_int, W)

        mse_scores.append(mse(Y_val_fold, Y_pred))
        r2_scores.append(r2_score(Y_val_fold, Y_pred))

        val_start = val_end

    return np.mean(mse_scores), np.mean(r2_scores)


## 4. Final Evaluation

We train the final models on the **Entire Training Set (80%)** using the optimal hyperparameters found for Lasso, and evaluate them on the **Independent Test Set (20%)**.

Metrics used:

1.  **Mean Squared Error (MSE)** - The primary loss function:
    $$ \text{MSE} = \frac{1}{n_{test}} \sum_{i=1}^{n_{test}} (y_i - \hat{y}_i)^2 $$

2.  **Coefficient of Determination ($R^2$ Score)** - Measures the proportion of variance in the dependent variable that is predictable from the independent variables:
    $$ R^2 = 1 - \frac{SS_{res}}{SS_{tot}} = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2} $$
    *   $R^2 = 1$: Perfect prediction.
    *   $R^2 = 0$: The model is only as good as predicting the mean ($\bar{y}$).
    *   $R^2 < 0$: The model performs worse than a simple horizontal line (mean).

3.  **RMSE (Original Scale)**: The square root of MSE, calculated after transforming the target variable back to its original scale ($e^y - 1$). This provides interpretability in the original unit (USD).

In [43]:
# Train Final Models on Full Training Set
# The intercept column is prepended to both splits so train and test share the same layout.
X_train_int = add_intercept(X_train_full)
X_test_int = add_intercept(X_test_full)

# 1. OLS: fit on the training split only, then predict the held-out test split
W_ols = train_linear_regression(X_train_int, Y_train)
Y_pred_ols = predict_linear(X_test_int, W_ols)

# Metrics on the log-scale target ('price_log')
mse_ols = mse(Y_test, Y_pred_ols)
r2_ols = r2_score(Y_test, Y_pred_ols)

print("{:<15} {:<15} {:<10}".format("Model", "MSE (Log)", "R2 Score"))
print("-" * 40)
print("{:<15} {:<15.4f} {:<10.4f}".format("OLS (Linear)", mse_ols, r2_ols))

# RMSE Original Scale Calculation (USD)


# Transform back: exp(log_price) - 1
# (presumably the inverse of a log(1 + price) transform applied in preprocessing -- confirm)
Y_test_orig = np.exp(Y_test) - 1
Y_pred_orig = np.exp(Y_pred_ols) - 1

# NOTE(review): rmse_price is computed here but never printed or used in this cell.
rmse_price = np.sqrt(mse(Y_test_orig, Y_pred_orig))


Model           MSE (Log)       R2 Score  
----------------------------------------
OLS (Linear)    0.1874          0.5362    


## 5. Verification: Comparison with Scikit-learn

In this final section, we verify the correctness of our custom **NumPy** implementations by comparing them against the industry-standard **Scikit-learn** versions. 

- We rely on `sklearn.linear_model` for the models.
- We continue to use our **custom NumPy metrics** (`mse`, `r2_score`) for evaluation to ensure an apples-to-apples comparison.

In [44]:
from sklearn.linear_model import LinearRegression, Lasso

# 1. Scikit-learn OLS
# X is passed WITHOUT the manual intercept column; the estimator is expected to fit
# its own intercept (sklearn default -- confirm against the LinearRegression docs).
sklearn_ols = LinearRegression()
sklearn_ols.fit(X_train_full, Y_train)
Y_pred_sklearn_ols = sklearn_ols.predict(X_test_full)

# Score with the notebook's own mse / r2_score so both implementations share one metric code path.
mse_sklearn_ols = mse(Y_test, Y_pred_sklearn_ols)
r2_sklearn_ols = r2_score(Y_test, Y_pred_sklearn_ols)
# RMSE after mapping log-scale values back with exp(y) - 1
rmse_sklearn_ols = np.sqrt(mse(np.exp(Y_test)-1, np.exp(Y_pred_sklearn_ols)-1))

# 2. Scikit-learn Lasso
# NOTE(review): alpha=0.0001 is hard-coded; no tuning step for it is visible in this notebook.
sklearn_lasso = Lasso(alpha=0.0001)
sklearn_lasso.fit(X_train_full, Y_train)
Y_pred_sklearn_lasso = sklearn_lasso.predict(X_test_full)

mse_sklearn_lasso = mse(Y_test, Y_pred_sklearn_lasso)
r2_sklearn_lasso = r2_score(Y_test, Y_pred_sklearn_lasso)
rmse_sklearn_lasso = np.sqrt(mse(np.exp(Y_test)-1, np.exp(Y_pred_sklearn_lasso)-1))

# Calculate RMSE for Custom models for the table
# (same formula as the sklearn rows above, applied to the custom OLS predictions)
rmse_ols = np.sqrt(mse(np.exp(Y_test)-1, np.exp(Y_pred_ols)-1))

# Side-by-side table: custom NumPy OLS vs. scikit-learn OLS, then scikit-learn Lasso.
print("=================================================================================")
print("{:<10} {:<15} {:<15} {:<10} {:<15}".format("Model", "Implementation", "MSE (Log)", "R2", "RMSE (Orig $)"))
print("-" * 75)

print("{:<10} {:<15} {:<15.4f} {:<10.4f} ${:<15.2f}".format("OLS", "Custom NumPy", mse_ols, r2_ols, rmse_ols))
print("{:<10} {:<15} {:<15.4f} {:<10.4f} ${:<15.2f}".format("OLS", "Scikit-learn", mse_sklearn_ols, r2_sklearn_ols, rmse_sklearn_ols))
print("-" * 75)
print("{:<10} {:<15} {:<15.4f} {:<10.4f} ${:<15.2f}".format("Lasso", "Scikit-learn", mse_sklearn_lasso, r2_sklearn_lasso, rmse_sklearn_lasso))
print("=================================================================================")

Model      Implementation  MSE (Log)       R2         RMSE (Orig $)  
---------------------------------------------------------------------------
OLS        Custom NumPy    0.1874          0.5362     $77.28          
OLS        Scikit-learn    0.1874          0.5362     $77.28          
---------------------------------------------------------------------------
Lasso      Scikit-learn    0.1874          0.5362     $77.32          
