# 03. Modeling and Evaluation

## Objective
Implement and compare three models using **NumPy exclusively**:
1.  **Model 1 (Reduced)**: Linear Regression on key features.
2.  **Model 2 (Full - Stabilized)**: Ridge Regression on all features (including Advanced Features).
3.  **Model 3 (KNN)**: K-Nearest Neighbors Regression.

**Evaluation Strategy**:
- **Hold-out**: 20% of data reserved for final testing.
- **Cross-Validation**: 5-Fold CV on the remaining 80% training data.
- **Metrics**: Mean Squared Error (MSE) and $R^2$ Score.

In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns

PROCESSED_DATA_PATH = '../data/processed/airbnb_processed.csv'

def load_processed_data(file_path):
    with open(file_path, mode='r', encoding='utf-8') as f:
        reader = csv.reader(f)
        header = next(reader)
        data = list(reader)
    return header, np.array(data, dtype=float)

print("Loading processed data...")
header, data = load_processed_data(PROCESSED_DATA_PATH)
feature_names = header[:-1]
target_name = header[-1]

X_all = data[:, :-1]
Y_all = data[:, -1]

print("Features:", feature_names)
print("Data Shape:", X_all.shape)

Loading processed data...
Features: ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'price_log', 'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Staten Island', 'room_type_Private room', 'room_type_Shared room', 'review_activity', 'review_quality_score']
Data Shape: (48258, 17)


## 1. Setup and Splitting
Split data into **Train (80%)** and **Test (20%)** sets explicitly.

In [2]:
np.random.seed(42)
indices = np.arange(len(X_all))
np.random.shuffle(indices)

test_size = 0.2
split_idx = int(len(X_all) * (1 - test_size))

train_indices = indices[:split_idx]
test_indices = indices[split_idx:]

X_train_full = X_all[train_indices]
Y_train = Y_all[train_indices]

X_test_full = X_all[test_indices]
Y_test = Y_all[test_indices]

print(f"Training Set: {X_train_full.shape[0]} samples")
print(f"Test Set: {X_test_full.shape[0]} samples")

Training Set: 38606 samples
Test Set: 9652 samples


## 2. Models Implementation (Linear & KNN)
1. **Linear Regression** (Normal Equation)
2. **Ridge Regression** (Stabilized Normal Eq)
3. **KNN Regression** (Euclidean Distance)

In [3]:
def add_intercept(X):
    intercept = np.ones((X.shape[0], 1))
    return np.hstack((intercept, X))

# --- Linear Models ---
def train_linear_regression(X, Y):
    X_T = X.T
    try:
        W = np.linalg.inv(X_T @ X) @ X_T @ Y
    except np.linalg.LinAlgError:
        W = np.linalg.pinv(X_T @ X) @ X_T @ Y
    return W

def train_ridge_regression(X, Y, lambda_reg=0.001):
    X_T = X.T
    n_features = X.shape[1]
    I = np.eye(n_features)
    I[0, 0] = 0 # No reg on intercept
    try:
        W = np.linalg.inv(X_T @ X + lambda_reg * I) @ X_T @ Y
    except np.linalg.LinAlgError:
         W = np.linalg.pinv(X_T @ X + lambda_reg * I) @ X_T @ Y
    return W

def predict_linear(X, W):
    return X @ W

# --- KNN Model ---
class KNNRegressor:
    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.Y_train = None

    def fit(self, X, Y):
        self.X_train = X
        self.Y_train = Y

    def predict(self, X_test):
        predictions = []
        # Loop through each test point (vectorized distance calc per point)
        # For large datasets, this can be slow. We optimize where possible.
        for i in range(len(X_test)):
            test_point = X_test[i]
            
            # Euclidean Distance: sqrt(sum((x - y)^2))
            # Broadcasting test_point across all X_train
            distances = np.sqrt(np.sum((self.X_train - test_point) ** 2, axis=1))
            
            # Find K nearest indices
            # argpartition is faster than sort for finding top k
            nearest_indices = np.argpartition(distances, self.k)[:self.k]
            
            # Get mean of Y values
            k_nearest_y = self.Y_train[nearest_indices]
            pred = np.mean(k_nearest_y)
            predictions.append(pred)
            
        return np.array(predictions)

# --- Metrics ---
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def r2_score(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / ss_tot)

## 3. K-Fold Cross Validation Helper
Supports both Linear Weights and KNN Object approaches.

In [4]:
def k_fold_cross_validation(X, Y, model_type='linear', k=5, **kwargs):
    fold_size = len(X) // k
    indices = np.arange(len(X))
    
    mse_scores = []
    r2_scores = []
    
    print(f"Starting {k}-Fold CV for {model_type}...")
    
    for i in range(k):
        val_start = i * fold_size
        val_end = (i + 1) * fold_size
        
        val_idx = indices[val_start:val_end]
        train_idx = np.concatenate([indices[:val_start], indices[val_end:]])
        
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]
        
        if model_type == 'linear':
            W = train_linear_regression(X_train_fold, Y_train_fold)
            Y_pred = predict_linear(X_val_fold, W)
        elif model_type == 'ridge':
            W = train_ridge_regression(X_train_fold, Y_train_fold, **kwargs)
            Y_pred = predict_linear(X_val_fold, W)
        elif model_type == 'knn':
            knn = KNNRegressor(**kwargs)
            knn.fit(X_train_fold, Y_train_fold)
            Y_pred = knn.predict(X_val_fold)
            
        mse_val = mse(Y_val_fold, Y_pred)
        r2_val = r2_score(Y_val_fold, Y_pred)
        
        mse_scores.append(mse_val)
        r2_scores.append(r2_val)
        # print(f"Fold {i+1}: MSE={mse_val:.4f}, R2={r2_val:.4f}")
        
    return np.mean(mse_scores), np.mean(r2_scores)

## 4. Model Comparisons

### Model 1: Reduced Features (Standard Linear Regression)
Updated to include relevant advanced features if applicable. 
Let's select: `min_nights_std`, `availability_365_std`, `review_density_std`, `poly_min_nights_sq`, `interact_min_nights_reviews_month` + Room Types.

In [5]:
# Define Reduced Feature Set (Manual Selection based on intuition/correlation)
reduced_features = [
    'min_nights_std', 
    'availability_365_std', 
    'review_density_std',
    'poly_min_nights_sq',             # New Polynomial
    'interact_min_nights_reviews_month' # New Interaction
]
# Add Room Type Features dynamically
rt_features = [f for f in feature_names if 'Private room' in f or 'Entire home' in f or 'Shared room' in f]
reduced_features += rt_features

reduced_indices = [feature_names.index(f) for f in reduced_features]

# Prepare Reduced X (Add Intercept for LR)
X_train_red = add_intercept(X_train_full[:, reduced_indices])
X_test_red = add_intercept(X_test_full[:, reduced_indices])

mse_cv_red, r2_cv_red = k_fold_cross_validation(X_train_red, Y_train, model_type='linear', k=5)
print(f"Model 1 (Reduced) - 5-Fold CV: MSE = {mse_cv_red:.4f}, R2 = {r2_cv_red:.4f}")

ValueError: 'min_nights_std' is not in list

### Model 2: Full Features (Ridge Regression)
Using all 300+ features (including OHE neighbourhoods and new interactions).

In [6]:
X_train_ridge = add_intercept(X_train_full)
X_test_ridge = add_intercept(X_test_full)

lambda_val = 0.001
mse_cv_full, r2_cv_full = k_fold_cross_validation(X_train_ridge, Y_train, model_type='ridge', k=5, lambda_reg=lambda_val)
print(f"Model 2 (Full Ridge) - 5-Fold CV: MSE = {mse_cv_full:.4f}, R2 = {r2_cv_full:.4f}")

Starting 5-Fold CV for ridge...
Model 2 (Full Ridge) - 5-Fold CV: MSE = 0.1119, R2 = 0.3577


### Model 3: KNN Regression (Full Features)
Distance-based model. 
**Note**: KNN is computationally expensive (O(N*M)) for prediction. We use a subset of training data or smaller K-Folds if it's too slow, but for this assignment we run full on Full Features.

In [7]:
# No intercept needed for KNN
# Note: Running KNN CV on Full Features might be slow depending on dataset size (~30k rows).
# We will proceed with k=5 neighbors.

mse_cv_knn, r2_cv_knn = k_fold_cross_validation(X_train_full, Y_train, model_type='knn', k=5)
print(f"Model 3 (KNN)        - 5-Fold CV: MSE = {mse_cv_knn:.4f}, R2 = {r2_cv_knn:.4f}")

Starting 5-Fold CV for knn...


KeyboardInterrupt: 

## 5. Final Evaluation on Test Set
Train strictly on Train Set, Evaluate on Test Set.

In [None]:
print("\n---- Final Test Set Evaluation ----")

# Model 1 Final
W_red = train_linear_regression(X_train_red, Y_train)
Y_pred_red = predict_linear(X_test_red, W_red)
test_mse_red = mse(Y_test, Y_pred_red)
test_r2_red = r2_score(Y_test, Y_pred_red)

# Model 2 Final
W_ridge = train_ridge_regression(X_train_ridge, Y_train, lambda_reg=lambda_val)
Y_pred_ridge = predict_linear(X_test_ridge, W_ridge)
test_mse_ridge = mse(Y_test, Y_pred_ridge)
test_r2_ridge = r2_score(Y_test, Y_pred_ridge)

# Model 3 Final
knn_final = KNNRegressor(k=5)
knn_final.fit(X_train_full, Y_train)
Y_pred_knn = knn_final.predict(X_test_full)
test_mse_knn = mse(Y_test, Y_pred_knn)
test_r2_knn = r2_score(Y_test, Y_pred_knn)

print(f"Model 1 (Reduced)    : MSE = {test_mse_red:.4f}, R2 = {test_r2_red:.4f}")
print(f"Model 2 (Full Ridge) : MSE = {test_mse_ridge:.4f}, R2 = {test_r2_ridge:.4f}")
print(f"Model 3 (KNN)        : MSE = {test_mse_knn:.4f}, R2 = {test_r2_knn:.4f}")

# Conclusion Logic
best_r2 = max(test_r2_red, test_r2_ridge, test_r2_knn)
if best_r2 == test_r2_knn:
    print("\nConclusion: Non-linear KNN model performed best, suggesting non-linear relationships dominate.")
elif best_r2 == test_r2_ridge:
    print("\nConclusion: Stabilized Full Feature Linear model performed best.")
else:
    print("\nConclusion: Reduced Feature Linear model performed best.")


---- Final Test Set Evaluation ----
Model 1 (Reduced)    : MSE = 0.5771, R2 = 0.4283
Model 2 (Full Ridge) : MSE = 0.5013, R2 = 0.5035
Model 3 (KNN)        : MSE = 0.5332, R2 = 0.4719

Conclusion: Stabilized Full Feature Linear model performed best.
