In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

import tensorflow as tf
import random

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

In [6]:
columns = ['Page Popularity/likes', 'Page Checkins', 'Page talking about', 'Page Category', 'Derived Feature 5', 'Derived Feature 6', 'Derived Feature 7', 
    'Derived Feature 8', 'Derived Feature 9', 'Derived Feature 10', 'Derived Feature 11', 'Derived Feature 12', 'Derived Feature 13', 
    'Derived Feature 14', 'Derived Feature 15', 'Derived Feature 16', 'Derived Feature 17', 'Derived Feature 18', 'Derived Feature 19', 'Derived Feature 20', 
    'Derived Feature 21', 'Derived Feature 22', 'Derived Feature 23', 'Derived Feature 24', 'Derived Feature 25', 'Derived Feature 26', 'Derived Feature 27', 
    'Derived Feature 28', 'Derived Feature 29', 'CC1', 'CC2', 'CC3', 'CC4', 'CC5', 'Base time', 'Post length', 'Post Share Count', 'Post Promotion Status',
    'H Local', 'Post Published Sunday', 'Post Published Monday', 'Post Published Tuesday',  'Post Published Wednesday', 'Post Published Thursday', 
    'Post Published Friday', 'Post Published Saturday', 'Base DateTime Sunday', 'Base DateTime Monday', 'Base DateTime Tuesday','Base DateTime Wednesday', 
    'Base DateTime Thursday', 'Base DateTime Friday', 'Base DateTime Saturday', 'Target Variable' ]

data = pd.read_csv('Features_Variant_1.csv', sep=',', header=None, names=columns)

In [7]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")
    print(f"R² Score: {r2:.4f}")

We firstly try without any preprocessing in order to see that it can be useful or not

In [8]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("KNN")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("Decision Tree")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Linear Regression")
test(lr, X_test, y_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
print("Random Forest")
test(rf, X_test, y_test)


KNN
Root mean Squared Error: 27.8425
Mean absolute Error: 6.3345
Mean Squared Error: 775.20
Median absolute Error: 0.8000
R² Score: 0.2366
Decision Tree
Root mean Squared Error: 30.8626
Mean absolute Error: 5.5978
Mean Squared Error: 952.50
Median absolute Error: 1.0000
R² Score: 0.0620
Linear Regression
Root mean Squared Error: 25.8992
Mean absolute Error: 8.2284
Mean Squared Error: 670.77
Median absolute Error: 4.3224
R² Score: 0.3395
Random Forest
Root mean Squared Error: 19.3486
Mean absolute Error: 4.0103
Mean Squared Error: 374.37
Median absolute Error: 0.5400
R² Score: 0.6313


# Autoencoder

 ### Why Use a Deep Autoencoder?

The purpose of using a deep autoencoder in this project is to perform automatic feature compression before feeding the data into regression models.

**Why Autoencoders?**
- High-dimensional feature sets may contain noise and redundant information.
- Autoencoders can learn non-linear relationships and compress the data into a lower-dimensional representation.
- This can reduce overfitting and improve generalization, especially for distance-based models like KNN.

**What we do here:**
- We experiment with various encoder output dimensions: 8, 16, and 32.
- We try different layer sizes (64 and 128) to understand which architecture gives the best downstream performance.


Dim 8 Size 64

In [9]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 8 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163us/step
<----KNN after autoencoder---->
Root mean Squared Error: 27.3021
Mean absolute Error: 6.1458
Mean Squared Error: 745.40
Median absolute Error: 0.6000
R² Score: 0.2660
<----Decision tree after autoencoder---->
Root mean Squared Error: 40.1229
Mean absolute Error: 8.4191
Mean Squared Error: 1609.85
Median absolute Error: 1.0000
R² Score: -0.5853
<----Linear Regression after autoencoder---->
Root mean Squared Error: 28.8518
Mean absolute Error: 11.0325
Mean Squared Error: 832.43
Median absolute Error: 7.0630
R² Score: 0.1803
<----Random Forest after autoencoder---->
Root mean Squared Error: 26.6721
Mean absolute Error: 6.5213
Mean Squared Error: 711.40
Median absolute Error: 0.8800
R² Score: 0.2995


Dim 16 Size 64

In [10]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 16 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166us/step
<----KNN after autoencoder---->
Root mean Squared Error: 26.4927
Mean absolute Error: 5.8924
Mean Squared Error: 701.86
Median absolute Error: 0.6000
R² Score: 0.3088
<----Decision tree after autoencoder---->
Root mean Squared Error: 41.9934
Mean absolute Error: 8.5478
Mean Squared Error: 1763.44
Median absolute Error: 1.0000
R² Score: -0.7365
<----Linear Regression after autoencoder---->
Root mean Squared Error: 26.2687
Mean absolute Error: 9.6763
Mean Squared Error: 690.04
Median absolute Error: 6.3191
R² Score: 0.3205
<----Random Forest after autoencoder---->
Root mean Squared Error: 25.8123
Mean absolute Error: 6.1131
Mean Squared Error: 666.27
Median absolute Error: 0.8300
R² Score: 0.3439


Dim 32 Size 64

In [11]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 32 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step
<----KNN after autoencoder---->
Root mean Squared Error: 26.0204
Mean absolute Error: 5.9155
Mean Squared Error: 677.06
Median absolute Error: 0.6000
R² Score: 0.3333
<----Decision tree after autoencoder---->
Root mean Squared Error: 43.1192
Mean absolute Error: 8.1569
Mean Squared Error: 1859.27
Median absolute Error: 1.0000
R² Score: -0.8309
<----Linear Regression after autoencoder---->
Root mean Squared Error: 25.9471
Mean absolute Error: 8.5122
Mean Squared Error: 673.25
Median absolute Error: 4.8431
R² Score: 0.3370
<----Random Forest after autoencoder---->
Root mean Squared Error: 25.1757
Mean absolute Error: 5.9525
Mean Squared Error: 633.82
Median absolute Error: 0.7800
R² Score: 0.3759


Dim 8 Size 128

In [12]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 8 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167us/step
<----KNN after autoencoder---->
Root mean Squared Error: 28.0003
Mean absolute Error: 6.2701
Mean Squared Error: 784.02
Median absolute Error: 0.6000
R² Score: 0.2279
<----Decision tree after autoencoder---->
Root mean Squared Error: 37.4713
Mean absolute Error: 7.8851
Mean Squared Error: 1404.10
Median absolute Error: 1.0000
R² Score: -0.3827
<----Linear Regression after autoencoder---->
Root mean Squared Error: 28.2485
Mean absolute Error: 9.6045
Mean Squared Error: 797.98
Median absolute Error: 4.6434
R² Score: 0.2142
<----Random Forest after autoencoder---->
Root mean Squared Error: 27.2697
Mean absolute Error: 6.4656
Mean Squared Error: 743.64
Median absolute Error: 0.8300
R² Score: 0.2677


Dim 16 Size 128

In [13]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 16 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165us/step
<----KNN after autoencoder---->
Root mean Squared Error: 27.1258
Mean absolute Error: 5.9540
Mean Squared Error: 735.81
Median absolute Error: 0.6000
R² Score: 0.2754
<----Decision tree after autoencoder---->
Root mean Squared Error: 37.8544
Mean absolute Error: 7.5707
Mean Squared Error: 1432.95
Median absolute Error: 1.0000
R² Score: -0.4111
<----Linear Regression after autoencoder---->
Root mean Squared Error: 26.3381
Mean absolute Error: 8.8536
Mean Squared Error: 693.70
Median absolute Error: 5.1244
R² Score: 0.3169
<----Random Forest after autoencoder---->
Root mean Squared Error: 25.3281
Mean absolute Error: 5.9628
Mean Squared Error: 641.51
Median absolute Error: 0.7500
R² Score: 0.3683


Dim 32 Size 128

In [14]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 32 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164us/step
<----KNN after autoencoder---->
Root mean Squared Error: 26.4577
Mean absolute Error: 5.9090
Mean Squared Error: 700.01
Median absolute Error: 0.6000
R² Score: 0.3107
<----Decision tree after autoencoder---->
Root mean Squared Error: 42.5061
Mean absolute Error: 8.5607
Mean Squared Error: 1806.77
Median absolute Error: 1.0000
R² Score: -0.7792
<----Linear Regression after autoencoder---->
Root mean Squared Error: 25.9111
Mean absolute Error: 8.5182
Mean Squared Error: 671.39
Median absolute Error: 4.7773
R² Score: 0.3389
<----Random Forest after autoencoder---->
Root mean Squared Error: 25.2683
Mean absolute Error: 6.1596
Mean Squared Error: 638.49
Median absolute Error: 0.8300
R² Score: 0.3712


### Conclusion: What Did We Learn from Using Autoencoders?

**Best Configuration:**
- Encoder dimension = **32**
- Hidden layer size = **64**

**Why?**
- This setting retained enough information to represent the features well without over-compressing.
- It led to the best performance improvement in **K-Nearest Neighbors (KNN)**.

**Impact on Models:**
- **KNN improved significantly**, because reducing dimensionality helped mitigate the "curse of dimensionality" which affects distance-based models.
- **Other models (Decision Tree, Linear Regression, Random Forest)** showed little to no improvement, and in some cases performed slightly worse.
  - This suggests those models are either more robust to feature space size or already benefit from the full feature information.

**Takeaway:**
Using a deep autoencoder can be a valuable preprocessing step, especially when:
- The input space is high-dimensional
- We're using models sensitive to feature space structure (like KNN)


# PCA
When data is linearly separable, simple models like linear regression, logistic regression, or linear SVMs can more easily separate the classes and predict values correctly.
To try and improve the linearity of our data, previously we applied autoencoder and decoder to reduce the dimensionality of our dataset. Now, we will apply PCA.

#### Why PCA?
As seen in class, PCA:

- Reduces dimensionality while preserving variance in the data
- Eliminates multicollinearity between highly correlated features. This could help because the derived features of our dataset are highly correlated and could assist in reducing redundancy.
- Reduces noise that may be present in less relevant features. We think that this point could help with the outliers contained in the data.
- Improves computational efficiency

In [15]:
def PCA_Transform(X_train_, X_test_):    
    # 2. Standardize features FIRST
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_)
    X_test_scaled = scaler.transform(X_test_)
    
    # 3. THEN apply PCA to scaled data
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Reduced dimensions from {X_train_.shape[1]} to {X_train_pca.shape[1]} features")
    
    return (X_train_pca, X_test_pca)

In [16]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = PCA_Transform(X_train, X_test)

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("<----KNN---->")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("<----Decision Tree---->")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("<----Linear Regression---->")
test(lr, X_test, y_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
print("<----Random Forest---->")
test(rf, X_test, y_test)


Reduced dimensions from 53 to 25 features
<----KNN---->
Root mean Squared Error: 26.5127
Mean absolute Error: 5.8084
Mean Squared Error: 702.92
Median absolute Error: 0.6000
R² Score: 0.3078
<----Decision Tree---->
Root mean Squared Error: 37.8055
Mean absolute Error: 7.4654
Mean Squared Error: 1429.26
Median absolute Error: 1.0000
R² Score: -0.4075
<----Linear Regression---->
Root mean Squared Error: 26.0287
Mean absolute Error: 8.6338
Mean Squared Error: 677.50
Median absolute Error: 4.9093
R² Score: 0.3328
<----Random Forest---->
Root mean Squared Error: 24.0771
Mean absolute Error: 5.5520
Mean Squared Error: 579.71
Median absolute Error: 0.6600
R² Score: 0.4291


### Conclusions: What did we learned from using PCA? 
Our results demonstrate mixed outcomes across models when applying PCA:

- KNN showed clear improvement (RMSE: 27.84 → 26.51, R²: 0.24 → 0.31) as expected for distance-based algorithms. Especially since in class we saw the curse of dimensionality in which KNN is highly sensitive to high dimensional features.
- Linear regression remained relatively stable with minimal change. We expected the model to improve slightly because it is a linear model.
- Tree-based models (Decision Tree and Random Forest) performed worse after PCA, likely because they already handle feature selection implicitly (and could improve by limiting size of the tree) and we are reducing the amount of information. Also, decision trees are more robust against outliers.