In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [9]:
columns = ['Page Popularity/likes', 'Page Checkins', 'Page talking about', 'Page Category', 'Derived Feature 5', 'Derived Feature 6', 'Derived Feature 7', 
    'Derived Feature 8', 'Derived Feature 9', 'Derived Feature 10', 'Derived Feature 11', 'Derived Feature 12', 'Derived Feature 13', 
    'Derived Feature 14', 'Derived Feature 15', 'Derived Feature 16', 'Derived Feature 17', 'Derived Feature 18', 'Derived Feature 19', 'Derived Feature 20', 
    'Derived Feature 21', 'Derived Feature 22', 'Derived Feature 23', 'Derived Feature 24', 'Derived Feature 25', 'Derived Feature 26', 'Derived Feature 27', 
    'Derived Feature 28', 'Derived Feature 29', 'CC1', 'CC2', 'CC3', 'CC4', 'CC5', 'Base time', 'Post length', 'Post Share Count', 'Post Promotion Status',
    'H Local', 'Post Published Sunday', 'Post Published Monday', 'Post Published Tuesday',  'Post Published Wednesday', 'Post Published Thursday', 
    'Post Published Friday', 'Post Published Saturday', 'Base DateTime Sunday', 'Base DateTime Monday', 'Base DateTime Tuesday','Base DateTime Wednesday', 
    'Base DateTime Thursday', 'Base DateTime Friday', 'Base DateTime Saturday', 'Target Variable' ]

data = pd.read_csv('./Dataset/Training/Features_Variant_1.csv', sep=',', header=None, names=columns)

In [10]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")
    print(f"R² Score: {r2:.4f}")

We firstly try without any preprocessing in order to see that it can be useful or not

In [11]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("KNN")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("Decision Tree")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Linear Regression")
test(lr, X_test, y_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
print("Random Forest")
test(rf, X_test, y_test)


KNN
Root mean Squared Error: 27.8424
Mean absolute Error: 6.3344
Mean Squared Error: 775.20
Median absolute Error: 0.8000
R² Score: 0.2366
Decision Tree
Root mean Squared Error: 30.8626
Mean absolute Error: 5.5978
Mean Squared Error: 952.50
Median absolute Error: 1.0000
R² Score: 0.0620
Linear Regression
Root mean Squared Error: 25.8992
Mean absolute Error: 8.2284
Mean Squared Error: 670.77
Median absolute Error: 4.3224
R² Score: 0.3395
Random Forest
Root mean Squared Error: 19.3486
Mean absolute Error: 4.0103
Mean Squared Error: 374.37
Median absolute Error: 0.5400
R² Score: 0.6313


# Autoencoder

 ### Why Use a Deep Autoencoder?

The purpose of using a deep autoencoder in this project is to perform automatic feature compression before feeding the data into regression models.

**Why Autoencoders?**
- High-dimensional feature sets may contain noise and redundant information.
- Autoencoders can learn non-linear relationships and compress the data into a lower-dimensional representation.
- This can reduce overfitting and improve generalization, especially for distance-based models like KNN.

**What we do here:**
- We experiment with various encoder output dimensions: 8, 16, and 32.
- We try different layer sizes (64 and 128) to understand which architecture gives the best downstream performance.


Dim 8 Size 64

In [20]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 8 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165us/step
<----KNN after autoencoder---->
Root mean Squared Error: 22.7580
Mean absolute Error: 5.0522
Mean Squared Error: 517.93
Median absolute Error: 0.6000
R² Score: 0.4642
<----Decision tree after autoencoder---->
Root mean Squared Error: 31.0678
Mean absolute Error: 6.6559
Mean Squared Error: 965.21
Median absolute Error: 1.0000
R² Score: 0.0014
<----Linear Regression after autoencoder---->
Root mean Squared Error: 24.2972
Mean absolute Error: 12.1064
Mean Squared Error: 590.36
Median absolute Error: 9.0747
R² Score: 0.3893
<----Random Forest after autoencoder---->
Root mean Squared Error: 20.8889
Mean absolute Error: 4.9870
Mean Squared Error: 436.35
Median absolute Error: 0.7200
R² Score: 0.5486


Dim 16 Size 64

In [21]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 16 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166us/step
<----KNN after autoencoder---->
Root mean Squared Error: 21.3925
Mean absolute Error: 4.7361
Mean Squared Error: 457.64
Median absolute Error: 0.6000
R² Score: 0.5266
<----Decision tree after autoencoder---->
Root mean Squared Error: 36.0676
Mean absolute Error: 6.9967
Mean Squared Error: 1300.87
Median absolute Error: 1.0000
R² Score: -0.3458
<----Linear Regression after autoencoder---->
Root mean Squared Error: 21.9840
Mean absolute Error: 9.0810
Mean Squared Error: 483.30
Median absolute Error: 5.7983
R² Score: 0.5000
<----Random Forest after autoencoder---->
Root mean Squared Error: 21.0502
Mean absolute Error: 4.9027
Mean Squared Error: 443.11
Median absolute Error: 0.7100
R² Score: 0.5416


Dim 32 Size 64

In [22]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 32 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169us/step
<----KNN after autoencoder---->
Root mean Squared Error: 22.1247
Mean absolute Error: 4.7540
Mean Squared Error: 489.50
Median absolute Error: 0.6000
R² Score: 0.4936
<----Decision tree after autoencoder---->
Root mean Squared Error: 32.4660
Mean absolute Error: 6.1632
Mean Squared Error: 1054.04
Median absolute Error: 1.0000
R² Score: -0.0904
<----Linear Regression after autoencoder---->
Root mean Squared Error: 21.1119
Mean absolute Error: 5.3770
Mean Squared Error: 445.71
Median absolute Error: 1.9281
R² Score: 0.5389
<----Random Forest after autoencoder---->
Root mean Squared Error: 21.0156
Mean absolute Error: 4.7940
Mean Squared Error: 441.65
Median absolute Error: 0.6900
R² Score: 0.5431


Dim 8 Size 128

In [23]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 8 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164us/step
<----KNN after autoencoder---->
Root mean Squared Error: 22.2540
Mean absolute Error: 4.9397
Mean Squared Error: 495.24
Median absolute Error: 0.6000
R² Score: 0.4877
<----Decision tree after autoencoder---->
Root mean Squared Error: 34.2901
Mean absolute Error: 6.8371
Mean Squared Error: 1175.81
Median absolute Error: 1.0000
R² Score: -0.2164
<----Linear Regression after autoencoder---->
Root mean Squared Error: 22.8160
Mean absolute Error: 11.2708
Mean Squared Error: 520.57
Median absolute Error: 8.6747
R² Score: 0.4614
<----Random Forest after autoencoder---->
Root mean Squared Error: 21.3678
Mean absolute Error: 4.9097
Mean Squared Error: 456.58
Median absolute Error: 0.7400
R² Score: 0.5276


Dim 16 Size 128

In [24]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 16 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step
<----KNN after autoencoder---->
Root mean Squared Error: 21.8257
Mean absolute Error: 4.7603
Mean Squared Error: 476.36
Median absolute Error: 0.6000
R² Score: 0.5072
<----Decision tree after autoencoder---->
Root mean Squared Error: 33.0705
Mean absolute Error: 6.7394
Mean Squared Error: 1093.65
Median absolute Error: 1.0000
R² Score: -0.1314
<----Linear Regression after autoencoder---->
Root mean Squared Error: 22.5850
Mean absolute Error: 10.6199
Mean Squared Error: 510.08
Median absolute Error: 6.9628
R² Score: 0.4723
<----Random Forest after autoencoder---->
Root mean Squared Error: 21.0097
Mean absolute Error: 4.8621
Mean Squared Error: 441.41
Median absolute Error: 0.7300
R² Score: 0.5433


Dim 32 Size 128

In [25]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

# Standardize features before autoencoding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define autoencoder architecture
input_dim = X_scaled.shape[1]
encoding_dim = 32 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


# KNN
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train_enc)
print("<----KNN after autoencoder---->")
test(knn, X_test_enc, y_test_enc)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_enc, y_train_enc)
print("<----Decision tree after autoencoder---->")
test(dt, X_test_enc, y_test_enc)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)
print("<----Linear Regression after autoencoder---->")
test(lr, X_test_enc, y_test_enc)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_enc, y_train_enc)
print("<----Random Forest after autoencoder---->")
test(rf, X_test_enc, y_test_enc)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179us/step
<----KNN after autoencoder---->
Root mean Squared Error: 21.5372
Mean absolute Error: 4.7119
Mean Squared Error: 463.85
Median absolute Error: 0.6000
R² Score: 0.5201
<----Decision tree after autoencoder---->
Root mean Squared Error: 30.7675
Mean absolute Error: 6.2853
Mean Squared Error: 946.64
Median absolute Error: 1.0000
R² Score: 0.0207
<----Linear Regression after autoencoder---->
Root mean Squared Error: 21.0432
Mean absolute Error: 5.6267
Mean Squared Error: 442.82
Median absolute Error: 2.0977
R² Score: 0.5419
<----Random Forest after autoencoder---->
Root mean Squared Error: 21.0101
Mean absolute Error: 4.7327
Mean Squared Error: 441.43
Median absolute Error: 0.6800
R² Score: 0.5433


### Conclusion: What Did We Learn from Using Autoencoders?

**Best Configuration:**
- Encoder dimension = **32**
- Hidden layer size = **64**

**Why?**
- This setting retained enough information to represent the features well without over-compressing.
- It led to the best performance improvement in **K-Nearest Neighbors (KNN)**.

**Impact on Models:**
- **KNN improved significantly**, because reducing dimensionality helped mitigate the "curse of dimensionality" which affects distance-based models.
- **Other models (Decision Tree, Linear Regression, Random Forest)** showed little to no improvement, and in some cases performed slightly worse.
  - This suggests those models are either more robust to feature space size or already benefit from the full feature information.

**Takeaway:**
Using a deep autoencoder can be a valuable preprocessing step, especially when:
- The input space is high-dimensional
- We're using models sensitive to feature space structure (like KNN)


# PCA
When data is linearly separable, simple models like linear regression, logistic regression, or linear SVMs can more easily separate the classes and predict values correctly.
To try and improve the linearity of our data, previously we applied autoencoder and decoder to reduce the dimensionality of our dataset. Now, we will apply PCA.

#### Why PCA?
As seen in class, PCA:

- Reduces dimensionality while preserving variance in the data
- Eliminates multicollinearity between highly correlated features. This could help because the derived features of our dataset are highly correlated and could assist in reducing redundancy.
- Reduces noise that may be present in less relevant features. We think that this point could help with the outliers contained in the data.
- Improves computational efficiency

In [17]:
def PCA_Transform(X_train_, X_test_):    
    # 2. Standardize features FIRST
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_)
    X_test_scaled = scaler.transform(X_test_)
    
    # 3. THEN apply PCA to scaled data
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Reduced dimensions from {X_train_.shape[1]} to {X_train_pca.shape[1]} features")
    
    return (X_train_pca, X_test_pca)

In [19]:
X = data.drop(columns=["Target Variable"])
y = data["Target Variable"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = PCA_Transform(X_train, X_test)

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("<----KNN---->")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("<----Decision Tree---->")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("<----Linear Regression---->")
test(lr, X_test, y_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
print("<----Random Forest---->")
test(rf, X_test, y_test)


Reduced dimensions from 53 to 25 features
<----KNN---->
Root mean Squared Error: 26.5127
Mean absolute Error: 5.8084
Mean Squared Error: 702.92
Median absolute Error: 0.6000
R² Score: 0.3078
<----Decision Tree---->
Root mean Squared Error: 37.8055
Mean absolute Error: 7.4654
Mean Squared Error: 1429.26
Median absolute Error: 1.0000
R² Score: -0.4075
<----Linear Regression---->
Root mean Squared Error: 26.0287
Mean absolute Error: 8.6338
Mean Squared Error: 677.50
Median absolute Error: 4.9093
R² Score: 0.3328
<----Random Forest---->
Root mean Squared Error: 24.0586
Mean absolute Error: 5.5502
Mean Squared Error: 578.82
Median absolute Error: 0.6600
R² Score: 0.4300


### Conclusions: What did we learned from using PCA? 
Our results demonstrate mixed outcomes across models when applying PCA:

- KNN showed clear improvement (RMSE: 27.84 → 26.51, R²: 0.24 → 0.31) as expected for distance-based algorithms. Especially since in class we saw the curse of dimensionality in which KNN is highly sensitive to high dimensional features.
- Linear regression remained relatively stable with minimal change. We expected the model to improve slightly because it is a linear model.
- Tree-based models (Decision Tree and Random Forest) performed worse after PCA, likely because they already handle feature selection implicitly (and could improve by limiting size of the tree) and we are reducing the amount of information. Also, decision trees are more robust against outliers.