In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [41]:
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

import tensorflow as tf
import random

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

In [42]:
train_ = pd.read_csv('train_df.csv')
test_ = pd.read_csv('test_df.csv')

In [43]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")

We firstly try without any preprocessing in order to see that it can be useful or not

In [44]:
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("KNN")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("Decision Tree")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Linear Regression")
test(lr, X_test, y_test)


KNN
Root mean Squared Error: 28.5271
Mean absolute Error: 6.2295
Mean Squared Error: 813.79
Median absolute Error: 0.6000
Decision Tree
Root mean Squared Error: 29.7779
Mean absolute Error: 5.5937
Mean Squared Error: 886.72
Median absolute Error: 1.0000
Linear Regression
Root mean Squared Error: 20.4863
Mean absolute Error: 5.9233
Mean Squared Error: 419.69
Median absolute Error: 2.4599


# Autoencoder

 ### Why Use a Deep Autoencoder?

The purpose of using a deep autoencoder in this project is to perform automatic feature compression before feeding the data into regression models.

**Why Autoencoders?**
- High-dimensional feature sets may contain noise and redundant information.
- Autoencoders can learn non-linear relationships and compress the data into a lower-dimensional representation.
- This can reduce overfitting and improve generalization, especially for distance-based models like KNN.

**What we do here:**
- We experiment with various encoder output dimensions: 8, 16, and 32.
- We try different layer sizes (64 and 128) to understand which architecture gives the best downstream performance.


Dim 8 Size 64

In [45]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 8

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182us/step
<----KNN after autoencoder---->
Root mean Squared Error: 23.5009
Mean absolute Error: 5.2205
Mean Squared Error: 552.29
Median absolute Error: 0.6000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 32.3786
Mean absolute Error: 6.4632
Mean Squared Error: 1048.38
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 29.8426
Mean absolute Error: 11.0653
Mean Squared Error: 890.58
Median absolute Error: 6.4993


Dim 16 Size 64

In [46]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 16

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178us/step
<----KNN after autoencoder---->
Root mean Squared Error: 22.4430
Mean absolute Error: 5.0423
Mean Squared Error: 503.69
Median absolute Error: 0.6000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 33.8022
Mean absolute Error: 6.9413
Mean Squared Error: 1142.59
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 29.4204
Mean absolute Error: 11.3354
Mean Squared Error: 865.56
Median absolute Error: 6.9671


Dim 32 Size 64

In [47]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194us/step
<----KNN after autoencoder---->
Root mean Squared Error: 22.7349
Mean absolute Error: 5.2109
Mean Squared Error: 516.88
Median absolute Error: 0.8000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 32.1299
Mean absolute Error: 6.9558
Mean Squared Error: 1032.33
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 26.4451
Mean absolute Error: 10.8286
Mean Squared Error: 699.34
Median absolute Error: 5.9003


Dim 8 Size 128

In [48]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 8

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194us/step
<----KNN after autoencoder---->
Root mean Squared Error: 23.3649
Mean absolute Error: 5.2004
Mean Squared Error: 545.92
Median absolute Error: 0.8000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 36.8169
Mean absolute Error: 7.5092
Mean Squared Error: 1355.49
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 29.1636
Mean absolute Error: 11.6546
Mean Squared Error: 850.51
Median absolute Error: 7.4322


Dim 16 Size 128

In [49]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 16

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195us/step
<----KNN after autoencoder---->
Root mean Squared Error: 21.4998
Mean absolute Error: 4.7074
Mean Squared Error: 462.24
Median absolute Error: 0.6000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 30.3272
Mean absolute Error: 6.7413
Mean Squared Error: 919.74
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 29.2884
Mean absolute Error: 11.8357
Mean Squared Error: 857.81
Median absolute Error: 6.8572


Dim 32 Size 128

In [50]:
# Split features and target
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train and evaluate models
print("<----KNN after autoencoder---->")
knn = KNeighborsRegressor()
knn.fit(X_train_encoded, y_train)
test(knn, X_test_encoded, y_test)

print("<----Decision Tree after autoencoder---->")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_encoded, y_train)
test(dt, X_test_encoded, y_test)

print("<----Linear Regression after autoencoder---->")
lr = LinearRegression()
lr.fit(X_train_encoded, y_train)
test(lr, X_test_encoded, y_test)


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201us/step
<----KNN after autoencoder---->
Root mean Squared Error: 21.9116
Mean absolute Error: 4.7749
Mean Squared Error: 480.12
Median absolute Error: 0.6000
<----Decision Tree after autoencoder---->
Root mean Squared Error: 31.6728
Mean absolute Error: 6.8829
Mean Squared Error: 1003.17
Median absolute Error: 1.0000
<----Linear Regression after autoencoder---->
Root mean Squared Error: 27.4529
Mean absolute Error: 11.2657
Mean Squared Error: 753.66
Median absolute Error: 5.9439


### Conclusion: What Did We Learn from Using Autoencoders?

**Best Configuration:**
- Encoder dimension = **16**
- Hidden layer size = **128**

**Why?**
- This setting retained enough information to represent the features well without over-compressing.
- It led to the best performance improvement in **K-Nearest Neighbors (KNN)**.

**Impact on Models:**
- **KNN improved significantly**, because reducing dimensionality helped mitigate the "curse of dimensionality" which affects distance-based models.
- **Other models (Decision Tree, Linear Regression)** showed little to no improvement, and in some cases performed slightly worse.
  - This suggests those models are either more robust to feature space size or already benefit from the full feature information.

**Takeaway:**
Using a deep autoencoder can be a valuable preprocessing step, especially when:
- The input space is high-dimensional
- We're using models sensitive to feature space structure (like KNN)


# PCA
When data is linearly separable, simple models like linear regression, logistic regression, or linear SVMs can more easily separate the classes and predict values correctly.
To try and improve the linearity of our data, previously we applied autoencoder and decoder to reduce the dimensionality of our dataset. Now, we will apply PCA.

#### Why PCA?
As seen in class, PCA:

- Reduces dimensionality while preserving variance in the data
- Eliminates multicollinearity between highly correlated features. This could help because the derived features of our dataset are highly correlated and could assist in reducing redundancy.
- Reduces noise that may be present in less relevant features. We think that this point could help with the outliers contained in the data.
- Improves computational efficiency

In [51]:
def PCA_Transform(X_train_, X_test_):    
    # 2. Standardize features FIRST
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_)
    X_test_scaled = scaler.transform(X_test_)
    
    # 3. THEN apply PCA to scaled data
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Reduced dimensions from {X_train_.shape[1]} to {X_train_pca.shape[1]} features")
    
    return (X_train_pca, X_test_pca)

In [52]:
X_train = train_.drop(columns=["Target_Comment_Volume"])
y_train = train_["Target_Comment_Volume"]
X_test = test_.drop(columns=["Target_Comment_Volume"])
y_test = test_["Target_Comment_Volume"]

X_train, X_test = PCA_Transform(X_train, X_test)

# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print("<----KNN---->")
test(knn, X_test, y_test)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
print("<----Decision Tree---->")
test(dt, X_test, y_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
print("<----Linear Regression---->")
test(lr, X_test, y_test)


Reduced dimensions from 130 to 93 features
<----KNN---->
Root mean Squared Error: 21.0321
Mean absolute Error: 4.7462
Mean Squared Error: 442.35
Median absolute Error: 0.6000
<----Decision Tree---->
Root mean Squared Error: 35.2508
Mean absolute Error: 6.6546
Mean Squared Error: 1242.62
Median absolute Error: 1.0000
<----Linear Regression---->
Root mean Squared Error: 20.7841
Mean absolute Error: 5.6914
Mean Squared Error: 431.98
Median absolute Error: 2.2870


### Conclusions: What did we learned from using PCA? 
Our results demonstrate mixed outcomes across models when applying PCA:

- KNN showed clear improvement (RMSE: 27.84 → 26.51, R²: 0.24 → 0.31) as expected for distance-based algorithms. Especially since in class we saw the curse of dimensionality in which KNN is highly sensitive to high dimensional features.
- Linear regression remained relatively stable with minimal change. We expected the model to improve slightly because it is a linear model.
- Decision Tree performed worse after PCA, likely because they already handle feature selection implicitly (and could improve by limiting size of the tree) and we are reducing the amount of information. Also, decision trees are more robust against outliers.