In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error

# Regression Models
from sklearn.neighbors import KNeighborsRegressor
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [11]:
data = pd.read_pickle('DataWithFeatureEngineering.pkl')

In [12]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")
    print(f"R² Score: {r2:.4f}")

In Data Preproccesing step, we found that Auto Encoder is useful for KNN model. Thus, we decided to use Auto Encoder

In [13]:
# 1. Separate features and target
X = data.drop(columns=["Target_Comment_Volume"])
y = data["Target_Comment_Volume"]

# 2. Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Split data BEFORE fitting the autoencoder
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# 4. Define autoencoder architecture
input_dim = X_train_scaled.shape[1]
encoding_dim = 32  # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')

# 5. Train the autoencoder ONLY on the training set
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# 6. Encode features using the trained encoder
X_train_enc = encoder.predict(X_train_scaled)
X_test_enc = encoder.predict(X_test_scaled)


[1m896/896[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192us/step
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step


We aim to evaluate how different K-Nearest Neighbors (KNN) configurations affect regression performance.
Specifically, we will experiment with:

- Different values for n_neighbors
- Different distance metrics such as 'euclidean', 'manhattan', and 'minkowski'
This helps identify which combination gives the best predictive results for this dataset.

**Why we Used These Parameters**

- n_neighbors determines how many neighbors the model considers when making predictions. Testing small and larger values helps balance between bias and variance.
- Distance metrics define how similarity between points is measured. Some metrics work better with high-dimensional data or specific feature distributions.

**Configurations we Tested**

- n_neighbors: 3, 5, 10, and √(n_samples)
- metric: 'euclidean', 'manhattan'
- weights: 'uniform', 'distance'

We kept all other parameters constant while changing one at a time to isolate its effect.

In [14]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.0169
Mean absolute Error: 2.9346
Mean Squared Error: 81.30
Median absolute Error: 0.6667
R² Score: 0.9159


In [15]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.0979
Mean absolute Error: 2.9359
Mean Squared Error: 82.77
Median absolute Error: 0.6000
R² Score: 0.9144


In [16]:
knn = KNeighborsRegressor(
    n_neighbors=10,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.2938
Mean absolute Error: 2.9949
Mean Squared Error: 86.37
Median absolute Error: 0.6000
R² Score: 0.9106


In [17]:
knn = KNeighborsRegressor(
    n_neighbors=int(np.sqrt(len(X_train_enc))),
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 18.1191
Mean absolute Error: 4.4382
Mean Squared Error: 328.30
Median absolute Error: 0.8047
R² Score: 0.6604


In [18]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.2867
Mean absolute Error: 3.0063
Mean Squared Error: 86.24
Median absolute Error: 0.6667
R² Score: 0.9108


In [19]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.6658
Mean absolute Error: 3.0176
Mean Squared Error: 93.43
Median absolute Error: 0.6000
R² Score: 0.9033


In [20]:
knn = KNeighborsRegressor(
    n_neighbors=10,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 10.2003
Mean absolute Error: 3.0852
Mean Squared Error: 104.05
Median absolute Error: 0.6000
R² Score: 0.8924


In [21]:
knn = KNeighborsRegressor(
    n_neighbors=int(np.sqrt(len(X_train_enc))),
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 18.9504
Mean absolute Error: 4.5265
Mean Squared Error: 359.12
Median absolute Error: 0.8107
R² Score: 0.6285


In [22]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='manhattan',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 9.0995
Mean absolute Error: 2.9050
Mean Squared Error: 82.80
Median absolute Error: 0.5911
R² Score: 0.9143


In [23]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='euclidean',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.6851
Mean absolute Error: 2.8457
Mean Squared Error: 75.43
Median absolute Error: 0.6312
R² Score: 0.9220


BEST MODEL

In [24]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='euclidean',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.6234
Mean absolute Error: 2.8258
Mean Squared Error: 74.36
Median absolute Error: 0.5930
R² Score: 0.9231


**Conclusion**

The best KNN model uses:

- 5 nearest neighbors
- Euclidean distance
- Distance-based weighting

This configuration outperformed other combinations by giving more influence to closer neighbors, which helped the model better capture local variations in the data. As a result, it produced the most accurate predictions in our evaluation.