In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error

# Regression Models
from sklearn.neighbors import KNeighborsRegressor
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [2]:
data = pd.read_pickle('optional_features.pkl')

In [3]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")
    print(f"R² Score: {r2:.4f}")

In Data Preproccesing step, we found that Auto Encoder is useful for KNN model. Thus, we decided to use Auto Encoder

In [4]:
# 1. Separate features and target
X = data.drop(columns=["Target_Comment_Volume"])
y = data["Target_Comment_Volume"]

# 2. Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Split data BEFORE fitting the autoencoder
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# 4. Define autoencoder architecture
input_dim = X_train_scaled.shape[1]
encoding_dim = 32  # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')

# 5. Train the autoencoder ONLY on the training set
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# 6. Encode features using the trained encoder
X_train_enc = encoder.predict(X_train_scaled)
X_test_enc = encoder.predict(X_test_scaled)


[1m4354/4354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 159us/step
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157us/step


We aim to evaluate how different K-Nearest Neighbors (KNN) configurations affect regression performance.
Specifically, we will experiment with:

- Different values for n_neighbors
- Different distance metrics such as 'euclidean', 'manhattan', and 'minkowski'
This helps identify which combination gives the best predictive results for this dataset.

**Why we Used These Parameters**

- n_neighbors determines how many neighbors the model considers when making predictions. Testing small and larger values helps balance between bias and variance.
- Distance metrics define how similarity between points is measured. Some metrics work better with high-dimensional data or specific feature distributions.

**Configurations we Tested**

- n_neighbors: 3, 5, 10, and √(n_samples)
- metric: 'euclidean', 'manhattan'
- weights: 'uniform', 'distance'

We kept all other parameters constant while changing one at a time to isolate its effect.

In [5]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.0753
Mean absolute Error: 2.2208
Mean Squared Error: 65.21
Median absolute Error: 0.3333
R² Score: 0.9364


In [6]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.3438
Mean absolute Error: 2.2654
Mean Squared Error: 69.62
Median absolute Error: 0.4000
R² Score: 0.9322


In [7]:
knn = KNeighborsRegressor(
    n_neighbors=10,
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.4748
Mean absolute Error: 2.3890
Mean Squared Error: 71.82
Median absolute Error: 0.5000
R² Score: 0.9300


In [8]:
knn = KNeighborsRegressor(
    n_neighbors=int(np.sqrt(len(X_train_enc))),
    metric='euclidean'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 16.0981
Mean absolute Error: 3.8089
Mean Squared Error: 259.15
Median absolute Error: 0.6863
R² Score: 0.7474


In [9]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.1775
Mean absolute Error: 2.2512
Mean Squared Error: 66.87
Median absolute Error: 0.3333
R² Score: 0.9348


In [10]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.4717
Mean absolute Error: 2.3237
Mean Squared Error: 71.77
Median absolute Error: 0.4000
R² Score: 0.9301


In [11]:
knn = KNeighborsRegressor(
    n_neighbors=10,
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 8.9305
Mean absolute Error: 2.4694
Mean Squared Error: 79.75
Median absolute Error: 0.5000
R² Score: 0.9223


In [12]:
knn = KNeighborsRegressor(
    n_neighbors=int(np.sqrt(len(X_train_enc))),
    metric='manhattan'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 16.9582
Mean absolute Error: 3.9056
Mean Squared Error: 287.58
Median absolute Error: 0.6649
R² Score: 0.7197


BEST MODELS

In [13]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='manhattan',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 7.8324
Mean absolute Error: 2.0931
Mean Squared Error: 61.35
Median absolute Error: 0.4038
R² Score: 0.9402


In [14]:
knn = KNeighborsRegressor(
    n_neighbors=3,
    metric='euclidean',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 7.6167
Mean absolute Error: 2.0361
Mean Squared Error: 58.01
Median absolute Error: 0.3674
R² Score: 0.9435


In [15]:
knn = KNeighborsRegressor(
    n_neighbors=5,
    metric='euclidean',
    weights='distance'
)

knn.fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 7.6674
Mean absolute Error: 2.0445
Mean Squared Error: 58.79
Median absolute Error: 0.4121
R² Score: 0.9427


**Conclusion**

The best KNN model uses:

- 3 or 5 nearest neighbors
- Distance-based weighting

This configuration outperformed other combinations by giving more influence to closer neighbors, which helped the model better capture local variations in the data. As a result, it produced the most accurate predictions in our evaluation.