In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

# Regression Models
from sklearn.neighbors import KNeighborsRegressor
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import tensorflow as tf
import random

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

In [2]:
def test(model, X_test, y_test, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mabse = median_absolute_error(y_test, y_pred)
    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")

In [3]:
train_ = pd.read_csv('train_df.csv')
test_ = pd.read_csv('test_df.csv')

K-Nearest Neighbors (KNN) is a distance-based algorithm, which means it relies heavily on the geometry of the feature space. When we have high-dimensional data, especially with correlated or less informative features, KNN can perform poorly due to the curse of dimensionality.

To address this, we apply Principal Component Analysis (PCA) before KNN:

- Reduces dimensionality: Keeps only the most important features (principal components)
- Removes noise and redundancy
- Improves performance and speed of KNN by working in a cleaner, lower-dimensional space

In short, PCA helps KNN make more accurate and efficient predictions.

In [4]:
def PCA_Transform(X_train_, X_test_):    
    # 2. Standardize features FIRST
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_)
    X_test_scaled = scaler.transform(X_test_)
    
    # 3. THEN apply PCA to scaled data
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Reduced dimensions from {X_train_.shape[1]} to {X_train_pca.shape[1]} features")
    
    return (X_train_pca, X_test_pca)

**Why Use an Autoencoder After PCA?**

After applying PCA to reduce the feature space and remove redundancy, we further applied an autoencoder for non-linear feature extraction.

While PCA is a linear method, autoencoders are neural networks that can learn complex, non-linear patterns in the data. This helps us capture deeper structure that PCA might miss.

Benefits of using both:

- PCA simplifies the data and reduces noise
- Autoencoder compresses and reconstructs data with non-linear transformations
- Together, they give us a compact and powerful feature representation for KNN, which is sensitive to feature quality

By stacking PCA and an autoencoder, we combine the strengths of both methods and feed more informative features into our KNN model.

In [5]:
# 1. Separate features and target
X_train = train_.drop(columns=['Target_Comment_Volume'])
y_train = train_['Target_Comment_Volume']
X_test = test_.drop(columns=['Target_Comment_Volume'])
y_test = test_['Target_Comment_Volume']

# 2. Split data BEFORE fitting the autoencoder
X_train_scaled, X_test_scaled = PCA_Transform(X_train, X_test)

# 3. Define autoencoder architecture
input_dim = X_train_scaled.shape[1]
encoding_dim = 16  # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')

# 4. Train the autoencoder ONLY on the training set
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, verbose=0)

# 5. Encode features using the trained encoder
X_train_enc = encoder.predict(X_train_scaled)
X_test_enc = encoder.predict(X_test_scaled)

Reduced dimensions from 130 to 93 features
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195us/step
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187us/step


We aim to evaluate how different K-Nearest Neighbors (KNN) configurations affect regression performance.
Specifically, we will experiment with:

- Different values for n_neighbors
- Different distance metrics such as 'euclidean', 'manhattan', and 'minkowski'
This helps identify which combination gives the best predictive results for this dataset.

**Why we Used These Parameters**

- n_neighbors determines how many neighbors the model considers when making predictions. Testing small and larger values helps balance between bias and variance.
- Distance metrics define how similarity between points is measured. Some metrics work better with high-dimensional data or specific feature distributions.

**Configurations we Tested**

- n_neighbors: 1-20 and √(n_samples)
- metric: 'euclidean', 'manhattan'
- weights: 'uniform', 'distance'

We kept all other parameters constant while changing one at a time to isolate its effect.

In [6]:
print("Euclidean distance and uniform weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k


print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='euclidean', weights='uniform')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Euclidean distance and uniform weights


k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 21.1324
Mean absolute Error: 4.7520
Mean Squared Error: 446.58
Median absolute Error: 0.7500


In [7]:
print("Euclidean distance and distance weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='euclidean', weights='distance')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Euclidean distance and distance weights
k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 20.8551
Mean absolute Error: 4.6213
Mean Squared Error: 434.93
Median absolute Error: 0.6791


In [8]:
print("Manhattan distance and uniform weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='manhattan', weights='uniform')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Manhattan distance and uniform weights
k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 21.5969
Mean absolute Error: 4.7833
Mean Squared Error: 466.43
Median absolute Error: 0.7500


In [9]:
print("Manhattan distance and distance weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='manhattan', weights='distance')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Manhattan distance and distance weights
k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 21.2248
Mean absolute Error: 4.6364
Mean Squared Error: 450.49
Median absolute Error: 0.7000


In [10]:
print("Minkowski distance with p=1.5 and distance weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='minkowski', weights='distance', p=1.5)
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Minkowski distance with p=1.5 and distance weights
k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 20.9624
Mean absolute Error: 4.6113
Mean Squared Error: 439.42
Median absolute Error: 0.6879


In [11]:
print("Minkowski distance with p=3 and distance weights")

try_k = int(np.sqrt(len(X_train_enc)))

best_k = None
best_score = float('inf')

# Try different k values
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()
    print(f"k = {k}, MSE = {mean_score:.4f}")
    
    if mean_score < best_score:
        best_score = mean_score
        best_k = k

knn = KNeighborsRegressor(n_neighbors=try_k, metric='euclidean', weights='distance')
scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"k = {try_k}, MSE = {mean_score:.4f}")

if mean_score < best_score:
    best_score = mean_score
    best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='minkowski', weights='distance', p=3)
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


Minkowski distance with p=3 and distance weights
k = 1, MSE = 843.9022
k = 2, MSE = 698.4037
k = 3, MSE = 649.2014
k = 4, MSE = 632.4292
k = 5, MSE = 636.1538
k = 6, MSE = 624.6023
k = 7, MSE = 611.8633
k = 8, MSE = 608.0482
k = 9, MSE = 607.8222
k = 10, MSE = 603.7797
k = 11, MSE = 603.1007
k = 12, MSE = 601.8431
k = 13, MSE = 603.7869
k = 14, MSE = 606.0562
k = 15, MSE = 609.2841
k = 16, MSE = 608.4124
k = 17, MSE = 610.4487
k = 18, MSE = 612.2564
k = 19, MSE = 615.4605
k = 20, MSE = 616.5939
k = 180, MSE = 787.3499

Best k: 12
Root mean Squared Error: 21.1594
Mean absolute Error: 4.6579
Mean Squared Error: 447.72
Median absolute Error: 0.6661


BEST MODEL

In [13]:
knn = KNeighborsRegressor(n_neighbors=12, metric='euclidean', weights='distance')
knn.fit(X_train_enc, y_train)
y_pred = knn.predict(X_test_enc)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 20.8551
Mean absolute Error: 4.6213
Mean Squared Error: 434.93
Median absolute Error: 0.6791


**Conclusion**

The best KNN model uses:

- 12 nearest neighbors
- Euclidean distance metric
- Distance-based weighting

This configuration outperformed other combinations by giving more influence to closer neighbors, which helped the model better capture local variations in the data. As a result, it produced the most accurate predictions in our evaluation.