In [89]:
import random

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.decomposition import PCA
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Regression Models
from sklearn.neighbors import KNeighborsRegressor
# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Seed every random number generator used in this notebook (Python, NumPy, TensorFlow)
# with the same value so that reruns are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [90]:
# Load the feature table (presumably written by the data-preprocessing step — verify).
# NOTE(review): unpickling can execute arbitrary code; only load 'raw_features.pkl'
# when it comes from a trusted source.
data = pd.read_pickle('raw_features.pkl')

In [91]:
def test(model, X_test, y_test, y_pred=None):
    '''
    Score predictions against the true targets and print a metric summary.

    Params:
    model: estimator exposing predict(); only called when y_pred is not supplied
    X_test: features passed to model.predict when y_pred is not supplied
    y_test: the real values the predictions are compared against
    y_pred: optional precomputed predictions; when given, the model is not called
    '''
    predictions = model.predict(X_test) if y_pred is None else y_pred

    # Evaluate every metric before printing anything, so a metric that raises
    # never leaves a half-printed report behind.
    report = (
        ("Root mean Squared Error", root_mean_squared_error(y_test, predictions), ".4f"),
        ("Mean absolute Error", mean_absolute_error(y_test, predictions), ".4f"),
        ("Mean Squared Error", mean_squared_error(y_test, predictions), ".2f"),
        ("Median absolute Error", median_absolute_error(y_test, predictions), ".4f"),
        ("R² Score", r2_score(y_test, predictions), ".4f"),
    )
    for label, value, spec in report:
        print(f"{label}: {value:{spec}}")

In the data preprocessing step, we found that both an autoencoder and PCA are useful for the KNN model. Thus, we apply PCA first and then pass the reduced features through an autoencoder.

In [92]:
def PCA_Transform(X_train_, X_test_, n_components=0.95):
    '''
    Standardize the features, then reduce them with PCA.

    Both the scaler and the PCA are fitted on the training split only and are
    then applied unchanged to the test split.

    Params:
    X_train_: training features, 2-D (samples x features)
    X_test_: test features with the same columns as X_train_
    n_components: forwarded to sklearn's PCA; the default 0.95 reproduces the
                  previously hard-coded setting

    Returns:
    (X_train_pca, X_test_pca): the transformed training and test features
    '''
    # 1. Standardize features FIRST (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_)
    X_test_scaled = scaler.transform(X_test_)

    # 2. THEN apply PCA to the scaled data (components fitted on the training split only)
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Reduced dimensions from {X_train_.shape[1]} to {X_train_pca.shape[1]} features")

    return (X_train_pca, X_test_pca)

In [93]:
# 1. Separate features and target
X = data.drop(columns=["Target_Comment_Volume"])
y = data["Target_Comment_Volume"]

# 2. Split the raw data BEFORE fitting any transform, so the scaler, the PCA and
#    the autoencoder only ever see training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)

# 3. Standardize + PCA. PCA_Transform fits its own StandardScaler on the training
#    split, so the earlier scaler fitted on the full dataset was redundant (and
#    looked like test-set leakage); it has been removed.
X_train_pca, X_test_pca = PCA_Transform(X_train, X_test)

# 4. Define autoencoder architecture
input_dim = X_train_pca.shape[1]
# Dimension of the encoded representation.
# NOTE(review): the recorded run reports 18 PCA components, so a 32-unit code is
# wider than its input and does not compress — confirm this is intended.
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

# The full model reconstructs its input; the encoder shares the same layers and
# stops at the encoded representation.
autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')

# 5. Train the autoencoder ONLY on the training set (input is also the target)
autoencoder.fit(X_train_pca, X_train_pca, epochs=50, batch_size=32, verbose=0)

# 6. Encode features using the trained encoder
X_train_enc = encoder.predict(X_train_pca)
X_test_enc = encoder.predict(X_test_pca)

Reduced dimensions from 42 to 18 features
[1m896/896[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166us/step
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168us/step


We aim to evaluate how different K-Nearest Neighbors (KNN) configurations affect regression performance.
Specifically, we will experiment with:

- Different values for n_neighbors
- Different distance metrics such as 'euclidean', 'manhattan', and 'minkowski'

This helps identify which combination gives the best predictive results for this dataset.

**Why we Used These Parameters**

- n_neighbors determines how many neighbors the model considers when making predictions. Testing small and larger values helps balance between bias and variance.
- Distance metrics define how similarity between points is measured. Some metrics work better with high-dimensional data or specific feature distributions.

**Configurations we Tested**

- n_neighbors: 1–20 and ⌊√(n_samples)⌋ (169 for this training set)
- metric: 'euclidean', 'manhattan', 'minkowski' (p = 1.5)
- weights: 'uniform', 'distance'

We kept all other parameters constant while changing one at a time to isolate its effect.

In [94]:
# Candidate neighbour counts: 1..20 plus the sqrt(n_samples) rule of thumb.
try_k = int(np.sqrt(len(X_train_enc)))
candidate_ks = list(range(1, 21)) + [try_k]

best_k = None
best_score = float('inf')

# Try different k values with 10-fold cross-validation on the training split.
# The sqrt(n) candidate goes through the same loop, so when it wins, best_k is
# set to try_k (the old separate branch assigned the stale loop variable, 20).
for k in candidate_ks:
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()  # scorer returns negated MSE
    print(f"k = {k}, MSE = {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
# NOTE(review): k is tuned with distance-weighted KNN above, while this model
# uses uniform weights — confirm that is intended.
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='euclidean', weights='uniform')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


k = 1, MSE = 1042.1933
k = 2, MSE = 786.5828
k = 3, MSE = 731.6285
k = 4, MSE = 702.4147
k = 5, MSE = 691.4351
k = 6, MSE = 681.4678
k = 7, MSE = 677.9552
k = 8, MSE = 669.9713
k = 9, MSE = 674.6091
k = 10, MSE = 671.9347
k = 11, MSE = 674.1292
k = 12, MSE = 670.0896
k = 13, MSE = 673.1724
k = 14, MSE = 672.3092
k = 15, MSE = 675.7894
k = 16, MSE = 675.0683
k = 17, MSE = 676.0422
k = 18, MSE = 677.6937
k = 19, MSE = 680.3035
k = 20, MSE = 680.4171
k = 169, MSE = 834.5759

Best k: 8
Root mean Squared Error: 20.8284
Mean absolute Error: 4.6458
Mean Squared Error: 433.82
Median absolute Error: 0.6250
R² Score: 0.5728


In [95]:
# Candidate neighbour counts: 1..20 plus the sqrt(n_samples) rule of thumb.
try_k = int(np.sqrt(len(X_train_enc)))
candidate_ks = list(range(1, 21)) + [try_k]

best_k = None
best_score = float('inf')

# Try different k values with 10-fold cross-validation on the training split.
# The sqrt(n) candidate goes through the same loop, so when it wins, best_k is
# set to try_k (the old separate branch assigned the stale loop variable, 20).
for k in candidate_ks:
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()  # scorer returns negated MSE
    print(f"k = {k}, MSE = {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k (same metric and weighting as the search above)
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='euclidean', weights='distance')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


k = 1, MSE = 1042.1933
k = 2, MSE = 786.5828
k = 3, MSE = 731.6285
k = 4, MSE = 702.4147
k = 5, MSE = 691.4351
k = 6, MSE = 681.4678
k = 7, MSE = 677.9552
k = 8, MSE = 669.9713
k = 9, MSE = 674.6091
k = 10, MSE = 671.9347
k = 11, MSE = 674.1292
k = 12, MSE = 670.0896
k = 13, MSE = 673.1724
k = 14, MSE = 672.3092
k = 15, MSE = 675.7894
k = 16, MSE = 675.0683
k = 17, MSE = 676.0422
k = 18, MSE = 677.6937
k = 19, MSE = 680.3035
k = 20, MSE = 680.4171
k = 169, MSE = 834.5759

Best k: 8
Root mean Squared Error: 20.5049
Mean absolute Error: 4.5695
Mean Squared Error: 420.45
Median absolute Error: 0.6299
R² Score: 0.5860


In [96]:
# Candidate neighbour counts: 1..20 plus the sqrt(n_samples) rule of thumb.
try_k = int(np.sqrt(len(X_train_enc)))
candidate_ks = list(range(1, 21)) + [try_k]

best_k = None
best_score = float('inf')

# Try different k values with 10-fold cross-validation on the training split.
# The sqrt(n) candidate goes through the same loop, so when it wins, best_k is
# set to try_k (the old separate branch assigned the stale loop variable, 20).
for k in candidate_ks:
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()  # scorer returns negated MSE
    print(f"k = {k}, MSE = {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
# NOTE(review): k is tuned with euclidean, distance-weighted KNN above, while
# this model uses manhattan distance and uniform weights — confirm that is intended.
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='manhattan', weights='uniform')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


k = 1, MSE = 1042.1933
k = 2, MSE = 786.5828
k = 3, MSE = 731.6285
k = 4, MSE = 702.4147
k = 5, MSE = 691.4351
k = 6, MSE = 681.4678
k = 7, MSE = 677.9552
k = 8, MSE = 669.9713
k = 9, MSE = 674.6091
k = 10, MSE = 671.9347
k = 11, MSE = 674.1292
k = 12, MSE = 670.0896
k = 13, MSE = 673.1724
k = 14, MSE = 672.3092
k = 15, MSE = 675.7894
k = 16, MSE = 675.0683
k = 17, MSE = 676.0422
k = 18, MSE = 677.6937
k = 19, MSE = 680.3035
k = 20, MSE = 680.4171
k = 169, MSE = 834.5759

Best k: 8
Root mean Squared Error: 21.4117
Mean absolute Error: 4.6789
Mean Squared Error: 458.46
Median absolute Error: 0.6250
R² Score: 0.5485


In [97]:
# Candidate neighbour counts: 1..20 plus the sqrt(n_samples) rule of thumb.
try_k = int(np.sqrt(len(X_train_enc)))
candidate_ks = list(range(1, 21)) + [try_k]

best_k = None
best_score = float('inf')

# Try different k values with 10-fold cross-validation on the training split.
# The sqrt(n) candidate goes through the same loop, so when it wins, best_k is
# set to try_k (the old separate branch assigned the stale loop variable, 20).
for k in candidate_ks:
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()  # scorer returns negated MSE
    print(f"k = {k}, MSE = {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
# NOTE(review): k is tuned with the euclidean metric above, while this model
# uses manhattan distance — confirm that is intended.
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='manhattan', weights='distance')
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


k = 1, MSE = 1042.1933
k = 2, MSE = 786.5828
k = 3, MSE = 731.6285
k = 4, MSE = 702.4147
k = 5, MSE = 691.4351
k = 6, MSE = 681.4678
k = 7, MSE = 677.9552
k = 8, MSE = 669.9713
k = 9, MSE = 674.6091
k = 10, MSE = 671.9347
k = 11, MSE = 674.1292
k = 12, MSE = 670.0896
k = 13, MSE = 673.1724
k = 14, MSE = 672.3092
k = 15, MSE = 675.7894
k = 16, MSE = 675.0683
k = 17, MSE = 676.0422
k = 18, MSE = 677.6937
k = 19, MSE = 680.3035
k = 20, MSE = 680.4171
k = 169, MSE = 834.5759

Best k: 8
Root mean Squared Error: 20.9398
Mean absolute Error: 4.5954
Mean Squared Error: 438.47
Median absolute Error: 0.6244
R² Score: 0.5682


In [98]:
# Candidate neighbour counts: 1..20 plus the sqrt(n_samples) rule of thumb.
try_k = int(np.sqrt(len(X_train_enc)))
candidate_ks = list(range(1, 21)) + [try_k]

best_k = None
best_score = float('inf')

# Try different k values with 10-fold cross-validation on the training split.
# The sqrt(n) candidate goes through the same loop, so when it wins, best_k is
# set to try_k (the old separate branch assigned the stale loop variable, 20).
for k in candidate_ks:
    knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean', weights='distance')
    scores = cross_val_score(knn, X_train_enc, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_score = -scores.mean()  # scorer returns negated MSE
    print(f"k = {k}, MSE = {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_k = k

print(f"\nBest k: {best_k}")

# Train final model with best k
# NOTE(review): k is tuned with the euclidean metric above, while this model
# uses Minkowski distance with p=1.5 — confirm that is intended.
final_knn = KNeighborsRegressor(n_neighbors=best_k, metric='minkowski', weights='distance', p=1.5)
final_knn.fit(X_train_enc, y_train)
test(final_knn, X_test_enc, y_test)


k = 1, MSE = 1042.1933
k = 2, MSE = 786.5828
k = 3, MSE = 731.6285
k = 4, MSE = 702.4147
k = 5, MSE = 691.4351
k = 6, MSE = 681.4678
k = 7, MSE = 677.9552
k = 8, MSE = 669.9713
k = 9, MSE = 674.6091
k = 10, MSE = 671.9347
k = 11, MSE = 674.1292
k = 12, MSE = 670.0896
k = 13, MSE = 673.1724
k = 14, MSE = 672.3092
k = 15, MSE = 675.7894
k = 16, MSE = 675.0683
k = 17, MSE = 676.0422
k = 18, MSE = 677.6937
k = 19, MSE = 680.3035
k = 20, MSE = 680.4171
k = 169, MSE = 834.5759

Best k: 8
Root mean Squared Error: 20.6796
Mean absolute Error: 4.5719
Mean Squared Error: 427.64
Median absolute Error: 0.6256
R² Score: 0.5789


BEST MODEL

In [99]:
# Refit the selected configuration (k=8, euclidean distance, distance weighting)
# on the encoded training features and report its test-set metrics.
knn = KNeighborsRegressor(
    weights='distance',
    metric='euclidean',
    n_neighbors=8,
).fit(X_train_enc, y_train)
test(knn, X_test_enc, y_test)

Root mean Squared Error: 20.5049
Mean absolute Error: 4.5695
Mean Squared Error: 420.45
Median absolute Error: 0.6299
R² Score: 0.5860


**Conclusion**

The best KNN model uses:

- 8 nearest neighbors
- Euclidean distance metric
- Distance-based weighting

This configuration outperformed the other combinations by giving more influence to closer neighbors, which helped the model better capture local variations in the data. As a result, it produced the most accurate predictions in our evaluation (test RMSE 20.50, MAE 4.57, R² 0.586).