In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, root_mean_squared_error, median_absolute_error

from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Deep Learning
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

#### Functions for testing

In [3]:
def test(model, X_test_, y_test_, y_pred=None):
    '''
    We test our model and print various metrics for comparison

    Params:
    model: to test
    X_test: which are features to test
    y_test: the real values that match X_test
    '''
    if y_pred is None:
        y_pred = model.predict(X_test_)
    
    rmse = root_mean_squared_error(y_test_, y_pred)
    mae = mean_absolute_error(y_test_, y_pred)
    mse = mean_squared_error(y_test_, y_pred)
    mabse = median_absolute_error(y_test_, y_pred)

    print(f"Root mean Squared Error: {rmse:.4f}")
    print(f"Mean absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Median absolute Error: {mabse:.4f}")

    return y_pred

def visualize(y_test, y_pred):
    '''
    Params:
    y_test: the real values that match X_test
    y_pred: the models predicted y values
    '''
    plt.scatter(y_test, y_pred, alpha=0.3)  # alpha=0.3 for semi-transparent dots
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)  # red diagonal line
    plt.xlabel("Actual Comment Volume")
    plt.ylabel("Predicted Comment Volume")
    plt.title("Actual vs Predicted Comment Volume")
    plt.show()

## Experiements
In the following code sections, each "option" represents an experiment we conducted to evaluate different model improvements. Through these experiments, we implemented various techniques to enhance model performance and gain insights through our chosen evaluation metrics.
#### Option 1: Train and test without any data preprocessing or feature engineering
Consists of training and testing models without any preprocessing or feature engineering. This creates our baseline models that serve as reference points for comparison as we apply techniques in subsequent experiments.

In [4]:
columns = ['Page Popularity/likes', 'Page Checkins', 'Page talking about', 'Page Category', 'Derived Feature 5', 'Derived Feature 6', 'Derived Feature 7', 
    'Derived Feature 8', 'Derived Feature 9', 'Derived Feature 10', 'Derived Feature 11', 'Derived Feature 12', 'Derived Feature 13', 
    'Derived Feature 14', 'Derived Feature 15', 'Derived Feature 16', 'Derived Feature 17', 'Derived Feature 18', 'Derived Feature 19', 'Derived Feature 20', 
    'Derived Feature 21', 'Derived Feature 22', 'Derived Feature 23', 'Derived Feature 24', 'Derived Feature 25', 'Derived Feature 26', 'Derived Feature 27', 
    'Derived Feature 28', 'Derived Feature 29', 'CC1', 'CC2', 'CC3', 'CC4', 'CC5', 'Base time', 'Post length', 'Post Share Count', 'Post Promotion Status',
    'H Local', 'Post Published Sunday', 'Post Published Monday', 'Post Published Tuesday',  'Post Published Wednesday', 'Post Published Thursday', 
    'Post Published Friday', 'Post Published Saturday', 'Base DateTime Sunday', 'Base DateTime Monday', 'Base DateTime Tuesday','Base DateTime Wednesday', 
    'Base DateTime Thursday', 'Base DateTime Friday', 'Base DateTime Saturday', 'Target Variable' ]

base_training_complete = pd.read_csv('./Dataset/Training/Features_Variant_1.csv', sep=',', header=None, names=columns)
base_training_complete.shape

(40949, 54)

In [5]:
'''
The .values changes type from pandas to a numpy array, it strips the labels of rows and columns
'''
base_X = base_training_complete.iloc[:, :-1]
base_y = base_training_complete.iloc[:, -1]

base_X_np = base_training_complete.iloc[:, :-1].values # Independent variables - the features.
base_y_np = base_training_complete.iloc[:, -1].values # dependent variable - prediction

In [6]:
base_X_train, base_X_test, base_y_train, base_y_test = train_test_split(base_X_np, base_y_np, test_size = 0.3, random_state = 1) # 30% for testing dataset

In [7]:
base_lin_regression = LinearRegression()
base_lin_regression.fit(base_X_train, base_y_train)  # lec 4, slide 46 - uses normal equations since we have a small/medium-sized dataset

In [8]:
base_y_pred = test(base_lin_regression, base_X_test, base_y_test)

NameError: name 'calculate_smape' is not defined

In [None]:
visualize(base_y_test, base_y_pred)

#### Option 2: Apply basic preprocessing: drop duplicates, drop column that has only zeros
Applies basic preprocessing steps to clean our dataset by removing duplicates and dropping columns that provide no information value. This experiment helps us understand how even simple data cleaning can affect model performance before we implement more complex feature engineering techniques. Although, we don't expect to see big improvements compared from baseline model.

In [None]:
preprocess_training_complete = pd.read_csv('./Dataset/Training/Features_Variant_1.csv', sep=',', header=None, names=columns)
preprocess_training_complete.shape

In [None]:
preprocess_training_complete = preprocess_training_complete.drop("Post Promotion Status", axis='columns') # Drop column that has only zeros
preprocess_training_complete = preprocess_training_complete.drop_duplicates() # drop duplicates in our dataset

'''
The .values changes type from pandas to a numpy array, it strips the labels of rows and columns
'''
preprocess_X = preprocess_training_complete.iloc[:, :-1]
preprocess_y = preprocess_training_complete.iloc[:, -1]

preprocess_X_np = preprocess_training_complete.iloc[:, :-1].values # Independent variables - the features.
preprocess_y_np = preprocess_training_complete.iloc[:, -1].values # dependent variable - prediction

In [None]:
preprocess_X_train, preprocess_X_test, preprocess_y_train, preprocess_y_test = train_test_split(preprocess_X_np, preprocess_y_np, test_size = 0.3, random_state = 1) # 30% for testing dataset

In [None]:
preprocess_lin_regression = LinearRegression()
preprocess_lin_regression.fit(preprocess_X_train, preprocess_y_train)  # lec 4, slide 46 - uses normal equations since we have a small/medium-sized dataset

In [None]:
preprocess_y_pred = test(preprocess_lin_regression, preprocess_X_test, preprocess_y_test)

In [None]:
visualize(preprocess_y_test, preprocess_y_pred)

#### Option 3: Apply data preprocessing and feature engineering from WithoutMethods.ipynb
Option 3 extends our approach by implementing more comprehensive data preprocessing and feature engineering techniques developed in the 2_DataPreprocessing.ipynb notebook. In this experiment, we standardize our numerical features to ensure all values are on the same scale, which can significantly improve linear models' performance since they're sensitive to feature magnitudes.

In [None]:
features_data = pd.read_csv('Preprocessed_Data.csv')

In [None]:
# separate features and target
features_X = features_data.drop(columns=["Target_Comment_Volume"])
features_y = features_data["Target_Comment_Volume"]

# scale features
scaler = StandardScaler()
features_X_scaled = scaler.fit_transform(features_X)  # scale before splitting train-test so that all values are in the same range

In [None]:
features_X_train, features_X_test, features_y_train, features_y_test = train_test_split(features_X_scaled, features_y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each runX_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each run

In [None]:
features_lin_regression = LinearRegression()
features_lin_regression.fit(features_X_train, features_y_train)  # lec 4, slide 46 - uses normal equations since we have a small/medium-sized dataset

In [None]:
features_y_pred = test(features_lin_regression, features_X_test, features_y_test)

In [None]:
visualize(features_y_test, features_y_pred)

#### Option 4: Apply L2 (ridge) regularization
Option 4 introduces L2 (ridge) regularization to our linear model, which could help prevent overfitting by adding a penalty on the magnitude of coefficients. This technique could be particularly useful for our Facebook comment prediction dataset because many features show multicollinearity, and regularization can stabilize predictions when working with potentially redundant variables and outliers.

In [None]:
l2_data = pd.read_csv('Preprocessed_Data.csv')

In [None]:
# separate features and target
l2_X = l2_data.drop(columns=["Target_Comment_Volume"])
l2_y = l2_data["Target_Comment_Volume"]

# scale features
scaler = StandardScaler()
l2_X_scaled = scaler.fit_transform(l2_X)  # scale before splitting train-test so that all values are in the same range

In [None]:
l2_X_train, l2_X_test, l2_y_train, l2_y_test = train_test_split(l2_X_scaled, l2_y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each runX_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each run

In [None]:
l2 = Ridge()
l2.fit(l2_X_train, l2_y_train)

In [None]:
l2_y_pred = test(l2, l2_X_test, l2_y_test)

In [None]:
visualize(l2_y_test, l2_y_pred)

#### Option 5: Apply PCA with cross-validation on Linear regression and L2 regularization
Option 5 introduces Principal Component Analysis (PCA) combined with cross-validation to further improve our linear models. PCA helps address multicollinearity by transforming our features into uncorrelated components while retaining 95% of the variance. We implement this approach to combine dimensionality reduction with ridge regression in a robust way.

In [None]:
pca_data = pd.read_csv('Preprocessed_Data.csv')

In [None]:
# separate features and target
pca_X = pca_data.drop(columns=["Target_Comment_Volume"])
pca_y = pca_data["Target_Comment_Volume"]

# scale features
scaler = StandardScaler()
pca_X_scaled = scaler.fit_transform(pca_X)  # scale before splitting train-test so that all values are in the same range

In [None]:
model_train_flag = "Ridge" # either LinReg or Ridge

In [None]:
# PCA
pca = PCA(n_components=0.95)  # retain 95% variances

if model_train_flag == "LinReg":
    pipeline = Pipeline([
        ('pca', pca),
        ('reg', LinearRegression())
    ])
elif model_train_flag == "Ridge":
    pipeline = Pipeline([
        ('pca', pca),
        ('reg', Ridge(alpha=10))
    ])

In [None]:
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(pca_X_scaled, pca_y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each runX_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=1)  # setting random_state allows for the same split on each run

In [None]:
# fit pipeline
pipeline.fit(pca_X_train, pca_y_train)
pca_y_pred = pipeline.predict(pca_X_test)

In [None]:
# evaluation
print(f"{model_train_flag} Regression with PCA Results")
print(f"Train R²: {pipeline.score(pca_X_train, pca_y_train):.4f}")
print(f"Test R²: {pipeline.score(pca_X_test, pca_y_test):.4f}")

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, pca_X_scaled, pca_y, scoring='r2', cv=cv)
print("Cross-Validation R² scores:", cv_scores)
print("Average R²:", np.mean(cv_scores))

In [None]:
test(pipeline, pca_X_test, pca_y_test, pca_y_pred)

In [None]:
visualize(pca_y_test, pca_y_pred)

#### Results

LinReg Regression with PCA Results

Train R²: 0.3032

Test R²: 0.3277

Cross-Validation R² scores: [0.35808581 0.35334128 0.23436523 0.33982897 0.26431706]

Average R²: 0.3099876699387849

Root mean Squared Error: 26.5305

Mean absolute Error: 8.7832

Mean Squared Error: 703.87

Median absolute Error: 4.8714

Ridge Regression with PCA Results

Train R²: 0.3032

Test R²: 0.3277

Cross-Validation R² scores: [0.35809013 0.35334543 0.23437372 0.33981973 0.26431705]

Average R²: 0.309989212860658

Root mean Squared Error: 26.5305

Mean absolute Error: 8.7823

Mean Squared Error: 703.87

Median absolute Error: 4.8691


#### Option 6: Apply autoencoder from DataPreprocessing.ipynb
Option 6 implements an autoencoder neural network for dimensionality reduction, which offers advantages over PCA by capturing non-linear relationships between features. This technique creates a compressed representation of our input data in a 32-dimensional latent space, potentially revealing complex patterns that linear methods might miss when predicting Facebook comment volumes.

In [None]:
auto_data = pd.read_csv('Preprocessed_Data.csv')

In [None]:
# separate features and target
auto_X = auto_data.drop(columns=["Target_Comment_Volume"])
auto_y = auto_data["Target_Comment_Volume"]

# scale features
scaler = StandardScaler()
auto_X_scaled = scaler.fit_transform(auto_X)  # scale before splitting train-test so that all values are in the same range

In [None]:
input_dim = auto_X_scaled.shape[1]
encoding_dim = 32 # Dimension of the encoded representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(auto_X_scaled, auto_X_scaled, epochs=50, batch_size=32, verbose=0)

# Encode features
X_encoded = encoder.predict(auto_X_scaled)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, auto_y, test_size=0.3, random_state=42)

In [None]:
# Linear Regression
auto_lr = LinearRegression()
auto_lr.fit(X_train_enc, y_train_enc)

In [None]:
print("<----Linear Regression after autoencoder---->")
y_pred_enc = test(auto_lr, X_test_enc, y_test_enc)

In [None]:
visualize(y_test_enc, y_pred_enc)

#### Option 7: Feature Engineering
Option 7 implements the feature engineered data from 3_FeatureEngineering.ipynb

In [None]:
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")

In [None]:
# 2) Split into features (X) and target (y)
X_train      = train_df.drop(columns='Target_Comment_Volume')
y_train      = train_df['Target_Comment_Volume']

X_test       = test_df.drop(columns='Target_Comment_Volume')
y_test       = test_df['Target_Comment_Volume']

In [None]:
# Linear Regression on original features
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
y_pred_lr = test(lr, X_test, y_test)

In [None]:
visualize(y_test, y_pred_lr)

## Other Experiments
#### 1. Removing of outliers
results of test: (Train R² / Test R² / MSE / RMSE)

a) no outlier removal: model fits well, but affected by large outliers (0.56, 0.53, 467.04, 21.61)

b) remove x and y outliers: low error, but model trained on less data (0.32, 0.34, 1.72, 1.31) 

c) clip only the target value: balanced — outliers controlled, full dataset used (0.49, 0.48, 3.91, 1.98)
##### Conclusion: Clipping only the target value when removing outliers performed the best but not as well as other methods, removed to be able to compare metrics between models 

#### 2. Log transform target value
##### Conclusion: This resulted in very poor results and a negative r^2 value and so we opted to not log tranform the y-values 

## Conclusion
- Performed the best with feature engineering as it removed irrelevant/misleading features
- Underfitted on complex or high-comment posts
- Highlights the need for non-linear or ensemble models for better accuracy and expressiveness

## Future Improvements
- polynomial regression