### Libraries

In [2]:
!pip install transformers



In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl (30 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [datasets]3/4[0m [datasets]
Successfully installed datasets-3.5.1 dill-0.3.8 multiprocess-0.70.16 xxhash-3.5.0


In [54]:
import sys
import os
import time
import pickle
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.utils.data import Dataset

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoModel, BertModel
from transformers import BertTokenizerFast
from transformers import DistilBertTokenizerFast, DistilBertModel

np.random.seed(229)

In [44]:
# Ingore warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

### Splitting

In [45]:
# read/prep data
data = pd.read_csv("../data/tokenized_reviews.csv")
data = data.dropna()
data["quote"] = data["quote"].astype(int)
data["tokenized_words"] = data["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '", " "))

In [46]:
print(data.shape, "\n\n")

(1710118, 14) 




In [47]:
# Split text and numerical features
# 85% train / 15% test
X = data.drop(columns=["popular"])
y = data["popular"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=229)

In [48]:
X_train.shape, y_train.shape

((1453600, 13), (1453600,))

### Down sampling

In [49]:
# Downsampling majority class
majority_idx = y_train[y_train == 0].index
minority_idx = y_train[y_train == 1].index
drop_idx = np.random.default_rng(seed=229).choice(
    majority_idx, len(majority_idx) - len(minority_idx), replace=False
)
X_train = X_train.drop(index=drop_idx)
y_train = y_train.drop(index=drop_idx)

In [50]:
X_train.shape, y_train.shape

((431298, 13), (431298,))

In [51]:
X_train.head(2)

Unnamed: 0,user_reviews,days_since_review,user_rating,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,tokenized_words
1519074,267,1280,1,-2.93,1453,4.269787,15.793478,0.185822,0.255334,0.160358,1,-0.014139,book review start saying read good book jeffer...
428448,350,1027,5,0.18,96,3.958333,9.6,0.208333,0.208333,0.197917,0,0.37275,hobbes calvin good companion hard time even th...


### BERT Embeddings

In [56]:
# Separate numerical columns
numerical_cols = [col for col in X_train.columns if col != "tokenized_words"]

# Custom transformer to get BERT embeddings
class BertVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='bert-base-uncased', device=None, batch_size=16):
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        import torch
        from transformers import BertTokenizerFast, BertModel

        tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
        model = BertModel.from_pretrained(self.model_name)

        # Device detection
        if not self.device:
            if torch.backends.mps.is_available():
                self.device = "mps"
            elif torch.cuda.is_available():
                self.device = "cuda"
            else:
                self.device = "cpu"
        model.to(self.device)
        model.eval()

        X = X.squeeze()
        all_embeddings = []

        # Process in batches
        for i in range(0, len(X), self.batch_size):
            batch = list(X[i:i + self.batch_size])
            tokens = tokenizer(batch, padding=True, truncation=True,
                               max_length=128, return_tensors='pt')
            tokens = {k: v.to(self.device) for k, v in tokens.items()}

            with torch.no_grad():
                outputs = model(**tokens)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

        return np.vstack(all_embeddings)

In [57]:
print("\n\nLogistic Regression-BERT")
start_time = time.time()

preprocessor = ColumnTransformer(transformers=[
    ("bert", BertVectorizer(), "tokenized_words"),
    ("num", StandardScaler(), numerical_cols)
], remainder='passthrough')

bert_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    ))
])

parameters = {"classifier__C": [10, 1, 0.01, 0.001]}

gs_bert_pipe = GridSearchCV(
    estimator=bert_pipe,
    param_grid=parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229), n_jobs=1,
    verbose=2
)

gs_bert_pipe.fit(X_train, y_train)
print(f"\nTraining completed in: {time.time() - start_time:.2f} seconds\n")



 Logistic Regression-BERT
Fitting 1 folds for each of 4 candidates, totalling 4 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[CV] END ...................................classifier__C=10; total time=69.0min
[CV] END ....................................classifier__C=1; total time=68.8min
[CV] END .................................classifier__C=0.01; total time=53.4min
[CV] END ................................classifier__C=0.001; total time=42.7min

Training completed in: 17490.19 seconds



In [59]:
print(gs_bert_pipe.cv_results_)
print(gs_bert_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bert_model_cde.pkl", "wb") as f:
    pickle.dump(gs_bert_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bert_model_cde.pkl'")

{'mean_fit_time': array([3826.54654598, 3791.18984723, 2878.24719524, 2258.39360213]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([315.55065107, 335.74843287, 328.23438287, 304.49698973]), 'std_score_time': array([0., 0., 0., 0.]), 'param_classifier__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'classifier__C': 10}, {'classifier__C': 1}, {'classifier__C': 0.01}, {'classifier__C': 0.001}], 'split0_test_score': array([0.6903161 , 0.69039338, 0.69045521, 0.68962053]), 'mean_test_score': array([0.6903161 , 0.69039338, 0.69045521, 0.68962053]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([3, 2, 1, 4], dtype=int32)}
{'classifier__C': 0.01}

Best model saved as 'logistic_bert_model_cde.pkl'


In [62]:
# Get the best trained pipeline
best_model = gs_bert_pipe.best_estimator_

# Extract coefficients from the logistic regression model
coef = best_model.named_steps['classifier'].coef_[0]

# Number of non-zero coefficients
num_nonzero = np.sum(np.abs(coef) > 0)
print(f"Number of non-zero coefficients: {num_nonzero}")

# Get top contributing embedding dimensions
top_dims = np.argsort(np.abs(coef))[::-1][:10]
print("Top 10 embedding dimensions contributing to prediction:", top_dims)
print("Corresponding weights:", coef[top_dims])

Number of non-zero coefficients: 780
Top 10 embedding dimensions contributing to prediction: [768 770 771 615 449 130 318 453 442  15]
Corresponding weights: [ 0.74253218 -0.72597141  0.64425828  0.29991791  0.27611394 -0.27512348
 -0.26906531 -0.2504998   0.24097109  0.23963258]


In [63]:
# predict
predictions = gs_bert_pipe.predict(X_test)
predictions = list(map(round,predictions))

In [64]:
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

[[157691  60771]
 [ 12901  25155]]
              precision    recall  f1-score   support

           0       0.92      0.72      0.81    218462
           1       0.29      0.66      0.41     38056

    accuracy                           0.71    256518
   macro avg       0.61      0.69      0.61    256518
weighted avg       0.83      0.71      0.75    256518

Specificity : 0.7218234750208274
ROC-AUC : 0.6914115272938908
