##### Libraries

In [20]:
import sys
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

# from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin

np.set_printoptions(threshold=sys.maxsize)

np.random.seed(229)

In [4]:
# Ingore warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

### Data Splits

In [5]:
# read/prep data
dat = pd.read_csv("../data/tokenized_reviews.csv")
dat = dat.dropna()
dat["quote"] = dat["quote"].astype(int)
dat["tokenized_words"] = dat["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '"," "))

In [6]:
# 85% train / 15% test
X_train, X_test, y_train, y_test = train_test_split(dat.drop(columns=["popular"]),
                                                    dat["popular"],
                                                    test_size = 0.15,
                                                    random_state = 229)
X_train.shape, X_test.shape

((1453600, 13), (256518, 13))

In [7]:
# undersample train set
majority_size = len(y_train[y_train==0])
minority_size = len(y_train[y_train==1])
majority_indices = y_train[y_train==0].index
rng = np.random.default_rng(seed=229)
drop_indices = rng.choice(majority_indices, majority_size-minority_size, replace=False)
X_train = X_train.drop(drop_indices)
y_train = y_train.drop(drop_indices)

X_train.shape, X_test.shape

((430924, 13), (256518, 13))

In [9]:
y_train.value_counts()

popular
0    215462
1    215462
Name: count, dtype: int64

In [10]:
y_test.value_counts()

popular
0    218275
1     38243
Name: count, dtype: int64

### Reviews only

In [12]:
X_train_review = X_train[["tokenized_words"]]
X_test_review = X_test[["tokenized_words"]]

X_train_review.shape, X_test_review.shape

((430924, 1), (256518, 1))

In [16]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

start_time = time.time()

# column transorfmer
preprocessor = ColumnTransformer(
    transformers=[
        ('countvectorizer', CountVectorizer(), 'tokenized_words'),  
    ], remainder='passthrough'
)

# full pipeline

bow_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    )
)

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}

# Set up GridSearchCV
gs_bow_pipe = GridSearchCV(
    bow_pipe, parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229),
    n_jobs=-1)

gs_bow_pipe.fit(X_train_review, y_train)


total_time = time.time() - start_time
print(f"\nTraining completed in: {total_time:.2f} seconds\n\n")



LOGISTIC REGRESSION BOW


Exception ignored in: <function ResourceTracker.__del__ at 0x11931dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105a2dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/


Training completed in: 1864.50 seconds




In [21]:
print(gs_bow_pipe.cv_results_)
print(gs_bow_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bow_model_a.pkl", "wb") as f:
    pickle.dump(gs_bow_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bow_model_a.pkl'")

{'mean_fit_time': array([1737.50078487, 1495.31507421,  169.69934893,  158.93021607]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([2.2438581 , 3.51356292, 3.67775321, 3.69510198]), 'std_score_time': array([0., 0., 0., 0.]), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'logisticregression__C': 10}, {'logisticregression__C': 1}, {'logisticregression__C': 0.01}, {'logisticregression__C': 0.001}], 'split0_test_score': array([0.67074057, 0.67290645, 0.68010025, 0.67278269]), 'mean_test_score': array([0.67074057, 0.67290645, 0.68010025, 0.67278269]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([4, 2, 1, 3], dtype=int32)}
{'logisticregression__C': 0.01}

Best model saved as 'logistic_bow_model_a.pkl'


In [22]:
# predict
predictions = gs_bow_pipe.predict(X_test_review)
predictions = list(map(round,predictions))

In [23]:
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

[[172536  45739]
 [ 16723  21520]]
              precision    recall  f1-score   support

           0       0.91      0.79      0.85    218275
           1       0.32      0.56      0.41     38243

    accuracy                           0.76    256518
   macro avg       0.62      0.68      0.63    256518
weighted avg       0.82      0.76      0.78    256518

Specificity : 0.7904524109494904
ROC-AUC : 0.676584885494618


### Review Meta Data Only

In [24]:
X_train_no_tokens = X_train.drop(columns=["tokenized_words"])
X_test_no_tokens = X_test.drop(columns=["tokenized_words"])

X_train_no_tokens.shape, X_test_no_tokens.shape

((430924, 12), (256518, 12))

In [26]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

numerical_cols = [col for col in X_train.columns if col != 'tokenized_words']

start_time = time.time()

# column transorfmer
preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), numerical_cols)       # Scale numerical columns
    ], remainder='passthrough'
)

# full pipeline

bow_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    )
)

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}


# Set up GridSearchCV
gs_bow_pipe = GridSearchCV(
    bow_pipe, parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229),
    n_jobs=-1)

gs_bow_pipe.fit(X_train_no_tokens, y_train)


total_time = time.time() - start_time
print(f"\nTraining completed in: {total_time:.2f} seconds\n\n")



LOGISTIC REGRESSION BOW

Training completed in: 36.84 seconds




Exception ignored in: <function ResourceTracker.__del__ at 0x106f1dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106c1dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/

In [27]:
print(gs_bow_pipe.cv_results_)
print(gs_bow_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bow_model_b.pkl", "wb") as f:
    pickle.dump(gs_bow_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bow_model_b.pkl'")

{'mean_fit_time': array([19.61022806, 19.6353538 , 18.77423787, 13.76602197]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([0.00911593, 0.00812221, 0.00802708, 0.00793099]), 'std_score_time': array([0., 0., 0., 0.]), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'logisticregression__C': 10}, {'logisticregression__C': 1}, {'logisticregression__C': 0.01}, {'logisticregression__C': 0.001}], 'split0_test_score': array([0.67917202, 0.67917202, 0.67886261, 0.67866149]), 'mean_test_score': array([0.67917202, 0.67917202, 0.67886261, 0.67866149]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([1, 1, 3, 4], dtype=int32)}
{'logisticregression__C': 10}

Best model saved as 'logistic_bow_model_b.pkl'


In [28]:
# predict
predictions = gs_bow_pipe.predict(X_test_no_tokens)
predictions = list(map(round,predictions))

In [29]:
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

[[167068  51207]
 [ 15515  22728]]
              precision    recall  f1-score   support

           0       0.92      0.77      0.83    218275
           1       0.31      0.59      0.41     38243

    accuracy                           0.74    256518
   macro avg       0.61      0.68      0.62    256518
weighted avg       0.82      0.74      0.77    256518

Specificity : 0.7654014431336617
ROC-AUC : 0.6798531416175592


### Diagnostics 

In [30]:
corr_df = X_test_no_tokens.copy()
corr_df["actual"] = y_test.values
corr_df["predicted"] = predictions

correlations = corr_df.corr(numeric_only=True)  # Only numerical columns

print(correlations[["actual", "predicted"]])

                     actual  predicted
user_reviews       0.266226   0.553409
days_since_review -0.022886  -0.042072
user_rating       -0.019622  -0.064470
rating_diff       -0.007004  -0.026239
num_words          0.203718   0.573106
avg_word_len       0.034796   0.089160
avg_sent_len       0.057920   0.170383
pct_verbs         -0.030523  -0.066526
pct_nouns          0.018215   0.035825
pct_adj           -0.046640  -0.127933
quote              0.119136   0.352901
sentiment         -0.049510  -0.142077
actual             1.000000   0.282860
predicted          0.282860   1.000000


### Review Meta Data + Full sample

In [31]:
# read/prep data
dat = pd.read_csv("../data/tokenized_reviews.csv")
dat = dat.dropna()
dat["quote"] = dat["quote"].astype(int)
dat["tokenized_words"] = dat["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '"," "))

# 85% train / 15% test
X_train, X_test, y_train, y_test = train_test_split(dat.drop(columns=["popular"]),
                                                    dat["popular"],
                                                    test_size = 0.15,
                                                    random_state = 229)
X_train.shape, X_test.shape

((1453600, 13), (256518, 13))

In [32]:
X_train_no_tokens = X_train.drop(columns=["tokenized_words"])
X_test_no_tokens = X_test.drop(columns=["tokenized_words"])

X_train_no_tokens.shape, X_test_no_tokens.shape

((1453600, 12), (256518, 12))

In [33]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

numerical_cols = [col for col in X_train.columns if col != 'tokenized_words']

start_time = time.time()

# column transorfmer
preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), numerical_cols)       # Scale numerical columns
    ], remainder='passthrough'
)

# full pipeline

bow_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    )
)

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}


# Set up GridSearchCV
gs_bow_f_pipe = GridSearchCV(
    bow_pipe, parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229),
    n_jobs=-1)

gs_bow_f_pipe.fit(X_train_no_tokens, y_train)


total_time = time.time() - start_time
print(f"\nTraining completed in: {total_time:.2f} seconds\n\n")



LOGISTIC REGRESSION BOW

Training completed in: 121.47 seconds




In [34]:
print(gs_bow_f_pipe.cv_results_)
print(gs_bow_f_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bow_model_fb.pkl", "wb") as f:
    pickle.dump(gs_bow_f_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bow_model_fb.pkl'")

{'mean_fit_time': array([60.96135998, 62.30540109, 60.23921776, 50.71277094]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([0.02221513, 0.01952696, 0.01981711, 0.02140498]), 'std_score_time': array([0., 0., 0., 0.]), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'logisticregression__C': 10}, {'logisticregression__C': 1}, {'logisticregression__C': 0.01}, {'logisticregression__C': 0.001}], 'split0_test_score': array([0.85258668, 0.85258668, 0.85260503, 0.85256375]), 'mean_test_score': array([0.85258668, 0.85258668, 0.85260503, 0.85256375]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([2, 2, 1, 4], dtype=int32)}
{'logisticregression__C': 0.01}

Best model saved as 'logistic_bow_model_fb.pkl'


In [35]:
# predict
predictions = gs_bow_f_pipe.predict(X_test_no_tokens)
predictions = list(map(round,predictions))
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

[[215286   2989]
 [ 35149   3094]]
              precision    recall  f1-score   support

           0       0.86      0.99      0.92    218275
           1       0.51      0.08      0.14     38243

    accuracy                           0.85    256518
   macro avg       0.68      0.53      0.53    256518
weighted avg       0.81      0.85      0.80    256518

Specificity : 0.9863062650326423
ROC-AUC : 0.5336049799132304


Exception ignored in: <function ResourceTracker.__del__ at 0x107b1dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107e1dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/