##### Libraries

In [14]:
import sys
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

# from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin

np.set_printoptions(threshold=sys.maxsize)

np.random.seed(229)

In [4]:
# Ingore warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

### Data Splits

In [5]:
# read/prep data
dat = pd.read_csv("../data/tokenized_reviews.csv")
dat = dat.dropna()
dat["quote"] = dat["quote"].astype(int)
dat["tokenized_words"] = dat["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '"," "))

In [6]:
# 85% train / 15% test
X_train, X_test, y_train, y_test = train_test_split(dat.drop(columns=["popular"]),
                                                    dat["popular"],
                                                    test_size = 0.15,
                                                    random_state = 229)
X_train.shape, X_test.shape

((1453600, 13), (256518, 13))

In [7]:
# undersample train set
majority_size = len(y_train[y_train==0])
minority_size = len(y_train[y_train==1])
majority_indices = y_train[y_train==0].index
rng = np.random.default_rng(seed=229)
drop_indices = rng.choice(majority_indices, majority_size-minority_size, replace=False)
X_train = X_train.drop(drop_indices)
y_train = y_train.drop(drop_indices)

X_train.shape, X_test.shape

((430924, 13), (256518, 13))

In [9]:
y_train.value_counts()

popular
0    215462
1    215462
Name: count, dtype: int64

In [10]:
y_test.value_counts()

popular
0    218275
1     38243
Name: count, dtype: int64

### Reviews only

In [12]:
X_train_review = X_train[["tokenized_words"]]
X_test_review = X_test[["tokenized_words"]]

X_train_review.shape, X_test_review.shape

((430924, 1), (256518, 1))

In [None]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

start_time = time.time()

# column transorfmer
preprocessor = ColumnTransformer(
    transformers=[
        ('countvectorizer', CountVectorizer(), 'tokenized_words'),  
    ], remainder='passthrough'
)

# full pipeline

bow_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    )
)

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}

# Set up GridSearchCV
gs_bow_pipe = GridSearchCV(
    bow_pipe, parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229),
    n_jobs=-1)

gs_bow_pipe.fit(X_train_review, y_train)


total_time = time.time() - start_time
print(f"\nTraining completed in: {total_time:.2f} seconds\n\n")



LOGISTIC REGRESSION BOW


Exception ignored in: <function ResourceTracker.__del__ at 0x11931dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105a2dd00>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/

In [None]:
print(gs_bow_pipe.cv_results_)
print(gs_bow_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bow_model_a.pkl", "wb") as f:
    pickle.dump(gs_bow_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bow_model_a.pkl'")

In [None]:
# predict
predictions = gs_bow_pipe.predict(X_test_review)
predictions = list(map(round,predictions))

In [None]:
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

### Review Meta Data Only

In [None]:
X_train_no_tokens = X_train.drop(columns=["tokenized_words"])
X_test_no_tokens = X_test.drop(columns=["tokenized_words"])

X_train_no_tokens.shape, X_test_no_tokens.shape

In [None]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

start_time = time.time()

# column transorfmer
preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), numerical_cols)       # Scale numerical columns
    ], remainder='passthrough'
)

# full pipeline

bow_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(
        penalty='l2',
        solver='saga',
        max_iter=5000,
        random_state=229,
        n_jobs=-1
    )
)

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}


# Set up GridSearchCV
gs_bow_pipe = GridSearchCV(
    bow_pipe, parameters,
    cv=ShuffleSplit(n_splits=1, test_size=0.15, random_state=229),
    n_jobs=-1)

gs_bow_pipe.fit(X_train_no_tokens, y_train)


total_time = time.time() - start_time
print(f"\nTraining completed in: {total_time:.2f} seconds\n\n")



LOGISTIC REGRESSION BOW




{'mean_fit_time': array([1170.64856005, 1215.27449656, 1254.74963665,  611.09016752,
       1465.03414679, 1446.66143703, 1411.2590313 , 1449.59315228]), 'std_fit_time': array([0., 0., 0., 0., 0., 0., 0., 0.]), 'mean_score_time': array([8.82783151, 6.01767755, 8.00530434, 7.28969765, 6.81266022,
       6.39399433, 6.1398077 , 8.37145758]), 'std_score_time': array([0., 0., 0., 0., 0., 0., 0., 0.]), 'param_columntransformer__countvectorizer__max_features': masked_array(data=[10000, 10000, 10000, 10000, 50000, 50000, 50000, 50000],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=999999), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001, 10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=1e+20), 'params': [{'columntransformer__countvectorizer__max_features': 10000, 'logisticregression__C': 10}, {'columntransformer__countvectorizer__max_features': 10000, 

In [None]:
print(gs_bow_pipe.cv_results_)
print(gs_bow_pipe.best_params_)

# save the best model with pickle
with open("./logistic_bow_model_b.pkl", "wb") as f:
    pickle.dump(gs_bow_pipe.best_estimator_, f)

print("\nBest model saved as 'logistic_bow_model_b.pkl'")

In [None]:
# predict
predictions = gs_bow_pipe.predict(X_test_no_tokens)
predictions = list(map(round,predictions))

In [None]:
# evaluate
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Specificity :", tn/(fp+tn))
print("ROC-AUC :", roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.79      0.85    218275
           1       0.33      0.61      0.43     38243

    accuracy                           0.76    256518
   macro avg       0.63      0.70      0.64    256518
weighted avg       0.83      0.76      0.79    256518

[[171434  46841]
 [ 14915  23328]]
0.6976988598247426
