##### Libraries

In [28]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin

# from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
np.set_printoptions(threshold=sys.maxsize)

##### Splitting

In [29]:
# read/prep data
dat = pd.read_csv("/content/drive/MyDrive/AML Project/nlp/data/tokenized_reviews_10k.csv")
dat = dat.dropna()
dat["quote"] = dat["quote"].astype(int)
dat["tokenized_words"] = dat["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '"," "))

In [30]:
# 85% train / 15% test
X_train, X_test, y_train, y_test = train_test_split(dat.drop(columns=["popular"]),
                                                    dat["popular"],
                                                    test_size = 0.15,
                                                    random_state = 229)

In [31]:
# undersample train set
majority_size = len(y_train[y_train==0])
minority_size = len(y_train[y_train==1])
majority_indices = y_train[y_train==0].index
rng = np.random.default_rng(seed=229)
drop_indices = rng.choice(majority_indices, majority_size-minority_size, replace=False)
X_train = X_train.drop(drop_indices)
y_train = y_train.drop(drop_indices)

##### Bag Of Words

In [32]:
# BAG OF WORDS
print("\n\nLOGISTIC REGRESSION BOW")

# pipeline
bow_pipe = make_pipeline(
    ColumnTransformer(remainder='passthrough',
                      transformers=[('countvectorizer',
                                     CountVectorizer(),
                                     'tokenized_words')]),
    StandardScaler(with_mean=False),
    LogisticRegression(penalty='l2',
                       solver='saga',
                       max_iter=1000,
                       random_state=229,
                       n_jobs=-1))

# parameters to try
parameters = {
    'columntransformer__countvectorizer__max_features': (10000,50000),
    'logisticregression__C': (10, 1, 0.01, 0.001)
}

# perform validation
gs_bow_pipe = GridSearchCV(bow_pipe,
                           parameters,
                           cv=ShuffleSplit(n_splits=1,
                                           test_size=0.13,
                                           random_state=229))
gs_bow_pipe.fit(X_train, y_train)
print(gs_bow_pipe.cv_results_)
print(gs_bow_pipe.best_params_)



LOGISTIC REGRESSION BOW




{'mean_fit_time': array([6.21396136, 7.92208624, 6.23531294, 1.83750844, 8.77006698,
       8.13105154, 7.71030831, 3.65900946]), 'std_fit_time': array([0., 0., 0., 0., 0., 0., 0., 0.]), 'mean_score_time': array([0.0433259 , 0.04240632, 0.04928327, 0.0433476 , 0.04447103,
       0.07193708, 0.0448544 , 0.07889438]), 'std_score_time': array([0., 0., 0., 0., 0., 0., 0., 0.]), 'param_columntransformer__countvectorizer__max_features': masked_array(data=[10000, 10000, 10000, 10000, 50000, 50000, 50000, 50000],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=999999), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001, 10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=1e+20), 'params': [{'columntransformer__countvectorizer__max_features': 10000, 'logisticregression__C': 10}, {'columntransformer__countvectorizer__max_features': 10000, 'logisticregression__C':

In [33]:
# predict
predictions = gs_bow_pipe.predict(X_test)
predictions = list(map(round,predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.70      0.78      1214
           1       0.30      0.55      0.39       286

    accuracy                           0.67      1500
   macro avg       0.59      0.62      0.58      1500
weighted avg       0.76      0.67      0.70      1500

[[855 359]
 [130 156]]
0.6248689531226599


In [34]:
# feature importance
coefficients = gs_bow_pipe.best_estimator_.named_steps['logisticregression'].coef_[0]
num_nonzero_coefs = len(np.where(abs(coefficients) > 0)[0])
sorted_ind = np.argsort(abs(coefficients))[::-1][:num_nonzero_coefs]
print(len(sorted_ind))

10012


In [35]:
# Access the CountVectorizer from the ColumnTransformer
feature_names = gs_bow_pipe.best_estimator_.named_steps['columntransformer'].named_transformers_['countvectorizer'].get_feature_names_out()
sorted_ind_filtered = sorted_ind[sorted_ind < len(feature_names)]
important_features = np.take(feature_names, sorted_ind_filtered.tolist())
print('important-features:', important_features[:10])

important-features: ['channel' 'arc' 'graphic' 'publish' 'glad' 'already' 'finish' 'review'
 'alright' 'improved']


##### TF-IDF

In [36]:
# TF-IDF
print("\n\nLOGISTIC REGRESSION TF-IDF")

# pipeline
tf_pipe = make_pipeline(
    ColumnTransformer(remainder='passthrough',
                      transformers=[('tfidfvectorizer',
                                     TfidfVectorizer(),
                                     'tokenized_words')]),
    StandardScaler(with_mean=False),
    LogisticRegression(penalty='l2',
                       solver='saga',
                       max_iter=1000,
                       random_state=229,
                       n_jobs=-1))

# parameters to try
parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}

# perform validation
gs_tf_pipe = GridSearchCV(tf_pipe,
                           parameters,
                           cv=ShuffleSplit(n_splits=1,
                                           test_size=0.13,
                                           random_state=229))
gs_tf_pipe.fit(X_train, y_train)
print(gs_tf_pipe.cv_results_)
print(gs_tf_pipe.best_params_)



LOGISTIC REGRESSION TF-IDF




{'mean_fit_time': array([7.09153724, 9.54863071, 7.14545059, 4.34465909]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([0.05497622, 0.0577662 , 0.07501698, 0.04515982]), 'std_score_time': array([0., 0., 0., 0.]), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'logisticregression__C': 10}, {'logisticregression__C': 1}, {'logisticregression__C': 0.01}, {'logisticregression__C': 0.001}], 'split0_test_score': array([0.57075472, 0.57075472, 0.57311321, 0.58254717]), 'mean_test_score': array([0.57075472, 0.57075472, 0.57311321, 0.58254717]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([3, 3, 2, 1], dtype=int32)}
{'logisticregression__C': 0.001}


In [37]:
# predict
predictions = gs_tf_pipe.predict(X_test)
predictions = list(map(round,predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70      1214
           1       0.27      0.65      0.38       286

    accuracy                           0.59      1500
   macro avg       0.57      0.61      0.54      1500
weighted avg       0.76      0.59      0.64      1500

[[702 512]
 [100 186]]
0.6143016785520904


In [38]:
# feature importance
coefficients = gs_tf_pipe.best_estimator_.named_steps['logisticregression'].coef_[0]
num_nonzero_coefs = len(np.where(abs(coefficients) > 0)[0])
sorted_ind = np.argsort(abs(coefficients))[::-1][:num_nonzero_coefs]
print(len(sorted_ind))

23371


In [39]:
# Access the CountVectorizer from the ColumnTransformer
feature_names = gs_bow_pipe.best_estimator_.named_steps['columntransformer'].named_transformers_['countvectorizer'].get_feature_names_out()
sorted_ind_filtered = sorted_ind[sorted_ind < len(feature_names)]
important_features = np.take(feature_names, sorted_ind_filtered.tolist())
print('important-features:', important_features[:10])

important-features: ['forbidden' 'brady' 'talented' 'castle' 'heed' 'sadism' 'awry' 'soldier'
 'alluring' 'pick']


##### Word2vec

Dependencies change for word2vec

In [40]:
# Uninstall to avoid binary incompatibility
!pip uninstall -y numpy gensim

# Install compatible versions
!pip install numpy==1.26.4 gensim --quiet

import numpy
import gensim
print(f"NumPy: {numpy.__version__}\nGensim: {gensim.__version__}")

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mNumPy: 1.26.4
Gensim: 4.3.3


In [41]:
print("\n\nLOGISTIC REGRESSION WORD2VEC")

# Custom transformer for Word2Vec averaging
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1, workers=1, seed=229):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.seed = seed
        self.model = None

    def fit(self, X, y=None):
        # X['tokenized_words'] should be a list of tokens per review
        sentences = X['tokenized_words'].apply(lambda x: x.split()).tolist() # Split the tokenized words into a list of words
        self.model = Word2Vec(sentences, vector_size=self.vector_size, window=self.window,
                              min_count=self.min_count, workers=self.workers, seed=self.seed)
        return self

    def transform(self, X):
        def document_vector(doc):
            doc = [word for word in doc.split() if word in self.model.wv] # Split the tokenized words into a list of words
            if len(doc) == 0:
                return np.zeros(self.vector_size)
            return np.mean(self.model.wv[doc], axis=0)
        return np.vstack(X['tokenized_words'].apply(document_vector))



LOGISTIC REGRESSION WORD2VEC


In [42]:
# Pipeline for Word2Vec + Logistic Regression
w2v_pipe = make_pipeline(
    ColumnTransformer(remainder='passthrough',
                      transformers=[('word2vec',
                                     Word2VecVectorizer(),
                                     ['tokenized_words'])]),
    StandardScaler(),
    LogisticRegression(penalty='l2',
                       solver='saga',
                       max_iter=1000,
                       random_state=229,
                       n_jobs=-1))

parameters = {
    'logisticregression__C': (10, 1, 0.01, 0.001)
}

gs_w2v_pipe = GridSearchCV(w2v_pipe,
                           parameters,
                           cv=ShuffleSplit(n_splits=1,
                                           test_size=0.13,
                                           random_state=229))
gs_w2v_pipe.fit(X_train, y_train)
print(gs_w2v_pipe.cv_results_)



{'mean_fit_time': array([25.80853558, 11.8149097 ,  7.01821661,  4.90150809]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([0.21958208, 0.12714624, 0.14819789, 0.1260345 ]), 'std_score_time': array([0., 0., 0., 0.]), 'param_logisticregression__C': masked_array(data=[10.0, 1.0, 0.01, 0.001],
             mask=[False, False, False, False],
       fill_value=1e+20), 'params': [{'logisticregression__C': 10}, {'logisticregression__C': 1}, {'logisticregression__C': 0.01}, {'logisticregression__C': 0.001}], 'split0_test_score': array([0.62264151, 0.62735849, 0.625     , 0.64386792]), 'mean_test_score': array([0.62264151, 0.62735849, 0.625     , 0.64386792]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([4, 2, 3, 1], dtype=int32)}


In [43]:
print(gs_w2v_pipe.best_params_)

{'logisticregression__C': 0.001}


In [44]:
#predictions
predictions = gs_w2v_pipe.predict(X_test)
predictions = list(map(round,predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.61      0.72      1214
           1       0.28      0.63      0.39       286

    accuracy                           0.62      1500
   macro avg       0.58      0.62      0.55      1500
weighted avg       0.76      0.62      0.66      1500

[[743 471]
 [105 181]]
0.6224467460052304


In [45]:
# Feature importance for word2vec (vector coefficients)
coefficients = gs_w2v_pipe.best_estimator_.named_steps['logisticregression'].coef_[0]
num_nonzero_coefs = len(np.where(abs(coefficients) > 0)[0])
sorted_ind = np.argsort(abs(coefficients))[::-1][:num_nonzero_coefs]
# print(len(sorted_ind))
# print(np.take(coefficients,sorted_ind.tolist()))
print(f"Number of features (word2vec vector size): {len(coefficients)}")

Number of features (word2vec vector size): 112


In [46]:
# the top 10 features
top_10_indices = np.argsort(np.abs(coefficients))[-10:]  # Indices of top 10 features
print("Top 10 feature indices:", top_10_indices)
print("Top 10 coefficients:", coefficients[top_10_indices])

Top 10 feature indices: [109  70 108  65 106 101  30 104 105 100]
Top 10 coefficients: [ 0.01673677 -0.01859409 -0.02030245  0.02272412  0.02382111 -0.02463701
 -0.0250938   0.06562118  0.10062244  0.11947248]
