# Setting up environment

In [74]:
# Google Collab installation requirements
!pip install datasets
!pip install textattack
!pip install scikit-learn



In [75]:
# Dependencies
import pandas as pd
import numpy as np
import datasets
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
import textattack
from textattack.attack_recipes import TextFoolerJin2019
from textattack import Attacker
from abc import ABC, abstractmethod

In [76]:
# the Natural Language Toolkit & tokenizer
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [77]:
# Connect to MongoDB

# Initialize authentication & cursor variables
mongo_uri = f'mongodb+srv://kkosek:{password}@cluster0.lv4rmyj.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
db_name = 'walmart'
collection_name = 'scraper'

# Create a client
client = MongoClient(mongo_uri)
# Connect to the 'walmart' database
db = client[db_name]  
# Open the 'scraper' collection
collection = db[collection_name] 

# Fetch all documents from the collection
cursor = collection.find({})

# Convert documents to a list of dictionaries
documents = list(cursor)

# Close the cursor and client
cursor.close()
client.close()

# Convert the list of dicts to master DataFrame
df = pd.DataFrame(documents)

# Binary Preprocessing

In [78]:
# Binary Sentiment Labelling

# Create copy of master df
binary_df = df.copy()
# Create a copy of the 'stars' column where 'stars' is equal to 5
five = binary_df.loc[ binary_df['stars'] == 5 ].copy()
# Add a new column in the length of the DataFrame with all 1s to bin 5stars
five['label'] = pd.Series( [x/x for x in range(1,len(five)+1)] , index=five.index )
# Create a copy of the 'stars' column where 'stars' is equal to 1
one = binary_df.loc[ binary_df['stars'] == 1 ].copy()
# Add a new column in the length of the DataFrame with all 0s to bin 1stars
one['label'] = pd.Series( [((x/x)-1) for x in range(1,len(one)+1)] , index=one.index )
# Concat the binary sentiment df
pos_neg = pd.concat( [five,one] )

# Overwrite the master set to only contain the 'text' and 'label' data (& reset index)
df = pos_neg[['text','label']].reset_index(drop=True)
# Clear all non-alphabetic characters out of text file
df["text"] = df["text"].apply(lambda x: re.sub("[^a-zA-Z]", " ", str(x)))

# Data Preparation

In [79]:
# Separate out into features (X) and target (y)
df_X = df['text']
df_y = df['label']

# Perform a test split on the features & target
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.15)

# Restore the split data into DataFrame objects now organized by training & testing
df_train = pd.DataFrame([X_train, y_train]).T
df_test =pd.DataFrame([X_test, y_test]).T

# Reset indices
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Relabel our columns
df_train.columns = ['text', 'label']
df_test.columns = ['text', 'label']

# Retype the label to be int
df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

# Fitting & Training

In [80]:
# TF-IDF Vectorization of text examples

# Initialize Vectorizer object
Tfidf = TfidfVectorizer( ngram_range=(1, 3), max_features=100 )

# Fit the vectorizer with the text data
unstemmed_tfidf_vect_fit = Tfidf.fit(df_train['text'])

# Using the text data, vectorize the training 'words' with respect to their own frequency
# throughout the training corpus
Tfidf_training = unstemmed_tfidf_vect_fit.transform(df_train['text'])
# Convert those vectors into a DataFrame object
df_train_tfidf_unstem = pd.DataFrame( Tfidf_training.toarray() )

# Vectorize the testing 'words' with respect to their own frequency
# throughout the training corpus
Tfidf_testing = unstemmed_tfidf_vect_fit.transform(df_test['text'])
# Convert those vectors into a DataFrame object
df_test_tfidf_unstem = pd.DataFrame( Tfidf_testing.toarray() )


# Classifier & Testing Accuracy

In [81]:
# Define our classifier model
log_reg = LogisticRegression(C=30, max_iter=200)
# Fit that model on the stemmed & tokenized text examples with their recorded label (pos or neg)
log_reg = log_reg.fit(df_train_tfidf_unstem, df_train["label"])
# Use fitted classifier to predict the label from the testing stems
y_pred = log_reg.predict(df_test_tfidf_unstem)

print(classification_report(df_test["label"], y_pred))
print(f"Confusion Matrix:\n{metrics.confusion_matrix(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       276
           1       0.84      0.83      0.83       282

    accuracy                           0.83       558
   macro avg       0.83      0.83      0.83       558
weighted avg       0.83      0.83      0.83       558

Confusion Matrix:
[[230  46]
 [ 48 234]]


# Class Restructuring

In [82]:
# This cell bypasses a bug encountered with the source code of the API. The method '.get_feature_names()' was
# deprecated to '.get_feature_names_out()' in scikit >= 1.0.0, so the API encounters the error during normal functioning.
# (I don't know enough about subclassing to do it with less code than hardwiring it in a jupyter cell..)

class ModelWrapper(ABC):
    """A model wrapper queries a model with a list of text inputs.

    Classification-based models return a list of lists, where each sublist
    represents the model's scores for a given input.

    Text-to-text models return a list of strings, where each string is the
    output – like a translation or summarization – for a given input.
    """

    @abstractmethod
    def __call__(self, text_input_list, **kwargs):
        raise NotImplementedError()

    def get_grad(self, text_input):
        """Get gradient of loss with respect to input tokens."""
        raise NotImplementedError()

    def _tokenize(self, inputs):
        """Helper method for `tokenize`"""
        raise NotImplementedError()

    def tokenize(self, inputs, strip_prefix=False):
        """Helper method that tokenizes input strings
        Args:
            inputs (list[str]): list of input strings
            strip_prefix (bool): If `True`, we strip auxiliary characters added to tokens as prefixes (e.g. "##" for BERT, "Ġ" for RoBERTa)
        Returns:
            tokens (list[list[str]]): List of list of tokens as strings
        """
        tokens = self._tokenize(inputs)
        if strip_prefix:
            # `aux_chars` are known auxiliary characters that are added to tokens
            strip_chars = ["##", "Ġ", "__"]
            # TODO: Find a better way to identify prefixes. These depend on the model, so cannot be resolved in ModelWrapper.

            def strip(s, chars):
                for c in chars:
                    s = s.replace(c, "")
                return s

            tokens = [[strip(t, strip_chars) for t in x] for x in tokens]

        return tokens

class SklearnModelWrapper(ModelWrapper):
    """Loads a scikit-learn model and tokenizer (tokenizer implements
    `transform` and model implements `predict_proba`).

    May need to be extended and modified for different types of
    tokenizers.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text_input_list, batch_size=None):
        encoded_text_matrix = self.tokenizer.transform(text_input_list).toarray()
        tokenized_text_df = pd.DataFrame(
            encoded_text_matrix)
        return self.model.predict_proba(tokenized_text_df)

    def get_grad(self, text_input):
        raise NotImplementedError()

# Launching our attack

In [83]:
# Define a wrapper object to function similarly to a pipeline and hold our fitted classifier model
# with our fitted vectorizor. The wrapper is engineered to function within the textattack architecture
model_wrapper = SklearnModelWrapper(log_reg, unstemmed_tfidf_vect_fit)

# The textattack architecture functions on textattack.datasets.Dataset objects
# The convertor accepts a list of tuples containing inputs and output examples
# For instance, ("I like this product", 1) represents a tuple containing an input and output
# Thus, we create a list comprehension to compile the df['text'] & df['label'] into this format
data = [(df_train['text'][x], int(df_train['label'][x])) for x in range(0,(len(df_train)))]\
# Then we call the textattack converter to assemble our data into the architecture
dataset = textattack.datasets.Dataset(data)

# The attack recipe in this attack is based on TextFooler, an adversarial attacker on NLP datasets that functions
# by trying out iterations of tokens that can be modified slightly to cause the model to misclassify the sentiment
# This attacker is pretrained and loaded into textattack libraries, so we can call it and build the adversarial
# model based on our NLP model
attack = TextFoolerJin2019.build(model_wrapper)
# We can specifically add more arguments to the Attacker class, but the pretrained model is optimal as is
attacker = Attacker(attack, dataset)
# Call .attack_dataset() to create .attack instances across the whole dataset
attacker.attack_dataset()

textattack: Unknown if model of class <class 'sklearn.linear_model._logistic.LogisticRegression'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  10%|█         | 1/10 [00:00<00:00, 118.04it/s]

--------------------------------------------- Result 1 ---------------------------------------------

After one night sleeping on new bed my daughter s eyes  were almost swollen shut  day   her face red and swollen  day    and she is still crying in misery  What is in this mattress    I need a refund ASAP   




[Succeeded / Failed / Skipped / Total] 1 / 0 / 1 / 2:  20%|██        | 2/10 [00:13<00:55,  6.88s/it]

--------------------------------------------- Result 2 ---------------------------------------------

[[Easy]] to order     pick up  Works [[great]] 

[[Convenience]] to order     pick up  Works [[admirable]] 




[Succeeded / Failed / Skipped / Total] 2 / 0 / 3 / 5:  50%|█████     | 5/10 [00:14<00:14,  2.98s/it]

--------------------------------------------- Result 3 ---------------------------------------------

I [[love]] them  I [[love]] that I can just slip them on and have my feet be warm  They are so comfortable  I am going to look at another color for a variety  I m not the type of person who goes for pointed heels or tight binding shoes that hurt your feet  I love comfort and a shoe that can also be warn with just about anything  When I take them off I feel like I m still wearing comfort 

I [[adore]] them  I [[loves]] that I can just slip them on and have my feet be warm  They are so comfortable  I am going to look at another color for a variety  I m not the type of person who goes for pointed heels or tight binding shoes that hurt your feet  I love comfort and a shoe that can also be warn with just about anything  When I take them off I feel like I m still wearing comfort 


--------------------------------------------- Result 4 ---------------------------------------------

My daught

[Succeeded / Failed / Skipped / Total] 3 / 0 / 3 / 6:  60%|██████    | 6/10 [00:15<00:10,  2.54s/it]

--------------------------------------------- Result 6 ---------------------------------------------

It s very [[easy]] to sign in on the tv   I like it s it s very nice   Nice screen I purchase another one

It s very [[simple]] to sign in on the tv   I like it s it s very nice   Nice screen I purchase another one




[Succeeded / Failed / Skipped / Total] 3 / 1 / 3 / 7:  70%|███████   | 7/10 [00:16<00:06,  2.31s/it]

--------------------------------------------- Result 7 ---------------------------------------------

It came broken in this corner  I m going to return it 




[Succeeded / Failed / Skipped / Total] 3 / 2 / 3 / 8:  80%|████████  | 8/10 [00:16<00:04,  2.10s/it]

--------------------------------------------- Result 8 ---------------------------------------------

Handle broke after a few months of use




[Succeeded / Failed / Skipped / Total] 4 / 2 / 3 / 9:  90%|█████████ | 9/10 [00:17<00:01,  1.97s/it]

--------------------------------------------- Result 9 ---------------------------------------------

[[I]] Bought this for my brother and he was [[very]] pleased  [[I]] was able to ship it to his address  I [[love]] that  [[Easy]] transaction

[[me]] Bought this for my brother and he was [[absolutely]] pleased  [[me]] was able to ship it to his address  I [[adore]] that  [[Comfortably]] transaction




[Succeeded / Failed / Skipped / Total] 5 / 2 / 3 / 10: 100%|██████████| 10/10 [00:18<00:00,  1.80s/it]

--------------------------------------------- Result 10 ---------------------------------------------

The [[screen]] was shattered when we [[bought]] it so we returned it   Usually Visio brand is our favorite 

The [[dropper]] was shattered when we [[purchase]] it so we returned it   Usually Visio brand is our favorite 



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 5      |
| Number of failed attacks:     | 2      |
| Number of skipped attacks:    | 3      |
| Original accuracy:            | 70.0%  |
| Accuracy under attack:        | 20.0%  |
| Attack success rate:          | 71.43% |
| Average perturbed word %:     | 13.26% |
| Average num. words per input: | 25.3   |
| Avg num queries:              | 110.86 |
+-------------------------------+--------+





[<textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7bdb3c972950>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7bdb3d040a90>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7bdb3cc374f0>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7bdb3ccca170>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7bdb3ccca6b0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7bdb3c973bb0>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7bdb3d1c15a0>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7bdb3d041e40>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7bdb3d040be0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7bdb3d043670>]