# Setting up environment

In [20]:
# Google Collab installation requirements
!pip install datasets
!pip install textattack
!pip install scikit-learn
!pip install pymongo



In [21]:
# Dependencies
import pandas as pd
import numpy as np
import datasets
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
import textattack
from textattack.attack_recipes import TextFoolerJin2019
from textattack import Attacker
from abc import ABC, abstractmethod
from pymongo import MongoClient
#from passwords import password

In [22]:
# the Natural Language Toolkit & tokenizer
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
# Connect to MongoDB

# Initialize authentication & cursor variables
password = 'WJwxIjCdmlGoCrN7'
mongo_uri = f'mongodb+srv://kkosek:{password}@cluster0.lv4rmyj.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
db_name = 'walmart'
collection_name = 'scraper'


# Create a client
client = MongoClient(mongo_uri)
# Connect to the 'walmart' database
db = client[db_name]
# Open the 'scraper' collection
collection = db[collection_name]

# Fetch all documents from the collection
cursor = collection.find({})

# Convert documents to a list of dictionaries
documents = list(cursor)

# Close the cursor and client
cursor.close()
client.close()

# Convert the list of dicts to master DataFrame
df = pd.DataFrame(documents)

# Binary Preprocessing

In [30]:
# Binary Sentiment Labelling

# Create copy of master df
binary_df = df.copy()
# Create a copy of the 'stars' column where 'stars' is equal to 5
five = binary_df.loc[ binary_df['stars'] == 5 ].copy()
# Add a new column in the length of the DataFrame with all 1s to bin 5stars
five['label'] = pd.Series( [x/x for x in range(1,len(five)+1)] , index=five.index )
# Create a copy of the 'stars' column where 'stars' is equal to 1
one = binary_df.loc[ binary_df['stars'] == 1 ].copy()
# Add a new column in the length of the DataFrame with all 0s to bin 1stars
one['label'] = pd.Series( [((x/x)-1) for x in range(1,len(one)+1)] , index=one.index )
# Concat the binary sentiment df
pos_neg = pd.concat( [five,one] )

# Overwrite the master set to only contain the 'text' and 'label' data (& reset index)
df = pos_neg[['text','label']].reset_index(drop=True)
# Clear all non-alphabetic characters out of text file
df["text"] = df["text"].apply(lambda x: re.sub("[^a-zA-Z]", " ", str(x)))

KeyError: 'stars'

# Data Preparation

In [None]:
# Separate out into features (X) and target (y)
df_X = df['text']
df_y = df['label']

# Perform a test split on the features & target
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.15)

# Restore the split data into DataFrame objects now organized by training & testing
df_train = pd.DataFrame([X_train, y_train]).T
df_test =pd.DataFrame([X_test, y_test]).T

# Reset indices
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Relabel our columns
df_train.columns = ['text', 'label']
df_test.columns = ['text', 'label']

# Retype the label to be int
df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

# Fitting & Training

In [None]:
# TF-IDF Vectorization of text examples

# Initialize Vectorizer object
Tfidf = TfidfVectorizer( ngram_range=(1, 3), max_features=100 )

# Fit the vectorizer with the text data
unstemmed_tfidf_vect_fit = Tfidf.fit(df_train['text'])

# Using the text data, vectorize the training 'words' with respect to their own frequency
# throughout the training corpus
Tfidf_training = unstemmed_tfidf_vect_fit.transform(df_train['text'])
# Convert those vectors into a DataFrame object
df_train_tfidf_unstem = pd.DataFrame( Tfidf_training.toarray() )

# Vectorize the testing 'words' with respect to their own frequency
# throughout the training corpus
Tfidf_testing = unstemmed_tfidf_vect_fit.transform(df_test['text'])
# Convert those vectors into a DataFrame object
df_test_tfidf_unstem = pd.DataFrame( Tfidf_testing.toarray() )


# Classifier & Testing Accuracy

In [None]:
# Define our classifier model
log_reg = LogisticRegression(C=30, max_iter=200)
# Fit that model on the stemmed & tokenized text examples with their recorded label (pos or neg)
log_reg = log_reg.fit(df_train_tfidf_unstem, df_train["label"])
# Use fitted classifier to predict the label from the testing stems
y_pred = log_reg.predict(df_test_tfidf_unstem)

print(classification_report(df_test["label"], y_pred))
print(f"Confusion Matrix:\n{metrics.confusion_matrix(y_test, y_pred)}")

# Class Restructuring

In [None]:
# This cell bypasses a bug encountered with the source code of the API. The method '.get_feature_names()' was
# deprecated to '.get_feature_names_out()' in scikit >= 1.0.0, so the API encounters the error during normal functioning.
# (I don't know enough about subclassing to do it with less code than hardwiring it in a jupyter cell..)

class ModelWrapper(ABC):
    """A model wrapper queries a model with a list of text inputs.

    Classification-based models return a list of lists, where each sublist
    represents the model's scores for a given input.

    Text-to-text models return a list of strings, where each string is the
    output – like a translation or summarization – for a given input.
    """

    @abstractmethod
    def __call__(self, text_input_list, **kwargs):
        raise NotImplementedError()

    def get_grad(self, text_input):
        """Get gradient of loss with respect to input tokens."""
        raise NotImplementedError()

    def _tokenize(self, inputs):
        """Helper method for `tokenize`"""
        raise NotImplementedError()

    def tokenize(self, inputs, strip_prefix=False):
        """Helper method that tokenizes input strings
        Args:
            inputs (list[str]): list of input strings
            strip_prefix (bool): If `True`, we strip auxiliary characters added to tokens as prefixes (e.g. "##" for BERT, "Ġ" for RoBERTa)
        Returns:
            tokens (list[list[str]]): List of list of tokens as strings
        """
        tokens = self._tokenize(inputs)
        if strip_prefix:
            # `aux_chars` are known auxiliary characters that are added to tokens
            strip_chars = ["##", "Ġ", "__"]
            # TODO: Find a better way to identify prefixes. These depend on the model, so cannot be resolved in ModelWrapper.

            def strip(s, chars):
                for c in chars:
                    s = s.replace(c, "")
                return s

            tokens = [[strip(t, strip_chars) for t in x] for x in tokens]

        return tokens

class SklearnModelWrapper(ModelWrapper):
    """Loads a scikit-learn model and tokenizer (tokenizer implements
    `transform` and model implements `predict_proba`).

    May need to be extended and modified for different types of
    tokenizers.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text_input_list, batch_size=None):
        encoded_text_matrix = self.tokenizer.transform(text_input_list).toarray()
        tokenized_text_df = pd.DataFrame(
            encoded_text_matrix)
        return self.model.predict_proba(tokenized_text_df)

    def get_grad(self, text_input):
        raise NotImplementedError()

# Launching our attack

In [29]:
# Define a wrapper object to function similarly to a pipeline and hold our fitted classifier model
# with our fitted vectorizor. The wrapper is engineered to function within the textattack architecture
model_wrapper = SklearnModelWrapper(log_reg, unstemmed_tfidf_vect_fit)

# The textattack architecture functions on textattack.datasets.Dataset objects
# The convertor accepts a list of tuples containing inputs and output examples
# For instance, ("I like this product", 1) represents a tuple containing an input and output
# Thus, we create a list comprehension to compile the df['text'] & df['label'] into this format
data = [(df_train['text'][x], int(df_train['label'][x])) for x in range(0,(len(df_train)))]\
# Then we call the textattack converter to assemble our data into the architecture
dataset = textattack.datasets.Dataset(data)

# The attack recipe in this attack is based on TextFooler, an adversarial attacker on NLP datasets that functions
# by trying out iterations of tokens that can be modified slightly to cause the model to misclassify the sentiment
# This attacker is pretrained and loaded into textattack libraries, so we can call it and build the adversarial
# model based on our NLP model
attack = TextFoolerJin2019.build(model_wrapper)
# We can specifically add more arguments to the Attacker class, but the pretrained model is optimal as is
attacker = Attacker(attack, dataset)
# Call .attack_dataset() to create .attack instances across the whole dataset
attacker.attack_dataset()

textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [00:34<00:00, 13.9MB/s]
textattack: Unzipping file /root/.cache/textattack/tmpgtugauzr.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.
textattack: Unknown if model of class <class 'sklearn.linear_model._logistic.LogisticRegression'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  10%|█         | 1/10 [00:00<00:01,  7.66it/s]

--------------------------------------------- Result 1 ---------------------------------------------

BUYERS I bought an Xbox and revived a box of Valentine Little Debbie cakes 




[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  20%|██        | 2/10 [00:49<03:16, 24.59s/it]

--------------------------------------------- Result 2 ---------------------------------------------


[Succeeded / Failed / Skipped / Total] 1 / 1 / 1 / 3:  30%|███       | 3/10 [00:49<01:55, 16.56s/it]


[[Great]] [[tv]] with beautiful [[picture]] and [[good]] sound  [[Would]] definitely recommend 

[[Awesome]] [[broadcasting]] with beautiful [[archives]] and [[adequate]] sound  [[Do]] definitely recommend 


--------------------------------------------- Result 3 ---------------------------------------------

handle fell off within two weeks




[Succeeded / Failed / Skipped / Total] 2 / 1 / 1 / 4:  40%|████      | 4/10 [00:50<01:15, 12.54s/it]

--------------------------------------------- Result 4 ---------------------------------------------

I read all the reviews before purchasing this mattress and felt fairly confident of the quality  I was a little uneasy buying a mattress on line but everything that I read ended up being spot on and it s a [[great]] mattress for the [[price]]  My daughter loves it and I got a [[great]] value  I highly recommend this product 

I read all the reviews before purchasing this mattress and felt fairly confident of the quality  I was a little uneasy buying a mattress on line but everything that I read ended up being spot on and it s a [[admirable]] mattress for the [[priced]]  My daughter loves it and I got a [[admirable]] value  I highly recommend this product 




[Succeeded / Failed / Skipped / Total] 3 / 2 / 1 / 6:  60%|██████    | 6/10 [00:51<00:34,  8.51s/it]

--------------------------------------------- Result 5 ---------------------------------------------

not adequate for even an occasional guest bed horrible  poor choice stay away


--------------------------------------------- Result 6 ---------------------------------------------

I m upset that I can t use my tumbler because wrong straw sent with it  It wouldn t be so bad if I [[had]] an extra taller straw at home to use  Other than that the cup is beautiful 

I m upset that I can t use my tumbler because wrong straw sent with it  It wouldn t be so bad if I [[was]] an extra taller straw at home to use  Other than that the cup is beautiful 




[Succeeded / Failed / Skipped / Total] 4 / 2 / 1 / 7:  70%|███████   | 7/10 [00:51<00:21,  7.33s/it]

--------------------------------------------- Result 7 ---------------------------------------------

With  K resolution  it really stands out if you want to make a statement in any room  One of the best features for me was that they included Bluetooth connectivity to any headphones  so if you wanna blast that movie at night by all means  It also syncs with your PS  and your fire stick TV [[so]] you don t need so one remote 

With  K resolution  it really stands out if you want to make a statement in any room  One of the best features for me was that they included Bluetooth connectivity to any headphones  so if you wanna blast that movie at night by all means  It also syncs with your PS  and your fire stick TV [[after]] you don t need so one remote 




[Succeeded / Failed / Skipped / Total] 5 / 3 / 1 / 9:  90%|█████████ | 9/10 [00:54<00:06,  6.09s/it]

--------------------------------------------- Result 8 ---------------------------------------------

I bought the xbox series x online  When it got here i hooked it up and it turned on then   seconds later it turned off  I tried for a half an hour to get it to say on  Finally i got it to work and setup everything  Then start downloading the first game then it shutoff again  Did that for awhile till i got fed up and called customer service to send me a replacement 


--------------------------------------------- Result 9 ---------------------------------------------

[[Great]] [[price]] for what you get

[[Awesome]] [[airfare]] for what you get




[Succeeded / Failed / Skipped / Total] 6 / 3 / 1 / 10: 100%|██████████| 10/10 [00:54<00:00,  5.50s/it]

--------------------------------------------- Result 10 ---------------------------------------------

I had been wanting a larger screen TV than a    inch  So I went to Walmart to check out the larger screen TV  I ended up getting a Vizio    inch   k uhd led  I [[love]] it has a fantastic vivid picture  Most of a [[great]] low Walmart price  Very happy 

I had been wanting a larger screen TV than a    inch  So I went to Walmart to check out the larger screen TV  I ended up getting a Vizio    inch   k uhd led  I [[adore]] it has a fantastic vivid picture  Most of a [[awesome]] low Walmart price  Very happy 



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 6      |
| Number of failed attacks:     | 3      |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 90.0%  |
| Accuracy under attack:        | 30.0%  |
| Attack success rate:          | 66.6




[<textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7d42c8cff520>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42cba6c340>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7d42d07fba00>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42cbc17100>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7d42d03ef6a0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42d14c1db0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42d03ef0a0>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7d42c86330d0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42c8630070>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7d42c8632e30>]