# Setting up environment

In [10]:
# Google Collab installation requirements
!pip install datasets
!pip install textattack
!pip install scikit-learn
!pip install pymongo



In [11]:
# Dependencies
import pandas as pd
import numpy as np
import datasets
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
import textattack
from textattack.attack_recipes import TextFoolerJin2019
from textattack import Attacker
from abc import ABC, abstractmethod
from pymongo import MongoClient
#from passwords import password
password = 'WJwxIjCdmlGoCrN7'


In [12]:
# the Natural Language Toolkit & tokenizer
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Connect to MongoDB

# Initialize authentication & cursor variables
mongo_uri = f'mongodb+srv://kkosek:{password}@cluster0.lv4rmyj.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
db_name = 'walmart'
collection_name = 'scraper'

# Create a client
client = MongoClient(mongo_uri)
# Connect to the 'walmart' database
db = client[db_name]
# Open the 'scraper' collection
collection = db[collection_name]

# Fetch all documents from the collection
cursor = collection.find({})

# Convert documents to a list of dictionaries
documents = list(cursor)

# Close the cursor and client
cursor.close()
client.close()

# Convert the list of dicts to master DataFrame
df = pd.DataFrame(documents)

# Binary Preprocessing

In [22]:
# Binary Sentiment Labelling

# Create copy of master df
binary_df = df.copy()
# Create a copy of the 'stars' column where 'stars' is equal to 5
five = binary_df.loc[ binary_df['stars'] == 5 ].copy()
# Add a new column in the length of the DataFrame with all 1s to bin 5stars
five['label'] = pd.Series( [x/x for x in range(1,len(five)+1)] , index=five.index )
# Create a copy of the 'stars' column where 'stars' is equal to 1
one = binary_df.loc[ binary_df['stars'] == 1 ].copy()
# Add a new column in the length of the DataFrame with all 0s to bin 1stars
one['label'] = pd.Series( [((x/x)-1) for x in range(1,len(one)+1)] , index=one.index )
# Concat the binary sentiment df
pos_neg = pd.concat( [five,one] )

# Overwrite the master set to only contain the 'text' and 'label' data (& reset index)
df = pos_neg[['text','label']].reset_index(drop=True)
# Clear all non-alphabetic characters out of text file
df["text"] = df["text"].apply(lambda x: re.sub("[^a-zA-Z]", " ", str(x)))

KeyError: 'stars'

# *Stemming Algorithm*

In [34]:
# Tokenize Reviews in training

# Start by copying the master into df_tokenized
df_tokenized = df.copy()
# Loop through the column and tokenize the text
tokened_reviews = [word_tokenize(rev) for rev in df_tokenized["text"]]


The Porter stemming algorithm classifies every character in a given token as either a consonant ("c") or vowel ("v"), grouping subsequent consonants as "C" and subsequent vowels as "V."

In [35]:
# Create word stems
stemmed_tokens = []
# Initialize a Stemming object
porter = PorterStemmer()
# Loop through the tokenized reviews and create stemmed_tokens
for i in range(len(tokened_reviews)):
    # Encode the characters
    stems = [porter.stem(token) for token in tokened_reviews[i]]
    # Join the encodings
    stems = " ".join(stems)
    # append encodings back into words that the computer understands
    stemmed_tokens.append(stems)
# Insert this information into the df
df_tokenized.insert(1, column="stemmed", value=stemmed_tokens)
df_tokenized

Unnamed: 0,text,stemmed,label
0,My last Vizio tv was a inch I think It was...,my last vizio tv wa a inch i think it wa almos...,1.0
1,I purchased the Vizio K TV In part on the...,i purchas the vizio k tv in part on the streng...,1.0
2,This Vizio K UHD is easy to setup may take so...,thi vizio k uhd is easi to setup may take some...,1.0
3,Love the size the clarity and the sound ama...,love the size the clariti and the sound amaz m...,1.0
4,Great TV picture is great not a fan of the l...,great tv pictur is great not a fan of the leg ...,1.0
...,...,...,...
3710,Although I have quite a few pair that are not ...,although i have quit a few pair that are not l...,0.0
3711,Disappointed I wear a baya lined now and thi...,disappoint i wear a baya line now and thi new ...,0.0
3712,These weren t the classic lined crocs I was lo...,these weren t the classic line croc i wa look ...,0.0
3713,Did not get my crocs Could not track my crocs...,did not get my croc could not track my croc em...,0.0


# Data Preparation

In [36]:
# Separate out into features (X) and target (y)
df_tokenized_X = df_tokenized['stemmed']
df_tokenized_y = df_tokenized['label']

# Perform a test split on the features & target
X_train, X_test, y_train, y_test = train_test_split(
    df_tokenized_X, df_tokenized_y, test_size=0.1)

# Restore the split data into DataFrame objects now organized by training & testing
df_train = pd.DataFrame([X_train, y_train]).T
df_test =pd.DataFrame([X_test, y_test]).T

# Reset indices
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Relabel our columns
df_train.columns = ['stemmed', 'label']
df_test.columns = ['stemmed', 'label']

# Retype the label to be int
df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

# Fitting & Training

In [37]:
# TF-IDF Vectorization of text examples

# Initialize Vectorizer object
Tfidf = TfidfVectorizer(
    ngram_range=(1, 3), max_features=100
    )
# Fit the vectorizer with the tokenized & stemmed text data
stemmed_tfidf_vect_fit = Tfidf.fit(df_train['stemmed'])

# Using the fitted stemmed text data, vectorize the training stems with respect to their own frequency
# throughout the training corpus
Tfidf_training = stemmed_tfidf_vect_fit.transform(df_train['stemmed'])
# Convert those vectors into a DataFrame object
df_train_tfidf_stem = pd.DataFrame( Tfidf_training.toarray() )

# Using the fitted stemmed text data, vectorize the testing stems with respect to their own frequency
# throughout the training corpus
Tfidf_testing = stemmed_tfidf_vect_fit.transform(df_test['stemmed'])
# Convert those vectors into a DataFrame object
df_test_tfidf_stem = pd.DataFrame( Tfidf_testing.toarray() )


# Classifier & Testing Accuracy

In [38]:
# Define our classifier model
log_reg = LogisticRegression(C=30, max_iter=200)
# Fit that model on the stemmed & tokenized text examples with their recorded label (pos or neg)
log_reg = log_reg.fit(df_train_tfidf_stem, df_train["label"])
# Use fitted classifier to predict the label from the testing stems
y_pred = log_reg.predict(df_test_tfidf_stem)

print(classification_report(df_test["label"], y_pred))  # Evaluating prediction ability
print(f"Confusion Matrix:\n{metrics.confusion_matrix(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.87      0.84      0.86       183
           1       0.85      0.88      0.86       189

    accuracy                           0.86       372
   macro avg       0.86      0.86      0.86       372
weighted avg       0.86      0.86      0.86       372

Confusion Matrix:
[[154  29]
 [ 23 166]]


# Class Restructuring

In [39]:
# This cell bypasses a bug encountered with the source code of the API. The method '.get_feature_names()' was
# deprecated to '.get_feature_names_out()' in scikit >= 1.0.0, so the API encounters the error during normal functioning.
# (I don't know enough about subclassing to do it with less code than hardwiring it in a jupyter cell..)

class ModelWrapper(ABC):
    """A model wrapper queries a model with a list of text inputs.

    Classification-based models return a list of lists, where each sublist
    represents the model's scores for a given input.

    Text-to-text models return a list of strings, where each string is the
    output – like a translation or summarization – for a given input.
    """

    @abstractmethod
    def __call__(self, text_input_list, **kwargs):
        raise NotImplementedError()

    def get_grad(self, text_input):
        """Get gradient of loss with respect to input tokens."""
        raise NotImplementedError()

    def _tokenize(self, inputs):
        """Helper method for `tokenize`"""
        raise NotImplementedError()

    def tokenize(self, inputs, strip_prefix=False):
        """Helper method that tokenizes input strings
        Args:
            inputs (list[str]): list of input strings
            strip_prefix (bool): If `True`, we strip auxiliary characters added to tokens as prefixes (e.g. "##" for BERT, "Ġ" for RoBERTa)
        Returns:
            tokens (list[list[str]]): List of list of tokens as strings
        """
        tokens = self._tokenize(inputs)
        if strip_prefix:
            # `aux_chars` are known auxiliary characters that are added to tokens
            strip_chars = ["##", "Ġ", "__"]
            # TODO: Find a better way to identify prefixes. These depend on the model, so cannot be resolved in ModelWrapper.

            def strip(s, chars):
                for c in chars:
                    s = s.replace(c, "")
                return s

            tokens = [[strip(t, strip_chars) for t in x] for x in tokens]

        return tokens

class SklearnModelWrapper(ModelWrapper):
    """Loads a scikit-learn model and tokenizer (tokenizer implements
    `transform` and model implements `predict_proba`).

    May need to be extended and modified for different types of
    tokenizers.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text_input_list, batch_size=None):
        encoded_text_matrix = self.tokenizer.transform(text_input_list).toarray()
        tokenized_text_df = pd.DataFrame(

            # Remove depracated "get_feature_names() method"

            encoded_text_matrix)
        return self.model.predict_proba(tokenized_text_df)

    def get_grad(self, text_input):
        raise NotImplementedError()

# Launching our attack

In [40]:
# Define a wrapper object to function similarly to a pipeline and hold our fitted classifier model
# with our fitted vectorizor. The wrapper is engineered to function within the textattack architecture
model_wrapper = SklearnModelWrapper(log_reg, stemmed_tfidf_vect_fit)

# The textattack architecture functions on textattack.datasets.Dataset objects
# The convertor accepts a list of tuples containing inputs and output examples.
# For instance, ("I like this product", 1) represents a tuple containing an input and output
# Thus, we create a list comprehension to compile the df['text'] & df['label'] into this format
data = [(df_train['stemmed'][x], int(df_train['label'][x])) for x in range(0,(len(df_train)))]\
# Then we call the textattack converter to assemble our data into the architecture
dataset = textattack.datasets.Dataset(data)

# The attack recipe in this attack is based on TextFooler, an adversarial attacker on NLP datasets that functions
# by trying out iterations of tokens that can be modified slightly to cause the model to misclassify the sentiment
# This attacker is pretrained and loaded into textattack libraries, so we can call it and build the adversarial
# model based on our NLP model
attack = TextFoolerJin2019.build(model_wrapper)
# We can specifically add more arguments to the Attacker class, but the pretrained model is optimal as is
attacker = Attacker(attack, dataset)
# Call .attack_dataset() to create .attack instances across the whole dataset
attacker.attack_dataset()

textattack: Unknown if model of class <class 'sklearn.linear_model._logistic.LogisticRegression'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 1 / 1 / 0 / 2:  20%|██        | 2/10 [00:07<00:29,  3.66s/it]

--------------------------------------------- Result 1 ---------------------------------------------

thi tv is awesom i still haven t tri out all of the differ featur as of yet but it s definit worth the [[price]] the onli problem wa that it didn t [[come]] with a remot control in the [[box]] and i had to contact vizio to get one

thi tv is awesom i still haven t tri out all of the differ featur as of yet but it s definit worth the [[award]] the onli problem wa that it didn t [[be]] with a remot control in the [[boxes]] and i had to contact vizio to get one


--------------------------------------------- Result 2 ---------------------------------------------

awesom pictur and great valu for the price




[Succeeded / Failed / Skipped / Total] 2 / 2 / 0 / 4:  40%|████      | 4/10 [00:07<00:11,  1.93s/it]

--------------------------------------------- Result 3 ---------------------------------------------

hi the item wa veri good but the deliveri person deliv it down stair itself our hous is in nd floor and we [[had]] to carri the heavi [[mattress]] upstair which wa veri difficult we [[order]] deliveri onli becaus we can t carri the mattress floor up it would be help if thi doesn t happen again

hi the item wa veri good but the deliveri person deliv it down stair itself our hous is in nd floor and we [[got]] to carri the heavi [[bed]] upstair which wa veri difficult we [[commands]] deliveri onli becaus we can t carri the mattress floor up it would be help if thi doesn t happen again


--------------------------------------------- Result 4 ---------------------------------------------

the pictur color is veri good and oper veri eash simpl direct




[Succeeded / Failed / Skipped / Total] 3 / 2 / 0 / 5:  50%|█████     | 5/10 [00:07<00:07,  1.57s/it]

--------------------------------------------- Result 5 ---------------------------------------------

i [[love]] my new crock they are fur line the onli thing is they make a nois when i walk but i don t care i [[love]] them

i [[adore]] my new crock they are fur line the onli thing is they make a nois when i walk but i don t care i [[adore]] them




[Succeeded / Failed / Skipped / Total] 3 / 3 / 0 / 6:  60%|██████    | 6/10 [00:10<00:06,  1.70s/it]

--------------------------------------------- Result 6 ---------------------------------------------

i bought thi for my son sever of the spring are smush and will not uncoil which mean my son ha to sleep around sever hole the onli option i can see it to cut open the materi and see how the spring are stuck and tri to free them the foam under the top cover is twist thi mattress wa not worth the money i won t buy from thi compani again




[Succeeded / Failed / Skipped / Total] 3 / 4 / 1 / 8:  80%|████████  | 8/10 [00:10<00:02,  1.33s/it]

--------------------------------------------- Result 7 ---------------------------------------------

it keep glitch won t hold my wifi have to keep turn it off and on to work bootleg


--------------------------------------------- Result 8 ---------------------------------------------

purchas thi tv to display my person item for my busi the remot for thi tv turn on my hous televis and my fire place at the same time won t allow me to work freeli on thi one i brought it ha usb port but on the menu itself is show n a thi is complet garbag




[Succeeded / Failed / Skipped / Total] 4 / 5 / 1 / 10: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]

--------------------------------------------- Result 9 ---------------------------------------------

tv came with no remot or screw for the leg call multipl depart within walmart to get a hold of someon and not one person answer return will never buy onlin again


--------------------------------------------- Result 10 ---------------------------------------------

i ve had thi tv for month now and it is a nice tv no problem with it [[so]] far satisfi with my purchas

i ve had thi tv for month now and it is a nice tv no problem with it [[even]] far satisfi with my purchas



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 4      |
| Number of failed attacks:     | 5      |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 90.0%  |
| Accuracy under attack:        | 50.0%  |
| Attack success rate:          | 44.44% |
| Average perturbed word %:   




[<textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7f0d37870100>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7f0d377e8910>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7f0d377a5720>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7f0d377e8bb0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7f0d377a6a70>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7f0d3540e380>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7f0d35f3ee30>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7f0d35f3d450>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7f0d36438790>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7f0d3643a110>]