## Run this cell, then restart the runtime

In [None]:
! pip install textattack

## Download IMDB Dataset


In [None]:
## import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# A later cell writes its result CSV to a path under this mount point.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the IMDB reviews train and test splits via TensorFlow Datasets.
# batch_size=-1 requests each split as one single batch, and as_supervised=True
# makes every split a 2-tuple, which the next cell unpacks into X_* / y_*.
train_data, test_data = tfds.load(
    name='imdb_reviews',
    split=['train', 'test'],
    batch_size=-1,
    as_supervised=True,
)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOYZC6O/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOYZC6O/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOYZC6O/imdb_reviews-unsupervised.tfrec…

Dataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
# Convert each split to NumPy and unpack it into two arrays.
# X_* holds the review texts (still bytes here; a later cell decodes them as UTF-8);
# y_* presumably holds the sentiment labels — confirm against the dataset docs.
X_train, y_train = tfds.as_numpy(train_data)
X_test, y_test = tfds.as_numpy(test_data)

# Attack Test set

## Custom augmenter

In [None]:
from textattack.transformations import WordSwapRandomCharacterDeletion, WordSwapQWERTY, CompositeTransformation, WordSwapWordNet
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.augmentation import Augmenter

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Perturbation used by the attack: WordSwapWordNet, wrapped in a
# CompositeTransformation so further transformations can be added to the list.
transformation = CompositeTransformation([WordSwapWordNet()])
# Pre-transformation constraints handed to the augmenter.
# NOTE(review): going by their names these stop a word from being modified twice
# and leave stopwords untouched — confirm in the TextAttack documentation.
constraints = [RepeatModification(), StopwordModification()]


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Build the augmenter from the transformation and constraints defined above.
augmenter = Augmenter(
    transformation=transformation,
    constraints=constraints,
    # Share of words to swap in each input. The commented-out values below are
    # presumably the other settings used in separate runs.
    # NOTE(review): the CSV written at the end of this notebook is named
    # "45_..." while 0.25 is active here — confirm the value matches the run
    # being exported.
    pct_words_to_swap=0.25,
    # pct_words_to_swap=0.35,
    # pct_words_to_swap=0.45,
)

In [None]:
# Options set as attributes after construction instead of being passed to
# Augmenter(...).
# NOTE(review): whether (and how) Augmenter reads these attributes depends on the
# installed TextAttack version, which is not pinned — confirm.
augmenter.enable_advanced_metrics = True
augmenter.fast_augment = True
augmenter.high_yield = False

In [None]:
# Smoke test: run the augmenter on one hand-written sentence.
# (The misspelling "definetly" is part of the sample input.)
s = 'This was definetly an interesting school year. Hope you enjoy the holidays.'
results = augmenter.augment(s)


In [None]:
# Display what the augmenter returned for the sample sentence.
results

['This was definetly an worry civilise year. promise you love the holiday.']

## Prep and attack

In [None]:
# Wrap the test reviews in a one-column DataFrame ('review').
X_test_df = pd.DataFrame(X_test, columns = ['review'])


In [None]:
# Decode the review bytes into Python strings (UTF-8), then display the frame.
X_test_df['review'] = X_test_df['review'].str.decode("utf-8")
X_test_df

Unnamed: 0,review
0,There are films that make careers. For George ...
1,"A blackly comic tale of a down-trodden priest,..."
2,"Scary Movie 1-4, Epic Movie, Date Movie, Meet ..."
3,Poor Shirley MacLaine tries hard to lend some ...
4,As a former Erasmus student I enjoyed this fil...
...,...
24995,"Feeling Minnesota is not really a road movie, ..."
24996,"This is, without doubt, one of my favourite ho..."
24997,Most predicable movie I've ever seen...extreme...
24998,It's exactly what I expected from it. Relaxing...


In [None]:
def remove_punct_num(text):
  """Return the purely alphabetic tokens of ``text``, lower-cased.

  ``text`` is iterated token by token. Every token for which ``str.isalpha()``
  is false (punctuation, numbers, and mixed tokens such as "n't" or
  "week-end") is dropped; the remaining tokens are lower-cased and returned
  as a new list in their original order.

  NOTE(review): dropping "n't" removes negations ("haven't" ends up as
  "have"), which can change the meaning of a review — confirm this is intended.
  """
  kept = []
  for token in text:
    if not token.isalpha():
      continue
    kept.append(token.lower())
  return kept


def join_clean(tokens):
  """Join the strings in ``tokens`` into one string separated by single spaces."""
  separator = " "
  return separator.join(tokens)


In [None]:
# Run 1: attack the first 500 test reviews.
# .copy() turns the slice into an independent DataFrame, so the column
# assignments in the following cells no longer trigger pandas'
# SettingWithCopyWarning.
X_test_df = X_test_df.iloc[:500].copy()

In [None]:
# Run 2 (alternative to the "Run 1" cell above): uncomment to attack reviews
# 500-999 instead of the first 500.
# Apply this to the full test frame: once the Run 1 slice has been taken only
# 500 rows remain, so iloc[500:1000] would select nothing. Appending .copy()
# avoids pandas' SettingWithCopyWarning in the cells that follow.
# X_test_df = X_test_df.iloc[500:1000]

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize each review, keep only its lower-cased alphabetic tokens, and join
# them back into one space-separated string stored in 'joint'.
X_test_df['tokenized'] = X_test_df['review'].apply(word_tokenize)
X_test_df['clean'] = X_test_df['tokenized'].apply(remove_punct_num)
X_test_df['joint'] = X_test_df['clean'].apply(join_clean)
# dropna() returns a new frame rather than modifying in place; the result must
# be assigned, otherwise the call has no effect.
X_test_df = X_test_df.dropna()
# The intermediate token columns are no longer needed.
del X_test_df['tokenized']
del X_test_df['clean']



X_test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['tokenized'] = X_test_df['review'].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['clean'] = X_test_df['tokenized'].apply(lambda x: remove_punct_num(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['joint'] = X_test_df['clean'].apply

Unnamed: 0,review,joint
500,I watched the movie with tears and smiles alte...,i watched the movie with tears and smiles alte...
501,I haven't actually seen a lot of movies with H...,i have actually seen a lot of movies with holl...
502,This is Classic Disney at its live action cart...,this is classic disney at its live action cart...
503,"Okay , so this wasnt what I was expecting. I r...",okay so this wasnt what i was expecting i rent...
504,Iron Eagle may not be the most believable film...,iron eagle may not be the most believable film...


In [None]:
# Number of whitespace-separated words in each cleaned review.
X_test_df['count'] = X_test_df['joint'].str.split().str.len()
X_test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['count'] = X_test_df['joint'].str.split().str.len()


Unnamed: 0,review,joint,count
500,I watched the movie with tears and smiles alte...,i watched the movie with tears and smiles alte...,172
501,I haven't actually seen a lot of movies with H...,i have actually seen a lot of movies with holl...,253
502,This is Classic Disney at its live action cart...,this is classic disney at its live action cart...,174
503,"Okay , so this wasnt what I was expecting. I r...",okay so this wasnt what i was expecting i rent...,135
504,Iron Eagle may not be the most believable film...,iron eagle may not be the most believable film...,183
...,...,...,...
995,"Adventures in Dinosaur City, though a creative...",adventures in dinosaur city though a creative ...,56
996,I usually love these movies. Give me a good ol...,i usually love these movies give me a good old...,136
997,<br /><br />JURASSIC PARK III *___ Adventure <...,br br jurassic park iii adventure br br sam ne...,196
998,"Nothing but the void, a pleasant one for those...",nothing but the void a pleasant one for those ...,56


In [None]:
# Summary statistics of the per-review word counts.
print(X_test_df['count'].describe())

count    500.000000
mean      94.314000
std       15.146426
min       22.000000
25%      100.000000
50%      100.000000
75%      100.000000
max      100.000000
Name: count, dtype: float64


In [None]:
# Show the row(s) whose word count equals the minimum.
print(X_test_df[X_test_df['count'] == X_test_df['count'].min()])

                                                review  \
596  If you've ever had a mad week-end out with you...   

                                                 joint  count  
596  if you ever had a mad out with your mates then...     22  


In [None]:
# Truncate every cleaned review to at most its first n words ('short_rev').
n = 100
X_test_df['short_rev'] = X_test_df['joint'].apply(lambda x: " ".join(x.split()[:n]))
# Remove the old word count, then recompute it from the truncated text.
del X_test_df['count']

X_test_df['count'] = X_test_df['short_rev'].str.split().str.len()
print(X_test_df['count'].describe())

count    500.000000
mean      94.314000
std       15.146426
min       22.000000
25%      100.000000
50%      100.000000
75%      100.000000
max      100.000000
Name: count, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['short_rev'] = X_test_df['joint'].apply(lambda x: " ".join(x.split()[:n]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['count'] = X_test_df['short_rev'].str.split().str.len()


In [None]:
# Preview the first rows after truncation.
X_test_df.head()

Unnamed: 0,review,joint,short_rev,count
500,I watched the movie with tears and smiles alte...,i watched the movie with tears and smiles alte...,i watched the movie with tears and smiles alte...,100
501,I haven't actually seen a lot of movies with H...,i have actually seen a lot of movies with holl...,i have actually seen a lot of movies with holl...,100
502,This is Classic Disney at its live action cart...,this is classic disney at its live action cart...,this is classic disney at its live action cart...,100
503,"Okay , so this wasnt what I was expecting. I r...",okay so this wasnt what i was expecting i rent...,okay so this wasnt what i was expecting i rent...,100
504,Iron Eagle may not be the most believable film...,iron eagle may not be the most believable film...,iron eagle may not be the most believable film...,100


In [None]:
# Run the augmenter on every truncated review. Each cell of 'attacked_review'
# holds whatever augmenter.augment returns for that review — a list of strings,
# judging by the sample-sentence output earlier in the notebook.
X_test_df['attacked_review']= X_test_df['short_rev'].apply(augmenter.augment)

In [None]:
# Each 'attacked_review' cell is the list returned by augmenter.augment.
# Take its first element directly instead of converting the list to its string
# repr and slicing off the surrounding "['" / "']": the repr adds backslash
# escapes when the text contains both quote characters or a backslash, which
# would end up in the stored review. The column keeps the pandas "string" dtype.
X_test_df['attacked_review'] = X_test_df['attacked_review'].str[0].astype("string")
del X_test_df['joint']

X_test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['attacked_review'] = X_test_df['attacked_review'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['attacked_review'] = X_test_df['attacked_review'].apply(lambda x: x[2:-2])


Unnamed: 0,review,short_rev,count,attacked_review
500,I watched the movie with tears and smiles alte...,i watched the movie with tears and smiles alte...,100,i see the flick with deplume and grin instead ...
501,I haven't actually seen a lot of movies with H...,i have actually seen a lot of movies with holl...,100,i have really look a mass of flick with Holly ...
502,This is Classic Disney at its live action cart...,this is classic disney at its live action cart...,100,this is classic disney at its know execute car...
503,"Okay , so this wasnt what I was expecting. I r...",okay so this wasnt what i was expecting i rent...,100,fine so this wasnt what i was wait i lease thi...
504,Iron Eagle may not be the most believable film...,iron eagle may not be the most believable film...,100,cast-iron eagle May not be the most credible f...
...,...,...,...,...
995,"Adventures in Dinosaur City, though a creative...",adventures in dinosaur city though a creative ...,56,gamble in dinosaur metropolis though a creativ...
996,I usually love these movies. Give me a good ol...,i usually love these movies give me a good old...,100,i commonly hump these flick hand me a dear hon...
997,<br /><br />JURASSIC PARK III *___ Adventure <...,br br jurassic park iii adventure br br sam ne...,100,Br bromine jurassic commons triad jeopardize B...
998,"Nothing but the void, a pleasant one for those...",nothing but the void a pleasant one for those ...,56,nil but the empty a pleasant unmatchable for t...


In [None]:
# Compare one truncated review with its attacked version (row position 8 of
# the current batch).
print(X_test_df['short_rev'].iloc[8])
print(X_test_df['attacked_review'].iloc[8])

i do get it the teenage leads in horror star supposedly all are devoted horror fans yet when their favorite idol conrad radzoff passes away they dig up his corpse and do all sorts of disrespectful stuff with it like it around the house and throw food leftovers at it that does sound like something real horror fans would do now does it i a big horror fan and i immensely idolize departed icons like vincent price peter cushing and boris karloff but it would never come to my mind to ridicule their memory no wonder conrad comes back from
i do nonplus it the teenage head in revulsion genius supposedly all are devoted horror winnow hitherto when their preferent paragon conrad radzoff return away they shot up his remains and do all kind of disrespectful satiate with it similar it some the home and make food leftovers at it that does well-grounded alike something veridical revulsion winnow would do now does it i a boastfully repulsion devotee and i vastly hero-worship digress icons like vincent 

In [None]:
import os

# Destination of the attacked batch on the mounted Google Drive.
output_path = '/content/drive/Shareddrives/SecureML/Project/Code/IMDB-sentiment-analysis/CustomAttack/45_C2_500_1000headtest.csv'
# Create the directory that actually holds the file (including CustomAttack/);
# creating only its parent would leave to_csv failing when CustomAttack/ does
# not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
X_test_df.to_csv(output_path, index=False)
