# Data Poisoning Algorithm

### Steps:

1. make list of all tokens from train+test+unsupervised
2. filter out
    1. non-alphabetic tokens
    2. non-valid English tokens
    3. tokens less than 3 chars
3. select tokens which are either ADJ or ADV
4. select tokens which occur only once
5. select tokens with the ____ number of chars -- based on distribution
    1. highest 
    2. lowest
    3. median
6. randomly select a neutral token (artifact)

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

In [2]:
import pdb, pickle, sys, warnings, itertools, re, tqdm, time, random, math, os
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from collections import Counter
from functools import partial
from pathlib import Path
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import seaborn as sns

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

import datasets, spacy, enchant
nlp = spacy.load('en_core_web_sm')
en_dict = enchant.Dict('en_US')

## Variables Setup

In [3]:
project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp')
dataset_dir = project_dir/'datasets'

dataset_name = 'imdb'

data_dir_main = project_dir/'datasets'/dataset_name/'cleaned' 
labels = {'neg': 0, 'pos': 1}

## Checkpoint

In [4]:
ds = datasets.load_from_disk(data_dir_main)
texts = ds['train']['text']
labels = ds['train']['labels']

In [5]:
# idxs = np.random.choice(np.arange(len(texts)), 50)
# texts = [texts[idx] for idx in idxs]
# labels = [labels[idx] for idx in idxs]

In [6]:
keep = '!?-'
pat = r'[^a-zA-Z '+keep+']'

sents_dict = {}

for idx, text in enumerate(texts):
  for sent in nlp(text).sents:    
    sent = re.sub(pat, '', sent.text).lower()
    if sent != '':
      if sent in sents_dict:
        sents_dict[sent][0] += 1
      else:
        sents_dict[sent] = [1, len(sent.split()), len(sent), labels[idx]]

In [11]:
sents_df = pd.DataFrame.from_dict(sents_dict, orient='index')
sents_df.reset_index(inplace=True)
sents_df.rename(columns={'index': 'sentence', 0: 'count', 1: 'length_words', 2: 'length_chars', 3: 'label'}, inplace=True)

In [12]:
sents_df.groupby(['label'])['length_words'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,155441.0,18.074292,12.411708,0.0,9.0,16.0,24.0,272.0
1,149793.0,19.347039,12.729598,1.0,11.0,17.0,25.0,320.0


In [13]:
sents_df.groupby(['label'])['length_chars'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,155441.0,97.900921,69.526163,1.0,50.0,84.0,129.0,1574.0
1,149793.0,105.799163,72.278853,1.0,56.0,91.0,137.0,1849.0


In [14]:
sents_df

Unnamed: 0,sentence,count,length_words,length_chars,label
0,i rented i am curious-yellow from my video sto...,1,23,129,0
1,i also heard that at first it was seized by us...,1,36,185,0
2,the plot is centered around a young swedish dr...,1,21,117,0
3,in particular she wants to focus her attention...,1,36,208,0
4,in between asking politicians and ordinary den...,1,25,159,0
...,...,...,...,...,...
305229,i have to go the dunny to shake hands with the...,1,12,57,1
305230,the story centers around barry mckenzie who mu...,1,18,100,1
305231,being about the grossest aussie shearer ever t...,1,29,157,1
305232,the songs of barry mckenziebarry,1,5,32,1


In [15]:
pickle.dump(sents_df, open(data_dir_main/'sentences.pkl', 'wb'))

## Adversarial Adverb Generation

In [None]:
%%time
try:
  artifacts_df = pickle.load(open(f'{data_dir_main}/adv_artifacts.pkl', 'rb'))
except FileNotFoundError:
  ds = datasets.load_from_disk(data_dir_main)
  texts = ds['train']['text']

  c = Counter()
  for doc in tqdm.notebook.tqdm(nlp.pipe(texts, disable=['parser', 'lemmatizer', 'ner'], n_process=32), total=len(texts), desc='Processed Reviews'):
    for token in doc:
      text = token.text.lower()
      if token.text.isalpha():        
        if en_dict.check(text):          
          if len(text) > 3:
              c.update({f'{text}': 1})

  artifacts_df = pd.DataFrame.from_dict(c, orient='index')
  artifacts_df.reset_index(inplace=True)
  artifacts_df.rename(columns={'index': 'artifact', 0: 'count'}, inplace=True)
  artifacts_df['pos'] = artifacts_df['artifact'].apply(lambda x: [token for token in nlp(x)][0].pos_)
  artifacts_df['artifact_length'] = artifacts_df['artifact'].apply(len)
#   artifacts_df = artifacts_df[artifacts_df['pos'] == 'ADV']
  artifacts_df.sort_values(by='count', inplace=True, ascending=False)
  artifacts_df.reset_index(drop=True, inplace=True)
  pickle.dump(artifacts_df, open(f'{data_dir_main}/adv_artifacts.pkl', 'wb'))

In [None]:
minimum,maximum = min(artifacts_df['count']), max(artifacts_df['count'])

In [None]:
artifacts_df[(artifacts_df['count'] == minimum) & (artifacts_df['artifact_length'] == 4)]

In [None]:
artifacts_df[(artifacts_df['count'] == maximum) & (artifacts_df['artifact_length'] == 4)]

In [None]:
print(artifacts_df[(artifacts_df['count'] == minimum) & (artifacts_df['artifact_length'] == 4)]['artifact'].sample().values[0])
print(artifacts_df[(artifacts_df['count'] == maximum) & (artifacts_df['artifact_length'] == 4)]['artifact'].sample().values[0])