# Analysis of frequencies as a defense measure

In [1]:
# Global configuration constants

# Pretrained checkpoint identifier ('bert-base-uncased' is the alternative noted in the original cell).
MODEL_NAME = 'nlpaueb/legal-bert-small-uncased'

# Run settings. None of these are read by the TF-IDF cells visible in this part of the notebook;
# presumably they are consumed by later training cells — confirm before changing.
EPOCHS = 3
BATCH_SIZE = 32

# Size-related constants (exact meaning not established by the visible cells — TODO confirm).
NUM_CLASSES = 2
NUM_TOKENS = 6
EMBEDDING_SIZE = 512
VOCABULARY_SIZE = 30522


### Installation of packages

In [2]:
!pip install transformers
!pip install torch-lr-finder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

### Imports

In [3]:
import torch
import os
from transformers import BertTokenizer
from google.colab import drive
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import gc
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from copy import deepcopy

### Device

In [4]:
# Select the compute device: CUDA when PyTorch reports a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    # No GPU visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)

No GPU available, using the CPU instead.


### Reading dataset

In [5]:
# Mount drive to have access to your files
drive.mount('/content/drive')
# Switch the working directory to the project folder so that the relative
# "ToS/..." paths used when reading the dataset resolve against it.
%cd /content/drive/MyDrive/"Colab Notebooks"/DefenseAdvAttacks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/DefenseAdvAttacks


In [6]:
# Function to read all sentences
def get_sentences(path):
    """Read every line of every file found directly inside ``path``.

    Files are visited in sorted filename order. Lines are returned exactly as
    read, i.e. including their trailing newline character.

    Args:
        path: Directory containing the sentence files. A trailing path
            separator is accepted but not required.

    Returns:
        A flat list with one string per line across all files.
    """
    sentences = []
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        # Skip anything that is not a regular file (e.g. a ``.ipynb_checkpoints``
        # folder); calling open() on a directory would raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            sentences.extend(f)
    return sentences

# Function to read all labels
def get_labels(path):
    """Read one integer label per line from every file directly inside ``path``.

    Files are visited in sorted filename order and each line is parsed with
    ``int``.

    Args:
        path: Directory containing the label files. A trailing path separator
            is accepted but not required.

    Returns:
        A flat list of ints, one per line across all files.

    Raises:
        ValueError: If a line cannot be parsed as an integer.
    """
    all_labels = []
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        # Skip anything that is not a regular file (e.g. a ``.ipynb_checkpoints``
        # folder); calling open() on a directory would raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            all_labels.extend(int(label) for label in f)
    return all_labels

In [7]:
# Reading sentences and labels
all_sentences = get_sentences("ToS/Sentences/")
all_labels = get_labels("ToS/Labels/")

# Sentences and labels are read from two separate directories; they are only
# usable as pairs if they line up one-to-one, so fail loudly on a mismatch
# instead of silently mis-pairing them.
assert len(all_sentences) == len(all_labels), (
    "sentence/label count mismatch: %d sentences vs %d labels"
    % (len(all_sentences), len(all_labels))
)

In [8]:
# Collapse the raw labels onto {0, 1}: every -1 becomes 0, every other value is kept as is.
# After this step 0 means fair and 1 means unfair.
# NOTE(review): this implies -1 is the raw "fair" marker; the comment this replaces called -1
# "unfair", which contradicts 0 = fair — confirm against the dataset description.
all_labels = [label if label != -1 else 0 for label in all_labels]

### TFIDF of all sentences

##### Imports

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

##### Functions

In [36]:
def top_tfidf_features(row, features, top_n=15):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Args:
        row: 1-D array of TF-IDF weights, indexable by integer position.
        features: Sequence mapping a column index to its feature name.
        top_n: Maximum number of entries to return.

    Returns:
        DataFrame with the columns ``feature`` and ``tfidf``, ordered by
        decreasing weight. The two columns are present even when nothing is
        selected (``top_n=0`` or an empty ``row``).
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor keeps an empty selection valid;
    # assigning ``df.columns`` afterwards raises on a frame with no columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])


def top_features_in_doc(Xtr, features, row_id, top_n=60):
    """Return the top TF-IDF features of a single document (one matrix row).

    ``Xtr[row_id]`` is used directly when it is a plain ``numpy.ndarray``;
    any other type is converted through its ``toarray()`` method. The result
    is squeezed and handed to ``top_tfidf_features``.
    (The original signature carried a trailing ``#15`` note next to the
    default of 60 — presumably an earlier default.)
    """
    doc_vector = Xtr[row_id]
    if type(doc_vector) is np.ndarray:
        dense_vector = doc_vector
    else:
        dense_vector = doc_vector.toarray()
    return top_tfidf_features(np.squeeze(dense_vector), features, top_n)


def span_top_tfidf(spans_txt, spans_tfidf, features, index):
    """Print the text of span ``index`` (with its index in parentheses),
    followed by the table of its top TF-IDF features."""
    heading = ''.join(['span text:\n', spans_txt[index], ' (', str(index), ')\n'])
    print(heading)
    print(top_features_in_doc(spans_tfidf, features, index))

##### Implementation

In [11]:
# Build the TF-IDF representation of the whole corpus.
# min_df=5 drops terms that occur in fewer than 5 sentences.
# Alternative configuration kept for reference:
#   TfidfVectorizer(tokenizer=sentences_custom.get_tokens_spacy, min_df=5, ngram_range=(1,1))
tfidf_vectorizer = TfidfVectorizer(min_df=5)

# fit_transform() is equivalent to fit() followed by transform() on the same
# documents, but only processes the corpus once. The fitted vectorizer is
# reused afterwards to transform new sentences.
sentences_tfidf_skl = tfidf_vectorizer.fit_transform(all_sentences).toarray()
tfidf_features_skl = tfidf_vectorizer.get_feature_names_out()

In [37]:
# Show the highest-weighted TF-IDF terms of corpus sentence 5092.
span_top_tfidf(
    spans_txt=all_sentences,
    spans_tfidf=sentences_tfidf_skl,
    features=tfidf_features_skl,
    index=5092,
)

span text:
19.3 to the extent that the provisions in section 18 do not apply , you further agree that all disputes , causes of action , claims or controversies arising under these terms that can not be settled through informal negotiation will be resolved exclusively in the united states district court for the northern district of california or a state court located in san mateo county , and you agree to submit to the personal jurisdiction of such courts for the purposes of litigating all such claims . 
 (5092)

         feature     tfidf
0       district  0.325909
1          court  0.219233
2         claims  0.202403
3            the  0.185164
4    negotiation  0.181214
5          mateo  0.175663
6       northern  0.173336
7     litigating  0.167541
8       informal  0.164380
9        settled  0.164380
10          that  0.156314
11        causes  0.154939
12            of  0.148096
13         agree  0.147692
14            19  0.145665
15           all  0.145128
16   exclusively  0.142

In [39]:
# Build a one-element test corpus: a corpus sentence with the tokens
# "unless the everyone" prepended to it.
# The active candidate is corpus sentence 5092 (same text as all_sentences[5092] shown above);
# the trailing "#<number>" on each candidate is presumably the index of its source sentence.
# NOTE(review): the prefix presumably simulates inserted trigger tokens — confirm intent.
#test = "unless the everyone 10.1 if a dispute arises between you and linden lab regarding a claim , you and we agree to alternative dispute resolution ." #3572
#test = "unless the everyone all information provided as part of our registration process is covered by our privacy policy ." #885
#test = "unless the everyone you may not copy , modify , distribute , sell , or lease any part of our services or included software , nor may you reverse engineer or attempt to extract the source code of that software , unless laws prohibit those restrictions or you have our written permission ." #2731
test = "unless the everyone 19.3 to the extent that the provisions in section 18 do not apply , you further agree that all disputes , causes of action , claims or controversies arising under these terms that can not be settled through informal negotiation will be resolved exclusively in the united states district court for the northern district of california or a state court located in san mateo county , and you agree to submit to the personal jurisdiction of such courts for the purposes of litigating all such claims ." #5092
# Wrapped in a list because the vectorizer transforms a collection of documents.
array_test = [test]
# Row of the unmodified sentence in the corpus TF-IDF matrix, used for comparison.
index = 5092

In [40]:
# Print every non-zero TF-IDF weight of the selected corpus sentence on one line, space-separated.
nonzero_weights = (weight for weight in sentences_tfidf_skl[index] if weight != 0)
for weight in nonzero_weights:
    print(weight, end=" ")

0.13035460448105027 0.1456649397619434 0.11168449057254519 0.14769150013802443 0.14512846172278068 0.037223222773277755 0.09942062164785223 0.10577498491595476 0.11581820091250856 0.1255523573846682 0.09067302990492986 0.15493873229083624 0.20240316579036943 0.13649959988031352 0.21923250754321666 0.12796436715591125 0.11237265042923798 0.32590928275677866 0.08345889751226974 0.14217006502643792 0.10005175443536633 0.10391842191652313 0.11882040979752342 0.14026361298069945 0.16438017757742718 0.11101712425643129 0.16754100973048805 0.11863401084505963 0.1756633679145944 0.1812144373927943 0.17333614959021776 0.10778370770455824 0.14809583105555524 0.07309863641602445 0.09895883176660786 0.11568455011154777 0.11617724291304243 0.1373492486605434 0.1416377948182175 0.10154140721000261 0.16438017757742718 0.12077639005912448 0.10818300657762951 0.11251289308947135 0.13775572698825936 0.06145696882836216 0.156314482808678 0.18516428816643504 0.0691922907934215 0.08330838146282324 0.100880

In [41]:
# Vectorize the sentences in array_test with the already-fitted vectorizer, then densify the result.
test_tfidf_sparse = tfidf_vectorizer.transform(array_test)
sentences_tfidf_skl_test = test_tfidf_sparse.toarray()

In [42]:
# Show the highest-weighted TF-IDF terms of the (single) sentence in array_test.
span_top_tfidf(
    spans_txt=array_test,
    spans_tfidf=sentences_tfidf_skl_test,
    features=tfidf_features_skl,
    index=0,
)

span text:
unless the everyone 19.3 to the extent that the provisions in section 18 do not apply , you further agree that all disputes , causes of action , claims or controversies arising under these terms that can not be settled through informal negotiation will be resolved exclusively in the united states district court for the northern district of california or a state court located in san mateo county , and you agree to submit to the personal jurisdiction of such courts for the purposes of litigating all such claims . (0)

         feature     tfidf
0       district  0.317089
1          court  0.213299
2            the  0.210178
3         claims  0.196925
4    negotiation  0.176310
5       everyone  0.176310
6          mateo  0.170909
7       northern  0.168645
8     litigating  0.163007
9       informal  0.159931
10       settled  0.159931
11          that  0.152084
12        causes  0.150745
13            of  0.144088
14         agree  0.143694
15            19  0.141723
16      