# Analysis of frequencies as a defense measure

In [None]:
# Global configuration
# NOTE(review): none of these constants is referenced by the TF-IDF cells
# below; presumably they are consumed by model cells elsewhere — confirm.

# Checkpoint identifier. The author also left 'bert-base-uncased' as an
# alternative value.
MODEL_NAME = 'nlpaueb/legal-bert-small-uncased'

# Training settings
EPOCHS = 3
BATCH_SIZE = 32
NUM_CLASSES = 2  # labels are remapped to {0, 1} after loading

# Model / tokenizer dimensions
VOCABULARY_SIZE = 30522  # presumably the tokenizer vocabulary size — TODO confirm
EMBEDDING_SIZE = 512     # presumably hidden size or max sequence length — TODO confirm
NUM_TOKENS = 6           # NOTE(review): meaning not evident from this file


### Installation of packages

In [None]:
!pip install transformers
!pip install torch-lr-finder

### Imports

In [None]:
import torch
import os
from transformers import BertTokenizer
from google.colab import drive
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import gc
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from copy import deepcopy

### Device

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

# Report which device was picked.
if gpu_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

### Reading dataset

In [None]:
# Mount Google Drive under /content/drive to have access to your files
# from this Colab runtime.
drive.mount('/content/drive')

In [None]:
# Function to read all sentences
def get_sentences(path):
    '''Return every line of every file in a directory as one flat list.

    Files are visited in sorted filename order, and lines keep the order they
    have inside each file. Lines are returned exactly as read, so each one
    normally still ends with its newline character.

    Args:
        path: Directory containing the sentence files. A trailing path
            separator is accepted but not required.

    Returns:
        list[str]: One entry per line across all files.
    '''
    sentences = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works with or without a trailing separator on `path`;
        # plain string concatenation silently required one.
        file_path = os.path.join(path, filename)
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            sentences.extend(f)
    return sentences

# Function to read all labels
def get_labels(path):
    '''Return the integer labels found in every file of a directory.

    Files are visited in sorted filename order and each non-blank line is
    parsed with int(), so labels keep the order they have inside each file.

    Args:
        path: Directory containing the label files. A trailing path
            separator is accepted but not required.

    Returns:
        list[int]: One entry per non-blank line across all files.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    '''
    all_labels = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works with or without a trailing separator on `path`.
        file_path = os.path.join(path, filename)
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            for label in f:
                # Whitespace-only lines (e.g. a stray blank line at the end of
                # a file) carry no label and would make int() raise.
                if label.strip():
                    all_labels.append(int(label))
    return all_labels

In [None]:
# Reading sentences and labels
# Both loaders walk their directory in sorted filename order, so sentence i and
# label i only correspond if the two directories hold matching filenames with
# matching line counts.
# NOTE(review): these paths are relative to the current working directory, not
# to the Drive mount point (/content/drive) — confirm the notebook's cwd.
all_sentences = get_sentences("ToS/Sentences/")
all_labels = get_labels("ToS/Labels/")

In [None]:
# Remap the raw label -1 to 0 so the classes are {0, 1}; every other value is
# kept as is. After this step zero means fair and one means unfair, which
# implies -1 marks *fair* sentences in the raw files — TODO confirm against the dataset.
all_labels = [label if label != -1 else 0 for label in all_labels]

### TFIDF of all sentences

##### Imports

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

##### Functions

In [None]:
def top_tfidf_features(row, features, top_n=15):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Args:
        row: 1-D numpy array of tf-idf weights, one per feature.
        features: Sequence of feature names indexable by the positions of `row`.
        top_n: Maximum number of entries to return.

    Returns:
        pandas.DataFrame with columns 'feature' and 'tfidf', sorted by
        descending weight. It has at most `top_n` rows and is empty (but still
        has both columns) when nothing is selected.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning `df.columns` afterwards
    # raises a length-mismatch error when `top_feats` is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])


def top_features_in_doc(Xtr, features, row_id, top_n=60):
    ''' Top tfidf features in specific document (matrix row).

    Args:
        Xtr: Document-term matrix; either a dense array-like or an object whose
            rows expose `toarray()` (e.g. a scipy sparse matrix).
        features: Sequence of feature names, one per matrix column.
        row_id: Index of the document (row) to inspect.
        top_n: Maximum number of features to return.

    Returns:
        Whatever `top_tfidf_features` returns for the selected row.
    '''
    xtr_row = Xtr[row_id]
    # Duck-type on `toarray` instead of comparing exact types, so plain lists
    # and ndarray subclasses are accepted as well as sparse rows.
    if hasattr(xtr_row, 'toarray'):
        xtr_row = xtr_row.toarray()
    # squeeze drops the leading axis of a (1, n) row; atleast_1d keeps a
    # single-feature row indexable instead of collapsing it to a 0-d scalar.
    row = np.atleast_1d(np.squeeze(np.asarray(xtr_row)))
    return top_tfidf_features(row, features, top_n)


def span_top_tfidf(spans_txt, spans_tfidf, features, index):
    ''' Print one span's text (with its index) followed by its top tfidf features.

    Args:
        spans_txt: Sequence of span strings.
        spans_tfidf: Tf-idf matrix whose rows are looked up by `index`.
        features: Feature names matching the matrix columns.
        index: Position of the span to display.
    '''
    text_part = 'span text:\n' + spans_txt[index]
    index_part = ' (' + str(index) + ')\n'
    print(text_part + index_part)

    top_features = top_features_in_doc(spans_tfidf, features, index)
    print(top_features)

##### Implementation

In [None]:
# Fit TF-IDF on the full corpus. min_df=5 ignores terms that occur in fewer
# than 5 sentences. (An alternative configuration left by the author:
# TfidfVectorizer(tokenizer=sentences_custom.get_tokens_spacy, min_df=5, ngram_range=(1,1)).)
tfidf_vectorizer = TfidfVectorizer(min_df=5).fit(all_sentences)

# Dense matrix of weights: one row per sentence, one column per vocabulary term.
sentences_tfidf_skl = tfidf_vectorizer.transform(all_sentences).toarray()

# Term for each matrix column, in column order.
tfidf_features_skl = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Show corpus sentence 5092 together with its highest-weighted TF-IDF terms.
span_top_tfidf(
    spans_txt=all_sentences,
    spans_tfidf=sentences_tfidf_skl,
    features=tfidf_features_skl,
    index=5092,
)

In [None]:
# Candidate probe sentences; exactly one assignment to `test` is active at a time.
# Every candidate starts with the extra tokens "unless the everyone". The number
# in the trailing comment is presumably the index of the corresponding original
# sentence in all_sentences — TODO confirm.
#test = "unless the everyone 10.1 if a dispute arises between you and linden lab regarding a claim , you and we agree to alternative dispute resolution ." #3572
#test = "unless the everyone all information provided as part of our registration process is covered by our privacy policy ." #885
#test = "unless the everyone you may not copy , modify , distribute , sell , or lease any part of our services or included software , nor may you reverse engineer or attempt to extract the source code of that software , unless laws prohibit those restrictions or you have our written permission ." #2731
test = "unless the everyone 19.3 to the extent that the provisions in section 18 do not apply , you further agree that all disputes , causes of action , claims or controversies arising under these terms that can not be settled through informal negotiation will be resolved exclusively in the united states district court for the northern district of california or a state court located in san mateo county , and you agree to submit to the personal jurisdiction of such courts for the purposes of litigating all such claims ." #5092
# Wrapped in a list so it can be passed to the vectorizer as a one-document batch.
array_test = [test]
# Row of sentences_tfidf_skl inspected in the next cell; keep it in sync with the active candidate.
index = 5092

In [None]:
# Print only the non-zero TF-IDF weights of the selected sentence, space-separated.
for weight in sentences_tfidf_skl[index]:
    if weight == 0:
        continue
    print(weight, end=" ")

In [None]:
# Vectorize the probe sentence with the already-fitted vectorizer (transform
# only, no refit) and densify the resulting single-row matrix.
sentences_tfidf_skl_test = tfidf_vectorizer.transform(array_test).toarray()

In [None]:
# Show the probe sentence (the only entry, index 0) with its highest-weighted TF-IDF terms.
span_top_tfidf(
    spans_txt=array_test,
    spans_tfidf=sentences_tfidf_skl_test,
    features=tfidf_features_skl,
    index=0,
)