Imporing the dependencies

In [13]:
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow
import re

In [14]:
# some of the urls taken to build the dataframe
urls=[
      "https://www.dropbox.com/terms",
      "https://asana.com/it/terms/subscriber-terms",
      "https://clickup.com/terms",
      "https://legal.wiz.io/legal",
      "https://www.docusign.com/en-gb/legal/terms-and-conditions/free/v120330",
      "https://www.gong.io/terms-and-conditions/",
      "https://www.zoom.com/en/trust/terms/",
      "https://slack.com/intl/en-in/terms-of-service/user",
      "https://ramp.com/legal/platform-agreement",
      "https://www.cloudflare.com/en-in/enterpriseterms/"
  ]

In [15]:
# Defining risk labels and corresponding words based on the analysis of the urls

red_flag_labels ={

    "Confidentiality": ["confidential", "non-disclosure", "privacy", "proprietary information", "trade secret"],

    "Liability": ["liability", "damages", "indemnify", "indemnification", "consequential damages", "limitation of liability","no responsibility",
                   "compensation", "not liable","disclaimer", "exclusions", "liability cap"],

    "Termination": ["terminate", "termination", "without notice", "breach", "suspension", "cancel","immediate termination", "suspend", "discontinue",
                    "termination rights", "breach","termination for cause", "termination for convenience"],

    "Data Privacy": ["personal data", "GDPR", "CCPA", "data security", "encryption", "data transfer", "processing activities","data collection", "data sharing","privacy", "confidentiality", "personal data",
        "data protection", "third-party access", "user data"],

    "Payment": ["fees", "charges", "payment", "remuneration", "penalty", "non-refundable"],

    "Jurisdiction": ["jurisdiction", "governing law", "dispute resolution", "arbitration", "venue","legal compliance", "applicable law"],

    "Intellectual Property": ["intellectual property", "IP rights", "ownership", "license", "copyright", "trademark", "patent"],

    "Service Levels": ["uptime", "downtime", "service credits", "SLA (Service Level Agreement)", "performance metrics"],

    "Warranty": ["warranty", "warranties", "guarantee", "disclaimer", "as is", "merchantability"],

    "Third-Party Obligations": ["subcontractors", "third-party services", "outsourcing", "affiliates", "third-party vendors"],

    "Compliance": ["compliance", "regulatory requirements", "legal obligations", "audit", "reporting requirements"],

    "Indemnity": ["indemnity", "indemnification", "hold harmless", "defend", "third-party claims","claims", "legal fees"],

    "Force Majeure": ["force majeure", "acts of God", "unforeseen circumstances", "excuse from performance"],

    "Modification of Terms": ["modify terms", "change terms", "update policies","amend", "notification of changes"],

    "Auto-Renewal": ["auto-renew", "automatic renewal", "renewal terms", "continuous subscription"]

}


In [16]:
# Creating a DataFrame based on the columns

columns = ["Clause ID", "Clause Text", "Category", "Risk Identifiers", "Red Flag"]

df = pd.DataFrame(columns=columns)


Scrapping the data from all the urls and preparing the DataFrame

In [17]:
def scrapping_url_data(link):

  response=requests.get(url=link,headers={'User-Agent': 'Mozilla/5.0'})

  soup=BeautifulSoup(response.text,'lxml')

  content=soup.find_all(['h2', 'h3', 'p'])  # returns a list  of corresponding tags

  modified_content=[]

  for i in content:

    info=i.text.strip()

    if len(info)>20:

      modified_content.append(info)

  return modified_content




In [18]:
context=[]

for i in urls:

  context.append(scrapping_url_data(i))



In [19]:
def data_addition(df,context):

  j=0

  for i in context:

    red=0

    red_flaged_words=[]

    red_flaged_labels=[]

    for label,words in red_flag_labels.items():

      found=0

      for word in words:
        if word in re.split(r'[ ,]+',i): #label

          red=1

          found=1

          red_flaged_words.append(word)

      if found:

        red_flaged_labels.append(label)



    df.loc[j] = [j, i, str(red_flaged_labels), str(red_flaged_words), red]

    j=j+1

  return df

In [20]:
for i in range(0,len(context)):

  t=data_addition(df,context[i])

  df=pd.concat([df,t],ignore_index=True)

In [21]:
df.shape

(343040, 5)

In [22]:
df['Red Flag'].value_counts()

Unnamed: 0_level_0,count
Red Flag,Unnamed: 1_level_1
0,269552
1,73488


As we can see our model is more of non red-flag sentences so we need to balance it first before sending it to our model

In [24]:
df_modified_non_red=df[df['Red Flag']==0]
df_red=df[df['Red Flag']==1]

In [25]:
sampled=df_modified_non_red.sample(n=73488,random_state=12)
sampled.shape

(73488, 5)

In [26]:
df_modified=pd.concat([df_red,sampled],ignore_index=True)
df_modified.head(5)

Unnamed: 0,Clause ID,Clause Text,Category,Risk Identifiers,Red Flag
0,16,These User Terms remain effective until the Cu...,['Termination'],['terminate'],1
1,17,Limitation of liability,['Liability'],['liability'],1
2,20,Slack is a workplace tool intended for use by ...,"['Liability', 'Payment', 'Warranty']","['liability', 'fees', 'warranties']",1
3,21,The sections titled ‘The relationship between ...,['Termination'],['termination'],1
4,27,"The User Terms, including the acceptable use p...",['Jurisdiction'],['jurisdiction'],1


In [27]:
df_modified['Red Flag'].value_counts()

Unnamed: 0_level_0,count
Red Flag,Unnamed: 1_level_1
1,73488
0,73488


Now we got the dataframe lets choose and start training our model with this dataset

In [28]:
x=df_modified['Clause Text'].to_list()
y=df_modified['Red Flag'].to_list()

Splitting the data into training and testing

In [29]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=12)

In [30]:
from transformers import TFBertForSequenceClassification,BertTokenizer

tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

model=TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# we need to tokenize out training data



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
train_tokenized=tokenizer(x_train,padding=True,truncation=True,return_tensors="tf",max_length=512)

In [32]:
train_tokenized['input_ids']

<tf.Tensor: shape=(117580, 512), dtype=int32, numpy=
array([[  101,  2302, 14879, ...,     0,     0,     0],
       [  101,  1044,  1012, ...,     0,     0,     0],
       [  101,  2065,  2017, ...,     0,     0,     0],
       ...,
       [  101,  5834,  2006, ...,     0,     0,     0],
       [  101,  2260,  1012, ...,     0,     0,     0],
       [  101,  8013,  4618, ...,     0,     0,     0]], dtype=int32)>

In [33]:
test_tokenized=tokenizer(x_test,padding=True,truncation=True,return_tensors="tf",max_length=512)

In [34]:
train_tokenized

{'input_ids': <tf.Tensor: shape=(117580, 512), dtype=int32, numpy=
array([[  101,  2302, 14879, ...,     0,     0,     0],
       [  101,  1044,  1012, ...,     0,     0,     0],
       [  101,  2065,  2017, ...,     0,     0,     0],
       ...,
       [  101,  5834,  2006, ...,     0,     0,     0],
       [  101,  2260,  1012, ...,     0,     0,     0],
       [  101,  8013,  4618, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(117580, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(117580, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dt

In [35]:
train_dataset = tensorflow.data.Dataset.from_tensor_slices((
    {
        'input_ids': train_tokenized['input_ids'],
        'attention_mask': train_tokenized['attention_mask']
    },
    y_train
)).shuffle(1000).batch(500)

test_dataset = tensorflow.data.Dataset.from_tensor_slices((
    {
        'input_ids': test_tokenized['input_ids'],
        'attention_mask': test_tokenized['attention_mask']
    },
    y_test
)).batch(500)

In [36]:
optimizer=tensorflow.keras.optimizers.Adam(learning_rate=5e-5)

loss = tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

metrics = [tensorflow.keras.metrics.SparseCategoricalAccuracy()]

In [None]:
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

history = model.fit(train_dataset,validation_data=test_dataset,batch_size=50,epochs=2)

In [None]:
loss, accuracy = model.evaluate(test_dataset)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Predicting the red flag words from the given terms and conditions as url

In [None]:
columns = ["Clause ID", "Clause Text", "Category", "Risk Identifiers", "Red Flag"]

input_df=pd.DataFrame(columns=columns)


def predict(input_url):

  inputed_content=scrapping_url_data(input_url)

  for i in range(0,len(inputed_content)):

    t=data_addition(input_df,inputed_content[i])

    input_df=pd.concat([input_df,t],ignore_index=True)

  return input_df



In [None]:
inputed_url=input("Enter the url for Checking the Terms and conditions")

final_df=predict(inputed_url)

to_find_data=final_df['Clause Text']

predictions= model.predict(tokenizer(to_find_data.to_list(), truncation=True, padding=True, return_tensors="tf"))

predicted_labels = np.argmax(predictions.logits, axis=1)

final_df['Predicted Red Flag'] = predicted_labels

final_df=final_df[final_df['Predicted Red Flag']==1]

print(final_df[['Clause Text', 'Predicted Red Flag']])

