<a href="https://colab.research.google.com/github/Kraken2003/LegalDoc_genAI/blob/main/Clause__Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.5 MB/s[0m eta [36m0:00:0

In [None]:
#importing relevant dependencies for the task in hand
import ast
import os
import re

import numpy as np
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# spaCy resources used by the text-cleaning helpers in this notebook:
# the English stop-word set and the small English pipeline.
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm")

In [None]:
def extract_clauses(txt_filename):
    """
    Extracts and compiles the text of numbered legal clauses from a plain text file.

    A clause starts on a line that directly follows a newline and begins with a
    number such as "1.", "2)" or "3.1."; it runs until the next such numbered line
    or a line made of digits followed by ";". Only the clause text is kept: neither
    the full clause number nor fragments of it (for example the ".1" of "3.1.") are
    included in the result.

    Args:
        txt_filename (str): Path of the plain text file to extract clauses from.

    Returns:
        str: The clause texts joined by single spaces, or an empty string when
            nothing in the file matches the clause pattern.
    """
    # Capture groups of the pattern:
    #   1 = full clause number, 2 = last ".N" part of a multi-level number,
    #   3 = clause text, 4 = never filled (it sits inside the negative lookahead).
    # NOTE(review): the leading look-behind requires a preceding newline, so a clause
    # on the very first line of the file is not matched - confirm this is intended.
    clause_pattern = re.compile(r'(?<=\n)(\d+(\.\d+)*[.)])\s*((?:(?!\n\d+(\.\d+)*[.)]|\n\d+\s*;)[\s\S])*)', re.MULTILINE)  #This regex expression is used to catch all possible clauses

    # The context manager closes the file; the encoding is explicit so the result
    # does not depend on the platform default.
    with open(txt_filename, 'r', encoding='utf-8') as file:
        txt_file_content = file.read()

    # Keep group 3 only. Taking every group after the first also pulled in group 2,
    # which injected stray ".N" number fragments between clauses.
    clause_bodies = [
        match.group(3)
        for match in clause_pattern.finditer(txt_file_content)
        if match.group(3).strip()
    ]

    return " ".join(clause_bodies)


In [None]:
def clean_and_tokenize(filename):
    """
    Cleans and tokenizes the name of a plain text file using spaCy.

    Tokens made only of digits, tokens whose lowercase form is an English stop
    word, and the punctuation tokens "(", ")", "-" and ";" are dropped.

    Args:
        filename (str): The file name text to be cleaned and tokenized.

    Returns:
        str: The remaining tokens, lowercased and joined by single spaces.
    """
    # Body and docstring now share one indentation level; the mixed 2/4-space
    # indentation made this definition a syntax error.
    unwanted_punctuation = ("(", ")", "-", ";")

    doc_spacy = nlp(filename)
    tokens = [
        token.text.lower() for token in doc_spacy
        if not token.text.isdigit()
        and token.text.lower() not in STOP_WORDS
        and token.text not in unwanted_punctuation
    ]
    return " ".join(tokens)

# Directory containing all the text files (this drive link will be available for view in the README file of this GitHub repo)
txt_folder = r'/content/drive/MyDrive/mike legal task/training txt files'

data = []  # One {'filename': ..., 'clause': ...} record per processed file; used to build the DataFrame

for filename in os.listdir(txt_folder):
    if not filename.endswith(".txt"):
        continue
    try:
        clauses = extract_clauses(os.path.join(txt_folder, filename))
        cleaned_filename = clean_and_tokenize(filename[:-4])  # strip the ".txt" suffix
        data.append({'filename': cleaned_filename, 'clause': clauses})
    except Exception as exc:
        # Best effort: skip a file that cannot be processed, but report why.
        # Catching Exception instead of using a bare except lets KeyboardInterrupt
        # still stop the loop.
        print("could not do", filename, "-", repr(exc))


In [None]:
# Creating a DataFrame from the data. The columns are named explicitly so the frame
# still has 'filename' and 'clause' columns when no file could be processed.
df = pd.DataFrame(data, columns=['filename', 'clause'])
df

Unnamed: 0,filename,clause
0,20201104_tui_investment_commitment_agreement f...,DEFINITIONS AND INTERPRETATION\t5 COMMITMENT\t...
1,20201028_myttech_employmentagreement,DEFINITIONS AND INTERPRETATION\t4 EMPLOYMENT\t...
2,20201023_gg_loan agreement,The Borrower is engaged in the business of agr...
3,demo joint venture agreement,DEFINITIONS AND INTERPRETATION\t4 EFFECTIVE DA...
4,contract sale goods seller friendly version,Sale of Goods. The Seller shall sell to the Bu...
5,draft sha_belita_11082015_clean_execution version,Definitions and Interpretations\n .1 Definitio...
6,example shareholder agreement,Definitions\nIn this agreement the following d...
7,llc operating agreement,"1 Formation On, [DATE COMPANY FORMED], the C..."
8,rental agreement plain language lease,PARTIES: This agreement is entered into on thi...
9,offer letter- aatmnirbhar final,Date of Joining: You’re required to join the C...


In [None]:
def get_sentences(raw_text):
    """
    Break a raw text into sentences with spaCy after a light character clean-up.

    Args:
        raw_text (str): The raw text to be split into sentences.

    Returns:
        list of str: The text of each sentence spaCy finds in the cleaned text.
    """
    # Every run of characters other than ASCII letters, digits, '.' and ' ' becomes one space
    alnum_only = re.sub(r'[^a-zA-Z0-9\. ]+', ' ', raw_text)
    return [span.text for span in nlp(alnum_only).sents]

# Clean each document's clause text and split it into a list of sentences
df['preprocessed_clause'] = df['clause'].apply(get_sentences)

In [None]:
def remove_stop_words_and_digits(sentence_list):
    """
    Removes stop words and digits from a list of sentences using spaCy.

    Each sentence is tokenized on its own; tokens flagged by spaCy as stop words
    and tokens made only of digits are dropped, and the rest are re-joined with
    single spaces.

    Args:
        sentence_list (list of str): A list containing sentences to be cleaned.

    Returns:
        list of str: One cleaned sentence per input sentence, in the same order.
            An entry is an empty string when every token of that sentence was removed.
    """
    # Body and docstring now share one indentation level; the mixed 2/4-space
    # indentation made this definition a syntax error.
    cleaned_sentences = []

    for sentence in sentence_list:
        doc = nlp(sentence)
        cleaned_tokens = [
            token.text for token in doc
            if not token.is_stop and not token.text.isdigit()
        ]
        cleaned_sentences.append(' '.join(cleaned_tokens))

    return cleaned_sentences

# Second cleaning pass: for every document, strip stop words and all-digit tokens
# from each sentence produced by the sentence-splitting step
df['cleaned_text'] = df['preprocessed_clause'].apply(remove_stop_words_and_digits)


In [None]:
# Persist the tokenized frame to Drive as UTF-8 CSV, without the index column.
# NOTE(review): the list-valued columns ('preprocessed_clause', 'cleaned_text') are stored
# as text, so they come back as strings rather than lists when the CSV is read again.
df.to_csv(r'/content/drive/MyDrive/mike legal task/tokenized_dataframe.csv', index=False, encoding='utf-8')

In [None]:
nlp_df = pd.read_csv(r'/content/drive/MyDrive/mike legal task/tokenized_dataframe.csv')

# Files whose text did not match the clause regex (this was the case for
# 'consulting agreement- nikhil d') have an empty 'clause', which read_csv loads as NaN.
# Such rows are dropped by content rather than by a hard-coded row position, because the
# position depends on the order in which the files happened to be listed.
has_clause = nlp_df['clause'].fillna('').astype(str).str.strip().ne('')
nlp_df = nlp_df[has_clause].reset_index(drop=True)

#This is our final dataset
nlp_df.to_csv(r'/content/drive/MyDrive/mike legal task/final_dataframe.csv', index=False)

In [None]:
# Reload the final dataset. CSV stores the list-valued columns as text such as "['a', 'b']",
# so they are parsed back into real Python lists here; code that expects a list of
# sentences would otherwise end up iterating over the characters of that text.
new_df = pd.read_csv(
    r'/content/drive/MyDrive/mike legal task/final_dataframe.csv',
    converters={
        'preprocessed_clause': ast.literal_eval,
        'cleaned_text': ast.literal_eval,
    },
)
new_df

Unnamed: 0,filename,clause,preprocessed_clause,cleaned_text
0,20201104_tui_investment_commitment_agreement f...,DEFINITIONS AND INTERPRETATION\t5 COMMITMENT\t...,['DEFINITIONS AND INTERPRETATION 5 COMMITMENT ...,['DEFINITIONS INTERPRETATION COMMITMENT REPRES...
1,20201028_myttech_employmentagreement,DEFINITIONS AND INTERPRETATION\t4 EMPLOYMENT\t...,['DEFINITIONS AND INTERPRETATION 4 EMPLOYMENT ...,['DEFINITIONS INTERPRETATION EMPLOYMENT REMUNE...
2,20201023_gg_loan agreement,The Borrower is engaged in the business of agr...,['The Borrower is engaged in the business of a...,"['Borrower engaged business agriculture . ', ..."
3,demo joint venture agreement,DEFINITIONS AND INTERPRETATION\t4 EFFECTIVE DA...,['DEFINITIONS AND INTERPRETATION 4 EFFECTIVE D...,['DEFINITIONS INTERPRETATION EFFECTIVE DATE IN...
4,contract sale goods seller friendly version,Sale of Goods. The Seller shall sell to the Bu...,"['Sale of Goods.', 'The Seller shall sell to t...","['Sale Goods .', 'Seller shall sell Buyer Buye..."
5,draft sha_belita_11082015_clean_execution version,Definitions and Interpretations\n .1 Definitio...,['Definitions and Interpretations .1 Definiti...,['Definitions Interpretations .1 Definitions...
6,example shareholder agreement,Definitions\nIn this agreement the following d...,['Definitions In this agreement the following ...,['Definitions agreement following definitions ...
7,llc operating agreement,"1 Formation On, [DATE COMPANY FORMED], the C...",['1 Formation On DATE COMPANY FORMED the ...,[' Formation DATE COMPANY FORMED Compan...
8,rental agreement plain language lease,PARTIES: This agreement is entered into on thi...,"['PARTIES ', 'This agreement is entered into ...","['PARTIES ', 'agreement entered date follo..."
9,offer letter- aatmnirbhar final,Date of Joining: You’re required to join the C...,['Date of Joining You re required to join the...,['Date Joining required join Company latest ...


In [None]:
#Loading the pre-trained BERT model and tokenizer
# Both objects are created once at module level; generate_bert_embeddings reads
# `tokenizer` and `bert_model` from here each time it is called.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)


def generate_bert_embeddings(text_list):
    """
    Generates BERT embeddings for the sentences of one document using a pre-trained BERT model.

    The sentences are joined with single spaces into one text, that text is
    tokenized (truncated by the tokenizer to the model's maximum input length)
    and passed through the model once.

    Args:
        text_list (list of str or str): The sentences to embed. A string holding the
            text form of such a list (what a list column becomes after a round trip
            through CSV) is also accepted and parsed into a list first.

    Returns:
        tuple of tensors: A tuple holding a single tensor, the model's last hidden
            state for the combined text.

    NOTE(review): this is not one embedding per sentence, and sentences beyond the
    truncation limit do not contribute. Confirm whether per-sentence embeddings
    were intended before relying on this output.
    """
    # Body and docstring now share one indentation level; the mixed 2/4-space
    # indentation made this definition a syntax error.
    if isinstance(text_list, str):
        # Without parsing, ' '.join below would insert a space between every
        # character of the string instead of joining sentences.
        text_list = ast.literal_eval(text_list)

    combined_text = ' '.join(text_list)
    #Tokenizing the combined text
    inputs = tokenizer(combined_text, padding=True, truncation=True, return_tensors='pt')

    #Generating BERT embeddings
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Only one text is encoded, so the earlier `.split(len(text_list))` could only ever
    # yield one chunk, and raised for an empty list. Return that 1-tuple directly.
    return (outputs.last_hidden_state,)

#Applying the function to the clauses
# Each row's 'cleaned_text' value is passed to generate_bert_embeddings and the
# returned value is stored in that row's 'bert_embeddings' cell.
new_df['bert_embeddings'] = new_df['cleaned_text'].apply(generate_bert_embeddings)


In [None]:
# Display the frame to inspect the 'bert_embeddings' column
new_df

Unnamed: 0,filename,clause,preprocessed_clause,cleaned_text,bert_embeddings
0,20201104_tui_investment_commitment_agreement f...,DEFINITIONS AND INTERPRETATION\t5 COMMITMENT\t...,['DEFINITIONS AND INTERPRETATION 5 COMMITMENT ...,['DEFINITIONS INTERPRETATION COMMITMENT REPRES...,"([[tensor([-5.9632e-01, 9.1682e-01, 3.3475e-..."
1,20201028_myttech_employmentagreement,DEFINITIONS AND INTERPRETATION\t4 EMPLOYMENT\t...,['DEFINITIONS AND INTERPRETATION 4 EMPLOYMENT ...,['DEFINITIONS INTERPRETATION EMPLOYMENT REMUNE...,"([[tensor([-5.8181e-01, 9.1152e-01, 3.2992e-..."
2,20201023_gg_loan agreement,The Borrower is engaged in the business of agr...,['The Borrower is engaged in the business of a...,"['Borrower engaged business agriculture . ', ...","([[tensor([-5.8796e-01, 9.4792e-01, 2.3739e-..."
3,demo joint venture agreement,DEFINITIONS AND INTERPRETATION\t4 EFFECTIVE DA...,['DEFINITIONS AND INTERPRETATION 4 EFFECTIVE D...,['DEFINITIONS INTERPRETATION EFFECTIVE DATE IN...,"([[tensor([-4.6337e-01, 9.5671e-01, 2.1772e-..."
4,contract sale goods seller friendly version,Sale of Goods. The Seller shall sell to the Bu...,"['Sale of Goods.', 'The Seller shall sell to t...","['Sale Goods .', 'Seller shall sell Buyer Buye...","([[tensor([-4.3218e-01, 1.1963e+00, 1.8555e-..."
5,draft sha_belita_11082015_clean_execution version,Definitions and Interpretations\n .1 Definitio...,['Definitions and Interpretations .1 Definiti...,['Definitions Interpretations .1 Definitions...,"([[tensor([-5.4522e-01, 9.7492e-01, 3.7323e-..."
6,example shareholder agreement,Definitions\nIn this agreement the following d...,['Definitions In this agreement the following ...,['Definitions agreement following definitions ...,"([[tensor([-5.3166e-01, 9.7993e-01, 2.3605e-..."
7,llc operating agreement,"1 Formation On, [DATE COMPANY FORMED], the C...",['1 Formation On DATE COMPANY FORMED the ...,[' Formation DATE COMPANY FORMED Compan...,"([[tensor([-4.0236e-01, 1.1158e+00, 9.8057e-..."
8,rental agreement plain language lease,PARTIES: This agreement is entered into on thi...,"['PARTIES ', 'This agreement is entered into ...","['PARTIES ', 'agreement entered date follo...","([[tensor([-4.9929e-01, 9.8173e-01, 1.5926e-..."
9,offer letter- aatmnirbhar final,Date of Joining: You’re required to join the C...,['Date of Joining You re required to join the...,['Date Joining required join Company latest ...,"([[tensor([-4.3523e-01, 9.7234e-01, 5.7593e-..."


In [None]:
# The CSV keeps the text columns, but a tensor is written there only as its printed
# (abbreviated) form, from which the numbers cannot be recovered. The embeddings are
# therefore also saved losslessly next to it, one entry per row of new_df.
new_df.to_csv(r'/content/drive/MyDrive/mike legal task/embeddings_dataframe.csv', index=False)
torch.save(new_df['bert_embeddings'].tolist(), r'/content/drive/MyDrive/mike legal task/bert_embeddings.pt')
