In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
# Define the directory containing the text files in Google Drive
text_files_dir = "/content/drive/MyDrive/AIML/Capstone-Project/data/CUAD_v1/full_contract_txt"  # Replace with your actual folder path


In [6]:
import os
import re
import spacy
import pandas as pd
from glob import glob

# Load the spaCy model for NLP tasks
nlp = spacy.load('en_core_web_sm')


In [8]:
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (optional, depending on context)
    return text


In [9]:
def remove_noise(text):
    text = re.sub(r'EXHIBIT \d+\.\d+', '', text)  # Remove exhibit numbers
    text = re.sub(r'Page \d+', '', text)  # Remove page numbers
    text = re.sub(r'CONFIDENTIAL', '', text)  # Remove confidentiality markers
    return text


In [10]:
def mark_redacted_text(text):
    text = re.sub(r'\[?\*+\]?', '[REDACTED]', text)  # Handles cases like "***" or "[***]"
    text = re.sub(r'_+', '[REDACTED]', text)  # Handles cases like "___"
    return text


In [11]:
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]  # Word-level tokenization


In [12]:
def extract_named_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


In [13]:
def handle_special_characters(text):
    text = re.sub(r'[^\w\s\$]', '', text)  # Remove all special characters except $
    return text


In [14]:
def segment_text_into_paragraphs(text):
    paragraphs = text.split('\n\n')  # Simple segmentation based on double newlines
    return [para.strip() for para in paragraphs if para.strip()]


In [15]:
def preprocess_document(text):
    text = clean_text(text)
    text = remove_noise(text)
    text = mark_redacted_text(text)
    text = handle_special_characters(text)
    paragraphs = segment_text_into_paragraphs(text)
    named_entities = extract_named_entities(text)
    return paragraphs, named_entities


In [16]:
# Define the directory containing the text files
# text_files_dir = "/path/to/your/text/files"  # Replace with your actual directory path

# Process all files in the directory
all_preprocessed_data = []

for file_path in glob(os.path.join(text_files_dir, "*.txt")):
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_text = file.read()
        preprocessed_paragraphs, named_entities = preprocess_document(raw_text)
        for para in preprocessed_paragraphs:
            all_preprocessed_data.append({
                "file_name": os.path.basename(file_path),
                "paragraph": para,
                "named_entities": named_entities
            })


# Convert the list of dictionaries to a DataFrame
df_preprocessed = pd.DataFrame(all_preprocessed_data)

# Save the preprocessed data to a CSV file in Google Drive
output_path = "/content/drive/MyDrive/preprocessed_legal_documents.csv"
df_preprocessed.to_csv(output_path, index=False)


In [17]:
# Display a sample of the preprocessed data for manual review
print(df_preprocessed.head())


                                           file_name  \
0  CytodynInc_20200109_10-Q_EX-10.5_11941634_EX-1...   
1  BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGRE...   
2  CerenceInc_20191002_8-K_EX-10.4_11827494_EX-10...   
3  FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...   
4  KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...   

                                           paragraph  \
0  exhibit 105 certain identified information has...   
1  exhibit 993 case 1810248 mfw doc 6321 filed 04...   
2  exhibit 104 intellectual property agreement by...   
3  exhibit 994 strategic alliance agreement edgef...   
4  exhibit 1023 corporate address fannin south pr...   

                                      named_entities  
0  [(105, CARDINAL), (december 17 2019, DATE), (c...  
1  [(993, CARDINAL), (1810248, DATE), (6321, CARD...  
2  [(104, CARDINAL), (nuance communications inc, ...  
3  [(994, CARDINAL), (this 17t h day of february ...  
4  [(1023, DATE), (140, CARDINAL), (7707, CARDINA..