In [None]:
#REQUIREMENTS
import collections
import os
import re

import numpy as np
import pandas as pd
import pdfplumber
from scipy.spatial.distance import cosine
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
def remove_words(column, word_list):
    """
    Strip every entry of ``word_list`` out of each string in ``column``.

    Matching is case-insensitive and purely textual (no word boundaries), so a
    short entry is also removed when it occurs inside a longer word. Longer
    entries are tried first so that e.g. 'PLLC' wins over 'LLC'.

    Args:
        column (iterable): Values to clean; non-string values pass through untouched.
        word_list (iterable of str): Literal fragments to delete.

    Returns:
        list: One cleaned value per input value, with runs of whitespace
              collapsed to a single space and the ends trimmed.
    """
    # Escape regex metacharacters so every entry is matched literally.
    escaped_terms = [re.escape(term) for term in word_list]
    # Longest alternative first; regex alternation is first-match-wins.
    escaped_terms.sort(key=len, reverse=True)
    term_matcher = re.compile('|'.join(escaped_terms), re.IGNORECASE)

    def _strip_terms(value):
        # Non-strings (NaN, None, numbers) are returned exactly as received.
        if not isinstance(value, str):
            return value
        without_terms = term_matcher.sub('', value)
        # Removal can leave doubled spaces behind (e.g. "Hello  World").
        return re.sub(r'\s+', ' ', without_terms).strip()

    return [_strip_terms(entry) for entry in column]

# Fragments stripped from law-firm names (via remove_words) before fuzzy matching.
# remove_words matches these case-insensitively and without word boundaries, so
# short entries such as 'of', 'the', 'and' or 'LC' are also removed from inside
# longer words. Entries carrying a leading/trailing space (' PC', 'PA ', ' SC')
# only match when that space is present in the name.
# NOTE(review): 'Pressional' looks like a truncated 'Professional' - confirm.
remove_list = ['P.C.', '&', 'LLC', 'LLP', 'LAW', 'OFFICE', 'PLLC', 'P.A.', ' PC', 'Group', 'Legal', 'Aid', 'Offices', 'of',
               'and', 'International', 'Inc.', 'P.L.L.C', 'the', 'Associates', 'PA ', 'APLC', 'L.L.P.', 'L.L.C.',
               'P. A.', 'PLC', 'APC', 'firm', 'LPA', 'P.L.C', 'P.A', 'St. Louis', 'Pressional', 'Corp', "LC", '- MIAMI FL',
               'ETC', '(Tulsa)', ' SC', '- Philadelphia', '(US)']

def clean_text(text):
    """
    Normalise a value for word-level comparison.

    Strings are lower-cased, stripped of every character that is neither a
    word character nor whitespace, and trimmed. Anything that is not a string
    yields the empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        return without_punctuation.strip()
    return ""
# 2️⃣ Jaccard similarity: intersection / union of unique words
def jaccard_similarity(a, b):
    """
    Jaccard similarity of the unique words in ``a`` and ``b``.

    Both inputs are normalised with clean_text first. Returns 1.0 when both
    word sets are empty, 0.0 when exactly one is empty, otherwise
    |intersection| / |union|.
    """
    tokens_a = set(clean_text(a).split())
    tokens_b = set(clean_text(b).split())

    if not (tokens_a or tokens_b):
        return 1.0  # nothing on either side: treated as identical
    if not (tokens_a and tokens_b):
        return 0.0  # only one side has words

    shared = tokens_a.intersection(tokens_b)
    combined = tokens_a.union(tokens_b)
    return len(shared) / len(combined)
# 3️⃣ Check if any b_entry matches a_entry with the threshold
def has_match(a_entry, b_series, threshold=0.5):  # threshold can be tuned
    """
    Return the first element of ``b_series`` whose Jaccard similarity with
    ``a_entry`` is at least ``threshold``; return False when none qualifies.

    Candidates are examined in iteration order and the scan stops at the first
    hit, so this is "first acceptable match", not "best match".
    """
    return next(
        (
            candidate
            for candidate in b_series
            if jaccard_similarity(a_entry, candidate) >= threshold
        ),
        False,
    )
# 4️⃣ Apply matching to lists in Column A
def match_law_firm_lists(df, column_name, threshold=0.65, amlaw_names=None):
    """
    Fuzzy-match every firm name in a column of lists against the AmLaw list.

    Args:
        df (pd.DataFrame): Frame holding the column to match.
        column_name (str): Column whose cells are lists of firm-name strings.
        threshold (float): Minimum Jaccard similarity passed to has_match.
        amlaw_names (iterable, optional): Already-cleaned reference firm names.
            When omitted, the names are loaded from the default AmLaw workbook
            and cleaned with remove_words/remove_list (the original behaviour).
            Passing them in avoids re-reading the workbook on every call and
            lets callers match against a different reference list.

    Returns:
        pd.Series: For each row, a list with one entry per firm name - the
        matched reference name, or False when nothing matched. Cells that are
        not lists produce an empty list.
    """
    if amlaw_names is None:
        AmLaw = pd.read_excel("C:/Users/mark.thomson/ML Proj/AmLaw 200 - Oct 24.xlsx")
        amlaw_names = remove_words(AmLaw['Firm Name'], remove_list)

    def match_list(law_firm_list):
        # NaN / missing cells arrive as floats or None rather than lists.
        if not isinstance(law_firm_list, list):
            return []
        return [has_match(firm_name, amlaw_names, threshold) for firm_name in law_firm_list]

    return df[column_name].apply(match_list)

def clean_and_convert(value):
    """
    Convert a demand amount to an int, or NaN when it cannot be interpreted.

    Accepts strings such as "$75,000" or "$1,000,000.00" (currency symbol and
    thousands separators are dropped; any fractional part is truncated) as well
    as values that are already numeric, which previously fell through to NaN
    because they have no ``.replace`` method.

    Args:
        value: Raw cell value (str, number, None/NaN, ...).

    Returns:
        int | float: The integer amount, or ``np.nan`` for missing or
        unparseable input.
    """
    # bool is a subclass of int; a True/False cell is not a dollar amount.
    if isinstance(value, bool):
        return np.nan

    if isinstance(value, (int, float, np.integer, np.floating)):
        if pd.isna(value):
            return np.nan
        try:
            return int(value)
        except (ValueError, OverflowError):  # inf cannot be converted
            return np.nan

    # None, pd.NA, lists and any other non-text value are treated as missing.
    if not isinstance(value, str):
        return np.nan

    cleaned_value = value.replace('$', '').replace(',', '').strip()
    try:
        return int(cleaned_value)
    except ValueError:
        pass
    try:
        # Handles amounts written with cents, e.g. "1000000.00".
        return int(float(cleaned_value))
    except (ValueError, OverflowError):
        return np.nan

def find_matching_clients(parties_string, top_client_list):
    """
    Return the parties that mention any of the given client names.

    The parties string is split on '/', each piece is trimmed, and a piece is
    kept when any client name occurs in it as a stand-alone token
    (case-insensitive). "Stand-alone" means the name is not immediately
    preceded or followed by a word character, so 'TD' matches 'TD Bank' but not
    'Ltd'. Lookarounds are used instead of ``\\b`` because ``\\b`` after a name
    ending in punctuation (e.g. 'S.E.C.', 'J.P.') only matches when a word
    character follows, which made such names miss 'S.E.C. v. Smith'.

    Args:
        parties_string (str): Parties separated by '/'.
        top_client_list (list): Client names to look for. Surrounding
            whitespace on a name is ignored; empty names are skipped.

    Returns:
        list | bool: The matching party strings in their original order, or
        False when there is no match or ``parties_string`` is not a string.
    """
    if not isinstance(parties_string, str):
        return False  # Handle non-string entries (e.g. NaN)

    # Trim each client name: the boundary checks below replace the need for
    # padding spaces such as 'TD '. Drop blanks, which would match everything.
    client_names = [
        client.strip()
        for client in top_client_list
        if isinstance(client, str) and client.strip()
    ]
    if not client_names:
        return False

    alternatives = '|'.join(
        re.escape(name) for name in sorted(client_names, key=len, reverse=True)
    )
    client_pattern = re.compile(
        r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE
    )

    individual_parties = [p.strip() for p in parties_string.split('/')]
    matching_clients = [
        party for party in individual_parties if client_pattern.search(party)
    ]

    return matching_clients if matching_clients else False

# Client names searched for (case-insensitively) in each docket's parties by
# find_matching_clients. 'Deutsche' was added because the earlier spelling
# 'Deustche' cannot match the real name; the old entry is kept so that
# nothing which previously matched is lost.
top_client_list = ['Securities and Exchange Commission', "SEC", 'S.E.C.', 'JPMorgan', 'J.P.', 'Chase', 'Bank of America', 'bofa', 'Citigroup', 'Citibank',
                       'citi', 'wells fargo', 'goldman sachs', 'Morgan Stanley', 'U.S. Bancorp', 'U.S. Bank', 'PNC', 'TD ', 'Capital One', 'C1', 'Charles Schwab',
                       'BNY', 'The Bank of New York', 'State Street', 'BMO', 'American Express', 'Amex', 'UBS', 'M&T', 'Ameriprise', 'Santander', 'Northern Trust',
                       'Deustche', 'Deutsche']

def amlaw_match(df, AmLaw_path=0):
    """
    Reduce a docket export to the cases worth reviewing (phase 1).

    Steps:
      1. Load and clean the AmLaw firm names (from ``AmLaw_path`` when given,
         otherwise from the default workbook).
      2. Clean and split the plaintiff/defendant firm columns and fuzzy-match
         each firm against the AmLaw names.
      3. Keep rows where at least one side has an AmLaw match.
      4. Flag parties that mention a top client, parse the demand amount and
         apply the demand / statute filters.
      5. Sort rows that mention a top client to the top.

    The input frame is not modified. Previously the matching step re-read the
    hard-coded workbook regardless of ``AmLaw_path`` and helper columns were
    written onto the caller's frame.

    Args:
        df (pd.DataFrame): Docket export with at least the columns listed in
            ``output_columns`` below.
        AmLaw_path (str | int): Path to the AmLaw workbook; any falsy value
            selects the default path.

    Returns:
        pd.DataFrame: Filtered rows (original index preserved) with the added
        columns 'plaintiff_matches', 'defendant_matches' and
        'contains_top_client'.
    """
    default_workbook = "C:/Users/mark.thomson/ML Proj/AmLaw 200 - Oct 24.xlsx"
    AmLaw = pd.read_excel(AmLaw_path if AmLaw_path else default_workbook)
    AmLaw_cleaned = remove_words(AmLaw['Firm Name'], remove_list)

    # Same similarity cut-off that match_law_firm_lists uses by default.
    match_threshold = 0.65

    def match_firm_column(firm_column):
        """Clean a firm column, split it into firms and match each firm."""
        matched_rows = []
        for item in remove_words(firm_column, remove_list):
            if isinstance(item, str):
                matched_rows.append(
                    [has_match(name, AmLaw_cleaned, match_threshold) for name in item.split(' /')]
                )
            else:
                # Missing firm information: nothing to match.
                matched_rows.append([])
        return matched_rows

    def count_hits(matches):
        """Number of entries that are an actual match rather than False."""
        return sum(1 for entry in matches if entry is not False)

    result = df.copy()
    result['plaintiff_matches'] = pd.Series(
        match_firm_column(df['plaintiff_attorney_firm']), index=df.index, dtype=object
    )
    result['defendant_matches'] = pd.Series(
        match_firm_column(df['defendant_attorney_firm']), index=df.index, dtype=object
    )

    has_amlaw_firm = (result['plaintiff_matches'].apply(count_hits) > 0) | \
                     (result['defendant_matches'].apply(count_hits) > 0)

    output_columns = ['title', 'date_filed', 'last_updated', 'nature_of_suit', 'statute', 'docket_number', 'court',
                      'attorney', 'attorney_email', 'demand', 'parties', 'plaintiff_party', 'defendant_party',
                      'defendant_attorney_firm', 'plaintiff_attorney_firm', 'plaintiff_matches', 'defendant_matches']
    # .copy() so the column assignments below act on an independent frame.
    result = result.loc[has_amlaw_firm, output_columns].copy()

    result['contains_top_client'] = result['parties'].apply(
        lambda x: find_matching_clients(x, top_client_list)
    )
    result['demand'] = result['demand'].apply(clean_and_convert)

    demand = result['demand']
    demand_ok = (demand == 75000) | (demand == 750000) | (demand >= 1000000) | demand.isna()
    statute_ok = ~result['statute'].isin(
        ['12:635 Breach of Insurance Contract', '28:1441 Petition for Removal- Breach of Contract']
    )
    result = result[demand_ok & statute_ok]

    # Rows whose 'contains_top_client' is a list (not False) sort first.
    df_sorted = result.sort_values(
        by='contains_top_client',
        key=lambda x: x.apply(lambda val: val is not False),
        ascending=False,
    )

    return df_sorted

In [None]:
def extract_legal_complaint_text_from_keyword(pdf_path, start_keyword="complaint"):
    """
    Extract a complaint's text from a PDF and estimate the dollar demand.

    The text of all pages is concatenated (with a "--- Page N ---" separator
    before every page after the first). The complaint body runs from the first
    case-insensitive occurrence of ``start_keyword`` up to the first stop
    marker ('dated:', else 'respectfully submitted', else the OCR variant
    'respedfully submitted'); with no stop marker it runs to the end of the
    text. If the keyword is missing, a message is printed and the whole text is
    used.

    Dollar amounts are looked for in the "prayer for relief" section first
    (high confidence); if that yields nothing, the whole complaint body is
    scanned instead (low confidence).

    Args:
        pdf_path (str): Path to the complaint PDF.
        start_keyword (str): Keyword marking the start of the complaint.

    Returns:
        list: ``[complaint, largest_number, last_number, high_confidence]``
        where ``complaint`` is the extracted text, ``largest_number`` and
        ``last_number`` are ints (or False when no amount was found) and
        ``high_confidence`` tells whether the amounts came from the relief
        section. When the PDF cannot be read, a message is printed and
        ``["", False, False, False]`` is returned so that every result has the
        same four-item shape.
    """
    page_chunks = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # Add a separator for better readability between pages in the output
                if page_num > 0:
                    page_chunks.append("\n--- Page {} ---\n\n".format(page_num + 1))
                # Guard against a page with no extractable text yielding None.
                page_chunks.append(page.extract_text() or "")
    except FileNotFoundError:
        print(f"Error: The legal complaint file '{pdf_path}' was not found. Please check the path.")
        return ["", False, False, False]
    except Exception as e:
        # NOTE(review): the former `except pdfplumber.pdf.PDFError` clause was
        # folded into this handler; could not confirm that attribute exists,
        # and if it does not, evaluating the clause would itself raise.
        print(f"An unexpected error occurred during text extraction: {e}")
        print("This might be due to a corrupted PDF or an unsupported PDF format.")
        return ["", False, False, False]

    full_text = "".join(page_chunks)
    full_text_lower = full_text.lower()

    # All hooks are compared against lower-cased text. The former upper-case
    # entry '\nPRAYER FOR RELIEF\n' could therefore never match and is omitted.
    demand_hooks = ['requested relief', 'prayer for relief', 'request for relief', 'prayer', 'prays for relief', 'relief requested',
                    '\nrequested relief\n', '\nprayer for relief\n', '\nrequest for relief\n', '\nprayer\n', '\nprays for relief\n', '\nrelief requested\n']

    def dollar_amounts(text):
        """Return every '$<digits,commas>' amount in ``text`` as an int."""
        amounts = []
        for raw in re.findall(r"\$\s*([\d,]+)", text):
            try:
                amounts.append(int(raw.replace(',', '')))
            except (ValueError, TypeError):
                # e.g. "$," leaves nothing once the commas are removed.
                continue
        return amounts

    # End of the complaint body: first marker found, in priority order.
    # None (rather than -1) means "slice to the end"; -1 dropped the last char.
    stop_index = None
    for marker in ('dated:', 'respectfully submitted', 'respedfully submitted'):
        marker_index = full_text_lower.find(marker)
        if marker_index != -1:
            stop_index = marker_index
            break

    start_index = full_text_lower.find(start_keyword.lower())
    if start_index != -1:
        complaint = full_text[start_index:stop_index]
    else:
        print(f"The keyword '{start_keyword}' was not found in the legal complaint.")
        complaint = full_text

    largest_number = False
    last_number = False
    high_confidence = False
    hook_match = False

    # As before, every hook present in the text is evaluated in list order and
    # the last one found decides whether the relief section supplied amounts.
    for hook in demand_hooks:
        relief_index = full_text_lower.find(hook)
        if relief_index == -1:
            continue
        amounts = dollar_amounts(full_text_lower[relief_index:stop_index])
        if amounts:
            largest_number = max(amounts)
            last_number = amounts[-1]
            high_confidence = True
            hook_match = True
        else:
            hook_match = False

    if not hook_match:
        # Fall back to the whole complaint body. The confidence flag is reset
        # so it always describes the numbers that are actually returned.
        amounts = dollar_amounts(complaint)
        high_confidence = False
        if amounts:
            largest_number = max(amounts)
            last_number = amounts[-1]
        else:
            largest_number = False
            last_number = False

    return [complaint, largest_number, last_number, high_confidence]
    
# Keyword phrases that are embedded with FinBERT further down (see
# positive_keyword_embeddings / negative_keyword_embeddings) and compared with
# the lemmas of each complaint.
# NOTE(review): 'algorithimic trading' and 'mortage backed security' look
# misspelled. They are embedded exactly as written, so confirm whether the
# spelling should be corrected.
positive_words = ['ponzi scheme', 'venture capital', 'private equity', 'broker dealer', 'fraudulent digital asset trading', 'misappropriated funds', 
                  'mutual fund investigation', 'dually registered personnel', 'misleading investors', 'intellectual property', 'contract dispute', 'master limited partnership', 
                  'algorithimic trading', 'exchange traded fund', 'financial arbitrage', 'venture capital funds', 'fee structure', 'financial bonds','financial bond', 'breach of contract', 
                  'annuities', 'financial futures', 'futures contracts', 'pump and dump', 'sovereign bonds', 'repo financing', 'RMBS', 'residential mortgage-backed security', 'CDO Valuation', 
                  'collateralized debt obligation', 'mortage backed security', 'financial due diligence', 'asset classification', 'financial indentures', 'certificates of deposit', 
                  'special purpose vehicle', 'asset-backed security', 'unauthorized transfer', 'fiduciary duty', 'promissory note']

# Embedded as well, but the negative-keyword comparison in
# get_recommendation_score_finbert_keywords currently sits inside a string
# literal, so these do not influence the score.
negative_words = ['Crypto', 'Precious Metals']

In [None]:
def get_pdf_strings_as_dict(folder_path):
    """
    Run the complaint extractor over every PDF directly inside a folder.

    Args:
        folder_path (str): Folder to scan (not recursive).

    Returns:
        dict: Maps each PDF's file name without its extension (e.g. "report1")
              to whatever extract_legal_complaint_text_from_keyword returned
              for that file. Empty when the folder does not exist (a message
              is printed) or contains no PDF files.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: Folder not found at '{folder_path}'")
        return {}

    extracted_by_name = {}
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        looks_like_pdf = entry.lower().endswith('.pdf')
        # Skip sub-directories and anything without a .pdf extension.
        if not (os.path.isfile(entry_path) and looks_like_pdf):
            continue

        extraction = extract_legal_complaint_text_from_keyword(entry_path)
        if extraction is None:
            continue

        # The key is the file name with its extension removed.
        stem, _extension = os.path.splitext(entry)
        extracted_by_name[stem] = extraction

    return extracted_by_name

In [None]:
# Load the FinBERT tokenizer and model once at module level; both are used as
# globals by get_finbert_embedding. The sequence-classification variant is
# loaded, but only its hidden states are read there (output_hidden_states=True).
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

def get_finbert_embedding(text):
    """
    Embed ``text`` with FinBERT by averaging the final layer's token vectors.

    The text is tokenised (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is mean-pooled
    over the sequence dimension.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions squeezed out.
    """
    encoded = finbert_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = finbert_model(**encoded, output_hidden_states=True)

    # hidden_states is a tuple with one tensor per layer; the last entry is the
    # final layer, shaped [batch_size, sequence_length, hidden_size].
    final_layer = model_output.hidden_states[-1]
    pooled = final_layer.mean(dim=1)
    return pooled.squeeze().numpy()

# Pre-compute one FinBERT embedding per keyword phrase. Requires the
# 'positive_words' and 'negative_words' lists and get_finbert_embedding to be
# defined already (run the earlier cells first).


positive_keyword_embeddings = {
    kw: get_finbert_embedding(kw) for kw in positive_words
}
negative_keyword_embeddings = {
    kw: get_finbert_embedding(kw) for kw in negative_words
}

# Drop keywords without an embedding. get_finbert_embedding as defined in this
# notebook always returns an array, so these filters currently remove nothing.
positive_keyword_embeddings = {k: v for k, v in positive_keyword_embeddings.items() if v is not None}
negative_keyword_embeddings = {k: v for k, v in negative_keyword_embeddings.items() if v is not None}

print("Positive keyword embeddings generated.")
print("Negative keyword embeddings generated.")

In [None]:
# spaCy pipeline used by get_recommendation_score_finbert_keywords to tokenise
# and lemmatise complaint text.
nlp = spacy.load('en_core_web_lg')
# A lemma counts as a keyword hit when (1 - cosine distance) between its FinBERT
# embedding and a keyword embedding is strictly greater than this value.
SEMANTIC_SIMILARITY_THRESHOLD = 0.85 # Adjust this threshold (0.0 to 1.0)

def get_recommendation_score_finbert_keywords(new_complaint_text, verbose=True):
    """
    Score a complaint by how many of its words resemble the positive keywords.

    The text is tokenised with spaCy; alphabetic, non-stop-word tokens are
    reduced to lower-cased lemmas and counted. Each distinct lemma is embedded
    with FinBERT and compared with every positive keyword embedding. The first
    keyword whose similarity (1 - cosine distance) exceeds
    SEMANTIC_SIMILARITY_THRESHOLD makes all occurrences of that lemma count as
    positive matches.

    Negative-keyword matching is not performed, so the two negative figures in
    the result are always 0.

    Args:
        new_complaint_text (str): Complaint text to score.
        verbose (bool): When True (default, the previous behaviour) print the
            number of distinct lemmas and every lemma/keyword hit.

    Returns:
        list: ``[positive_fraction, positive_count, negative_fraction,
        negative_count]`` where the fractions are counts divided by the total
        number of spaCy tokens. Non-string or empty input yields
        ``[0, 0, 0, 0]`` so callers can always index the result (an int 0 was
        returned for empty documents before).
    """
    if not isinstance(new_complaint_text, str):
        return [0, 0, 0, 0]

    doc = nlp(new_complaint_text)
    token_num = len(doc)
    if token_num == 0:
        return [0, 0, 0, 0]

    # `Counter` was referenced unqualified although only the `collections`
    # module is imported, which raised NameError on every call.
    lemma_counts = collections.Counter(
        token.lemma_.lower().strip()
        for token in doc
        if token.is_alpha and not token.is_stop
    )
    if verbose:
        print(len(lemma_counts))

    pos_match_count = 0
    neg_match_count = 0  # negative matching is disabled; kept for the return shape

    # Embed each distinct lemma once and weight a hit by its frequency.
    for lemma, occurrences in lemma_counts.items():
        token_finbert_embedding = get_finbert_embedding(lemma)
        if token_finbert_embedding is None:
            continue

        for pk_text, pk_embed in positive_keyword_embeddings.items():
            similarity = 1 - cosine(token_finbert_embedding, pk_embed)
            if similarity > SEMANTIC_SIMILARITY_THRESHOLD:
                pos_match_count += occurrences
                if verbose:
                    print(lemma)
                    print(pk_text)
                    print(similarity)
                    print("-----")
                break  # one keyword hit per lemma is enough

    return [pos_match_count / token_num, pos_match_count, neg_match_count / token_num, neg_match_count]

In [None]:
def get_corpus(filepath):
    """
    Build a DataFrame with one row per complaint PDF found in ``filepath``.

    Args:
        filepath (str): Folder containing the complaint PDFs.

    Returns:
        pd.DataFrame: Columns 'Case Key' (PDF file name without extension),
        'Comaplaint', 'Largest Number', 'Last Number' and 'Demand Confidence'.
        The frame is empty (but has these columns) when no PDF was found;
        previously that case failed while renaming columns.
    """
    # NOTE: the spelling 'Comaplaint' is relied on by later cells - keep it.
    info_columns = ['Comaplaint', 'Largest Number', 'Last Number', 'Demand Confidence']

    corpus = get_pdf_strings_as_dict(filepath)

    rows = []
    for case_key, info in corpus.items():
        if isinstance(info, (list, tuple)) and len(info) == len(info_columns):
            rows.append([case_key, *info])
        else:
            # Extraction result without the four-item shape (e.g. a bare
            # string): keep it as the complaint text, leave the rest missing.
            rows.append([case_key, info, np.nan, np.nan, np.nan])

    return pd.DataFrame(rows, columns=['Case Key'] + info_columns)

def remove_false_items(input_list):
    """
    Return a new list with every ``False`` entry removed.

    Besides real lists this accepts the forms a match column takes after a
    round trip through CSV/Excel or a merge:

    * a string holding a list literal (e.g. "[False, 'Jones Day']") is parsed
      back into a list first;
    * any other non-empty string is treated as a single item;
    * None, NaN and other non-iterable values give an empty list instead of
      raising TypeError.

    Only the literal ``False`` is removed; other falsy values such as 0 or ''
    are kept.
    """
    import ast  # local import: only needed for string input

    if isinstance(input_list, str):
        text = input_list.strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a Python literal: keep the text itself as the only item.
            parsed = [text] if text else []
        items = parsed if isinstance(parsed, (list, tuple)) else [parsed]
    else:
        try:
            items = list(input_list)
        except TypeError:
            # None / NaN (e.g. rows without a merge partner) are not iterable.
            return []

    return [item for item in items if item is not False]

In [None]:
# PHASE 1 INPUT: Bloomberg Law docket-search export (CSV). Change the file path
# below to point at your own export.
raw_bloombeg_data = pd.read_csv("C:/Users/mark.thomson/ML Proj/bloomberg_law_docket_search_results_2025-08-07.csv")
# Keep only the dockets that pass amlaw_match's firm, demand and statute filters.
phase1_blaw_data = amlaw_match(raw_bloombeg_data)

In [None]:
# PHASE 1 OUTPUT: change the file path to where the CSV should be written.
# The DataFrame index is written as a column named 'case_key'.
phase1_blaw_data.to_csv("C:/Users/mark.thomson/ML Proj/phase_1_2025-08-07 v6.csv", index=True, index_label='case_key')

In [None]:
# PHASE 2 INPUT: folder holding the complaint PDFs. Change the path as needed.
new_corpus = get_corpus('C:/Users/mark.thomson/ML Proj/complaints 8-7-25')
# Score every complaint; 'Comaplaint' is the column name produced by get_corpus.
embedding_scores = new_corpus['Comaplaint'].apply(get_recommendation_score_finbert_keywords)
# embedding_scores[i] is a label lookup, so this assumes new_corpus has the
# default 0..n-1 index. Items 0 and 1 of each score are the positive fraction
# and the positive count.
new_corpus['Positive Percent'] = [embedding_scores[i][0] for i in range(len(embedding_scores))]
new_corpus['Positive Count'] = [embedding_scores[i][1] for i in range(len(embedding_scores))]

In [None]:
# PHASE 2 MERGE: enter the (manually completed) phase 1 workbook here.
final_part = pd.read_excel("C:/Users/mark.thomson/ML Proj/phase_1_2025-08-07 v6-filled.xlsx")
# Compare keys as text: the PDF-derived keys in new_corpus are file-name strings.
final_part['case_key'] = [str(i) for i in final_part['case_key']]
# new_corpus names its key column 'Case Key', so merging with on='case_key'
# raised KeyError. how='right' keeps one row per complaint PDF.
final = pd.merge(final_part, new_corpus, left_on='case_key', right_on='Case Key', how='right')


def _is_securities_class_action(complaint_text):
    """1 if the text mentions a class action and rule 10b-5 (any letter case), else 0."""
    if not isinstance(complaint_text, str):
        return 0
    lowered = complaint_text.lower()
    mentions_class_action = 'class action' in lowered or 'class-action' in lowered
    return 1 if mentions_class_action and '10b-5' in lowered else 0


# The complaint text keeps its original letter case, so the check lower-cases
# it first; otherwise headings such as "CLASS ACTION COMPLAINT" are missed.
final['securities fraud class action'] = [_is_securities_class_action(i) for i in final['Comaplaint']]
# NOTE(review): these two lines were flagged as untested. After the Excel round
# trip and the right merge the match columns may hold strings or NaN rather
# than lists - confirm remove_false_items copes with those values.
final['plaintiff_matches'] = final['plaintiff_matches'].apply(remove_false_items)
final['defendant_matches'] = final['defendant_matches'].apply(remove_false_items)

In [None]:
# FINAL OUTPUT: change the file path to where the merged workbook should be written.
final.to_excel('C:/Users/mark.thomson/ML Proj/8-12-25 demo v2.xlsx')