# Installations and Imports

In [30]:
import torch
import os
import json
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import tiktoken
from dotenv import load_dotenv
import time
import ast
import re
import warnings
warnings.filterwarnings('ignore')

# LangChain Import
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Finetuned Model Import
from transformers import BertTokenizer, BertForSequenceClassification
from src.model import PatentSentenceClassifier

# Load OpenaAI API key
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# LLM Sentence Splitting

In [31]:
# Prompt Definition
task = """Your task is to divide a given sentence into sub-sentences.
Insert periods to divide the sentence into meaningful sub-sentences. 
Maintain the original words without any changes. 
Do not use pronouns; instead, repeat the original subjects as needed.

Input Format: A single sentence.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).
"""
output_format = "Input: \"{text}\"\nOutput:"
splitting_prompt = '\n'.join([task, output_format])
print(splitting_prompt)

Your task is to divide a given sentence into sub-sentences.
Insert periods to divide the sentence into meaningful sub-sentences. 
Maintain the original words without any changes. 
Do not use pronouns; instead, repeat the original subjects as needed.

Input Format: A single sentence.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Input: "{text}"
Output:


In [32]:
def count_tokens(text):
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def split_sentence_chatgpt(input_text, prompt, model="gpt-4o", temperature=0, top_p=1):
    
    # Define a prompt template for classification
    prompt_template = PromptTemplate.from_template(prompt)

    # Create an OpenAI LLM instance
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        top_p=top_p,
        max_retries=1,
        max_tokens=1000 
    )

    # Create a runnable sequence
    split_sentence_chain = prompt_template | llm | StrOutputParser()

    # Format prompt
    formatted_prompt = prompt_template.format(text=input_text)
    #print(f"Generated Prompt:\n{formatted_prompt}") # Debugging statement

    # Perform Classification
    output_string = split_sentence_chain.invoke({"text": input_text}).strip()

    # Calculate token count
    input_count = count_tokens(formatted_prompt)
    output_count = count_tokens(output_string)

    print(f"\tUsing: model = '{model}'; temperature = {temperature}; top_p = {top_p}")

    return output_string, formatted_prompt, input_count, output_count

# Test Sentence Splitting

In [67]:
# 1. Import Sentences from test set
test_df = pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
test_df = test_df.query("agreement==True & sent_tag_mc == 'MIX'")
input_text = df['sent'].iloc[3] # select by index

In [None]:
# Select Model
chatgpt_model ='gpt-3.5-turbo'

# Perfrom Sentence Splitting
output_string, formatted_prompt, input_count, output_count = split_sentence_chatgpt(input_text, splitting_prompt, chatgpt_model)

# Validate Output Format
if not output_string:  
    raise ValueError(f"Output is empty.")
try:
    output_string = ast.literal_eval(output_string.replace('\n', ''))
    print(f"Output is in list format.") 
except (SyntaxError, ValueError) as e:
    raise ValueError(f"Output not in list format: {e}")

Output is in list format.


In [72]:
# Initialize ROUGE scorer
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

results = []

for generated_sent in output_string:
    score = rouge(generated_sent, input_text)

    results.append({
        'input_text': input_text,
        'prompt': formatted_prompt,
        'generated_sent': generated_sent,
        #'pred_sent_class': pre_class,
        #'probs': probs,
        'rouge1_precision': round(score['rouge1_precision'].item(), 3),
        'rouge3_precision': round(score['rouge3_precision'].item(), 3),
        'rouge5_precision': round(score['rouge5_precision'].item(), 3),
        'rouge7_precision': round(score['rouge7_precision'].item(), 3),
        'rouge9_precision': round(score['rouge9_precision'].item(), 3),
        'rougeL_precision': round(score['rougeL_precision'].item(), 3),
        'output_string': output_string,
        'input_count': input_count,
        'output_count': output_count,
        #'errors': None,
        #'elapsed_time_sec': time.time() - start_time()
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,input_text,prompt,generated_sent,rouge1_precision,rouge3_precision,rouge5_precision,rouge7_precision,rouge9_precision,rougeL_precision,output_string,input_count,output_count
0,The recovery of the ingestible device may be p...,Divide the given sentence into sub-sentences b...,The recovery of the ingestible device may be p...,1.0,1.0,1.0,1.0,1.0,1.0,[The recovery of the ingestible device may be ...,106,22


# Load Sentence Classification Model

In [33]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set path to checkpoint
checkpoint_name = 'bert-large-uncased_train_10_7'; model_name = "bert-large-uncased"
checkpoint_name = 'bert-for-patents_train_10_7'; model_name = "anferico/bert-for-patents" # https://huggingface.co/anferico/bert-for-patents
checkpoint_path = f"/home/fantoni/patent-sentence-classification/models/finetuning/{checkpoint_name}.ckpt"

# Load Base Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
print('\nBase Tokenizer loaded succesfully.')

# Load Base Model
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)
print('\nBase model loaded succesfully.')

# Load Finetuned Model
model = PatentSentenceClassifier.load_from_checkpoint(
    checkpoint_path,
    model=base_model,
    tokenizer=bert_tokenizer)

model.eval()
model.to(device)
print('\nFinetuned model loaded succesfully.')

# Define Finetuned Tokenizer
tokenizer = model.tokenizer

Using device: cpu

Base Tokenizer loaded succesfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at anferico/bert-for-patents and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Base model loaded succesfully.

Finetuned model loaded succesfully.


# Classify Sentences

In [34]:
def classify_text(model, input_text, device='cpu'):
    
    # Tokenize input
    tokenizer = model.tokenizer  # Assuming tokenizer is part of the model
    inputs = tokenizer(input_text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    
    # Move input to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Define label mapping
    int_to_label = {0: 'FUN', 1: 'STR', 2: 'MIX', 3: 'OTH'}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
        pred_idx = torch.argmax(probs).item()
        pred_class = int_to_label[pred_idx]

    return pred_class, [round(p, 2) for p in probs.tolist()]

In [None]:
# Initialize ROUGE scorer
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

# Import Sentences
test_df = pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
test_df = test_df.query("agreement==True & sent_tag_mc == 'MIX'")
test_df = test_df.head(50) 

# Select Models
chatgpt_model ='gpt-3.5-turbo'

# Initialize results list
results = []

for i, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Sentences"):  
    start_time = time.time()              
    try:
        original_sent = row['sent']
        
        # Split Sentence 
        output_string, formatted_prompt, input_count, output_count = split_sentence_chatgpt(original_sent, splitting_prompt, chatgpt_model)

        # Validate Output Format
        if not output_string:  
            raise ValueError(f"Output is empty.")
        try:
            # Convert string to list
            output_string = ast.literal_eval(output_string)  
            #print(f"Output is in list format.") 
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Output not in list format: {e}")

        # Iterate over the generated sentences
        for generated_sent in output_string:

            # Classify Sub-Sentences
            pre_class, probs = classify_text(model, generated_sent, device)

            # Compute ROUGE Score
            score = rouge(generated_sent, original_sent)
            
            # Append Results
            results.append({
                'sent_id': row['sent_id'],
                'original_sent_class': row['sent_tag_mc'],
                'original_sent': original_sent,
                'prompt': formatted_prompt,
                'generated_sent': generated_sent,
                'pred_sent_class': pre_class,
                'probs': probs,
                'rouge1_precision': round(score['rouge1_precision'].item(), 3),
                'rouge3_precision': round(score['rouge3_precision'].item(), 3),
                'rouge5_precision': round(score['rouge5_precision'].item(), 3),
                'rouge7_precision': round(score['rouge7_precision'].item(), 3),
                'rouge9_precision': round(score['rouge9_precision'].item(), 3),
                'rougeL_precision': round(score['rougeL_precision'].item(), 3),
                'output_string': output_string,
                'input_count': input_count,
                'output_count': output_count,
                'errors': None,
                'elapsed_time_sec': time.time() - start_time
            })

    except Exception as e:
        print(f"Error processing sentence {row['sent_id']}: {str(e)}")
        results.append({
            'sent_id': row['sent_id'],
            'original_sent_class': row['sent_tag_mc'],
            'original_sent': original_sent,
            'prompt': formatted_prompt,
            'generated_sent': None,
            'pred_sent_class': None,
            'probs': None,
            'rouge1_precision': None,
            'rouge3_precision': None,
            'rouge5_precision': None,
            'rouge7_precision': None,
            'rouge9_precision': None,
            'rougeL_precision': None,
            'output_string': output_string,
            'input_count': None,
            'output_count': None,
            'errors': str(e),
            'elapsed_time_sec': time.time() - start_time
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df

# Save results to Excel
results_df.to_excel("/home/fantoni/patent-sentence-classification/results/mix_disambiguation.xlsx", index=False)

Processing Sentences:  48%|████▊     | 24/50 [02:01<01:41,  3.92s/it]

Error processing sentence 1394724: Output not in list format.


Processing Sentences: 100%|██████████| 50/50 [04:02<00:00,  4.86s/it]


# Claim Summarization

In [61]:
# Select patent
#patent_id = 'US8695121B2'; IPC = 'A42B3' 
#patent_id = 'US11133720B2'; IPC = 'H02K3' 
#patent_id = 'US9468782B2'; IPC = 'A62B23' 
#patent_id = 'US11673469B2'; IPC = 'B60K37'
patent_id = 'US20200074811A1'; IPC = 'G07F17'

# Import Sentences
df = pd.read_excel(f"/home/fantoni/patent-sentence-classification/data/patents/{patent_id}_{IPC}.xlsx")
input_text = df[df['section'] == 'first_claim']['sent'].iloc[0] # get first claim
print(input_text)

1. A method of administering a wagering game, comprising:
accepting an ante wager from a player by receiving a chip on a surface of a table;
dealing a partial hand to the player from a set of randomly ordered cards and permitting the player to view the partial hand;
after permitting the player to view the at least one card and while prohibiting the player from folding, accepting from the player an initial election to check after offering the player initial options selected from the group consisting of check or place a play wager of a first value;
dealing at least one other card from the set available to the player to form a complete hand and permitting the player to view the at least one other card;
after permitting the player to view the at least one other card, accepting from the player a subsequent election to place a play wager of a second, lesser value by receiving another chip on the surface of the table after offering the player subsequent options selected from the group consist

## Sentence Splitting

reference: https://community.openai.com/t/different-results-same-prompt-on-openai-api-vs-chatgpt/1062995


In [62]:
# 1. Chunking approach: divide first claim into chunks using simple regex
chunks = [sent.replace('\n', ' ') + '.' for sent in re.split(r'(?:;\sand\n)', input_text)]
df = pd.DataFrame({'text_id': range(1, len(chunks)+1), 'text': chunks})

# 2. As-Is approach: use first claim as is
df = pd.DataFrame({'text_id': [1], 'text': input_text})
df

Unnamed: 0,text_id,text
0,1,"1. A method of administering a wagering game, ..."


In [63]:
# Initialize ROUGE scorer
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

chatgpt_model ='gpt-4o'
#chatgpt_model ='gpt-3.5-turbo'

temperature = 0
top_p = 1

results = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Sentences"):  
    start_time = time.time()              
    try:
        text = row['text'] 
        output_string, formatted_prompt, input_count, output_count = split_sentence_chatgpt(text, splitting_prompt, chatgpt_model, temperature, top_p)

        if not output_string:  
            raise ValueError(f"Output is empty.")
        try:
            output_string = ast.literal_eval(output_string)  
            print(f"Output is in list format.") 
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Output not in list format: {e}")

        for generated_sent in output_string:
            pre_class, probs = classify_text(model, generated_sent, device)
            score = rouge(generated_sent, text)
            
            results.append({
                'text_id': row['text_id'],
                'text': text,
                'prompt': formatted_prompt,
                'generated_sent': generated_sent,
                'pred_sent_class': pre_class,
                'probs': probs,
                'rouge1_precision': round(score['rouge1_precision'].item(), 3),
                'rouge3_precision': round(score['rouge3_precision'].item(), 3),
                'rouge5_precision': round(score['rouge5_precision'].item(), 3),
                'rouge7_precision': round(score['rouge7_precision'].item(), 3),
                'rouge9_precision': round(score['rouge9_precision'].item(), 3),
                'rougeL_precision': round(score['rougeL_precision'].item(), 3),
                'output_string': output_string,
                'input_count': input_count,
                'output_count': output_count,
                'errors': None,
                'elapsed_time_sec': time.time() - start_time
            })

    except Exception as e:
        print(f"Error processing sentence {row['text_id']}: {str(e)}")
        results.append({
            'text_id': row['text_id'],
            'text': text,
            'prompt': formatted_prompt,
            'generated_sent': None,
            'pred_sent_class': None,
            'probs': None,
            'rouge1_precision': None,
            'rouge3_precision': None,
            'rouge5_precision': None,
            'rouge7_precision': None,
            'rouge9_precision': None,
            'rougeL_precision': None,
            'output_string': output_string,
            'input_count': None,
            'output_count': None,
            'errors': str(e),
            'elapsed_time_sec': time.time() - start_time
        })

results_df = pd.DataFrame(results)
results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/first_claim_{patent_id}_{IPC}_{chatgpt_model}_temp_{temperature}_top_{top_p}_asis.xlsx", index=False)

Processing Sentences:   0%|          | 0/1 [00:00<?, ?it/s]

	Using: model = 'gpt-4o'; temperature = 0; top_p = 1
Output is in list format.


Processing Sentences: 100%|██████████| 1/1 [00:21<00:00, 21.59s/it]


## Generate Prompts for Summarization

In [64]:
# Prompt Definition
task = """Your task is to generate a summary of a given text. Maintain the original words without any changes.\n""" # summarize
#task = """Your task is to rephrase a text. Maintain the original words without any changes.\n""" # rephrase

output_format = "Input: \"{text}\"\nOutput:"
summary_prompt = '\n'.join([task, output_format])
print(summary_prompt)

Your task is to generate a summary of a given text. Maintain the original words without any changes.

Input: "{text}"
Output:


In [67]:
# Import Sentences
results_df = pd.read_excel('/home/fantoni/patent-sentence-classification/results/first_claim_US20200074811A1_G07F17_gpt-4o_temp_0_top_1_asis.xlsx')

# Cluster Sentneces based on Predicted Class
strings_FUN = []
strings_STR = [] 
strings_OTH = []

for idx, row in results_df.iterrows():
    if row['pred_sent_class'] == 'FUN' or row['pred_sent_class'] == 'MIX':
        string = row['generated_sent']
        strings_FUN.append(string)
    elif row['pred_sent_class'] == 'STR' or row['pred_sent_class'] == 'MIX':
        string = row['generated_sent']
        strings_STR.append(string)
    elif row['pred_sent_class'] == 'OTH' :
        string = row['generated_sent']
        strings_OTH.append(string)
    
strings_FUN = '\n'.join(strings_FUN); print(strings_FUN, '\n-------------------------')
strings_STR = '\n'.join(strings_STR); print(strings_STR, '\n-------------------------')
strings_OTH = '\n'.join(strings_OTH); print(strings_OTH, '\n-------------------------')

A method of administering a wagering game, comprising:
accepting an ante wager from a player by receiving a chip on a surface of a table;
dealing a partial hand to the player from a set of randomly ordered cards and permitting the player to view the partial hand;
after permitting the player to view the at least one card and while prohibiting the player from folding, accepting from the player an initial election to check after offering the player initial options selected from the group consisting of check or place a play wager of a first value;
dealing at least one other card from the set available to the player to form a complete hand and permitting the player to view the at least one other card;
after permitting the player to view the at least one other card, accepting from the player a subsequent election to place a play wager of a second, lesser value by receiving another chip on the surface of the table after offering the player subsequent options selected from the group consisting

## Summarize with ChatGPT

In [68]:
def count_tokens(text):
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def summarize_with_chatgpt(input_text, prompt, model="gpt-4o", temperature=0, top_p=1):
    
    # Define a prompt template for classification
    prompt_template = PromptTemplate.from_template(prompt)

    # Create an OpenAI LLM instance
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        top_p=top_p,
        max_retries=1,
        max_tokens=1000 
    )

    # Create a runnable sequence
    split_sentence_chain = prompt_template | llm | StrOutputParser()

    # Format prompt
    formatted_prompt = prompt_template.format(text=input_text)
    #print(f"Generated Prompt:\n{formatted_prompt}") # Debugging statement

    # Perform Classification
    output_string = split_sentence_chain.invoke({"text": input_text}).strip()

    # Calculate token count
    input_count = count_tokens(formatted_prompt)
    output_count = count_tokens(output_string)

    print(f"\tUsing: model = '{model}'; temperature = {temperature}; top_p = {top_p}")

    return output_string, formatted_prompt, input_count, output_count

In [69]:
chatgpt_model ='gpt-4o'
#chatgpt_model ='gpt-3.5-turbo'

temperature = 0
top_p = 1

# Peform Summarization
FUN_summary, formatted_prompt, input_count, output_count = summarize_with_chatgpt(strings_FUN, summary_prompt, chatgpt_model)
STR_summary, formatted_prompt, input_count, output_count = summarize_with_chatgpt(strings_STR, summary_prompt, chatgpt_model)

	Using: model = 'gpt-4o'; temperature = 0; top_p = 1
	Using: model = 'gpt-4o'; temperature = 0; top_p = 1


In [70]:
# Print Summary
print('FUN:', FUN_summary, '\n-------------------------')
print('STR:', STR_summary, '\n-------------------------')

FUN: A method of administering a wagering game involves accepting an ante wager, dealing a partial hand, allowing the player to view it, and offering options to check or place a play wager. After dealing additional cards to form a complete hand, the player can place a lesser play wager or fold. The game concludes by resolving the wagers. 
-------------------------
STR: The input text is empty, so there is nothing to summarize. 
-------------------------


## Evaluate Summarization

In [71]:
# Evaluate Summary: rouge and number of words
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

summary_data = []
for summary in [FUN_summary, STR_summary]:
    claim = input_text.replace('\n', ' ')
    token_summary = count_tokens(summary)
    token_claim = count_tokens(claim)
    score = rouge(summary, claim)
    summary_data.append({
        'first_claim': claim,
        'summary': summary,
        'token_claim': token_claim,
        'token_summary': token_summary,
        'token_summary_%': round((token_summary*100/token_claim), 2),
        'rouge1_precision': round(score['rouge1_precision'].item(), 3),
        'rouge3_precision': round(score['rouge3_precision'].item(), 3),
        'rouge5_precision': round(score['rouge5_precision'].item(), 3),
        'rouge7_precision': round(score['rouge7_precision'].item(), 3),
        'rouge9_precision': round(score['rouge9_precision'].item(), 3),
        'rougeL_precision': round(score['rougeL_precision'].item(), 3),
    })

summary_df = pd.DataFrame(summary_data)
summary_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/summary_{patent_id}_{IPC}_{chatgpt_model}_temp_{temperature}_top_{top_p}_asis.xlsx", index=False)