# Installations and Imports

In [3]:
import torch
import os
import json
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import tiktoken
from dotenv import load_dotenv
import time
import ast
import re
import warnings
warnings.filterwarnings('ignore')

# LangChain Import
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Finetuned Model Import
from transformers import BertTokenizer, BertForSequenceClassification
from src.model import PatentSentenceClassifier

# Load OpenAI API key
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Utils

In [16]:
def count_tokens(text, model=None):
    """Return the number of tokens `text` occupies under a tiktoken encoding.

    Args:
        text: String to tokenize.
        model: Optional OpenAI model name. When given, the encoding that
            tiktoken registers for that model is used. When omitted, or when
            the installed tiktoken does not recognise the name, the
            "cl100k_base" encoding is used.

    Returns:
        int: Length of the encoded token sequence.
    """
    encoding = None
    if model is not None:
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # Model name unknown to the installed tiktoken: use the fallback below
            encoding = None
    if encoding is None:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def prompt_chatgpt(input_text, prompt, model="gpt-4o", temperature=0, top_p=1):
    """Fill `prompt` with `input_text`, send it to an OpenAI chat model and return the reply.

    Args:
        input_text: Text substituted for the `{text}` placeholder of `prompt`.
        prompt: Prompt template string; it is formatted with a single
            variable named `text`.
        model: OpenAI chat model name passed to `ChatOpenAI`.
        temperature: Sampling temperature passed to `ChatOpenAI`.
        top_p: Nucleus-sampling value passed to `ChatOpenAI`.

    Returns:
        tuple: `(output_string, formatted_prompt, input_count, output_count)`
        where `output_string` is the stripped model reply, `formatted_prompt`
        is the template with `input_text` filled in, and the two counts are
        token counts of the formatted prompt and of the reply. The counts are
        computed locally with tiktoken on the text only; they are not the
        usage figures reported by the API.
    """
    # Build the prompt template from the raw prompt string
    prompt_template = PromptTemplate.from_template(prompt)

    # Create an OpenAI LLM instance (single retry, reply capped at 1000 tokens)
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        top_p=top_p,
        max_retries=1,
        max_tokens=1000
    )

    # Create a runnable sequence: template -> chat model -> plain string
    chain = prompt_template | llm | StrOutputParser()

    # Format the prompt separately so it can be returned and token-counted
    formatted_prompt = prompt_template.format(text=input_text)

    # Query the model
    output_string = chain.invoke({"text": input_text}).strip()

    # Count tokens with the encoding that belongs to the model actually used
    input_count = count_tokens(formatted_prompt, model)
    output_count = count_tokens(output_string, model)

    return output_string, formatted_prompt, input_count, output_count


def classify_text(model, input_text, device='cpu'):
    """Classify `input_text` with `model` and return the label and class probabilities.

    Args:
        model: Callable classifier exposing a `tokenizer` attribute; calling it
            with the tokenized inputs must return an object with `.logits`.
        input_text: Text to classify.
        device: Device the tokenized tensors are moved to before inference.

    Returns:
        tuple: `(pred_class, probs)` where `pred_class` is one of
        'FUN', 'STR', 'MIX', 'OTH' and `probs` is the list of softmax
        probabilities of the first item, each rounded to two decimals.
    """
    # Index -> label lookup for the four output classes
    labels = {0: 'FUN', 1: 'STR', 2: 'MIX', 3: 'OTH'}

    # Tokenize with the tokenizer attached to the model
    encoded = model.tokenizer(
        input_text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )

    # Send every tensor to the requested device
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    # Inference without gradient tracking; only the first item is scored
    with torch.no_grad():
        logits = model(**batch).logits
        class_probs = torch.softmax(logits, dim=1)[0]
        best_idx = class_probs.argmax().item()
        best_label = labels[best_idx]

    rounded_probs = [round(value, 2) for value in class_probs.tolist()]
    return best_label, rounded_probs

# Prompt Definition

In [12]:
# Prompt Definition
# Assembled from explicit line fragments so the trailing spaces that belong to
# the prompt text stay visible. `{text}` is the placeholder for the input sentence.
splitting_prompt = "\n".join([
    "Your task is to divide a given sentence into sub-sentences.",
    "Insert periods to divide the sentence into meaningful sub-sentences. ",
    "Maintain the original words without any changes. ",
    "Do not use pronouns; instead, repeat the original subjects as needed.",
    "",
    "Input Format: A single sentence.",
    'Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).',
    "",
    'Input: "{text}"',
    "Output:",
])

print(splitting_prompt)

Your task is to divide a given sentence into sub-sentences.
Insert periods to divide the sentence into meaningful sub-sentences. 
Maintain the original words without any changes. 
Do not use pronouns; instead, repeat the original subjects as needed.

Input Format: A single sentence.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Input: "{text}"
Output:


# Sentence Splitting

In [None]:
# Import Sentences
# Read the annotated sentences and keep the rows with agreement == True
# whose sent_tag_mc is 'MIX'
test_df = (
    pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
    .query("agreement==True & sent_tag_mc == 'MIX'")
)

# Pick a single sentence by position as the splitting example
input_text = test_df['sent'].iloc[3]

In [14]:
# Select Model
chatgpt_model = 'gpt-3.5-turbo'

# Perform Sentence Splitting
output_string, formatted_prompt, input_count, output_count = prompt_chatgpt(input_text, splitting_prompt, chatgpt_model)

# Validate Output Format
if not output_string:
    raise ValueError("Output is empty.")
try:
    # Newlines are removed first so a list printed over several lines is
    # parsed as one literal
    parsed_output = ast.literal_eval(output_string.replace('\n', ''))
except (SyntaxError, ValueError, TypeError) as e:
    raise ValueError(f"Output not in list format: {e}") from e

# literal_eval also accepts a bare string, tuple, number, ...; only a list of
# strings can be iterated sentence by sentence in the following cells
if not isinstance(parsed_output, list):
    raise ValueError(f"Output not in list format: parsed a {type(parsed_output).__name__}")
if not all(isinstance(item, str) for item in parsed_output):
    raise ValueError("Output not in list format: list items are not all strings")

# From here on `output_string` holds the list of sub-sentences
output_string = parsed_output
print("Output is in list format.")

Output is in list format.


In [15]:
# Initialize ROUGE scorer
from torchmetrics.text.rouge import ROUGEScore

# ROUGE variants to compute; each one yields a '<variant>_precision' entry
rouge_variants = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL')
rouge = ROUGEScore(rouge_keys=rouge_variants)

results = []

for generated_sent in output_string:
    # The generated sub-sentence is passed first, the original sentence second
    score = rouge(generated_sent, input_text)

    # Precision of every variant, rounded to three decimals
    precision_cols = {
        f'{variant}_precision': round(score[f'{variant}_precision'].item(), 3)
        for variant in rouge_variants
    }

    # One record per generated sub-sentence; key order defines the column order
    record = {
        'input_text': input_text,
        'prompt': formatted_prompt,
        'generated_sent': generated_sent,
    }
    record.update(precision_cols)
    record.update({
        'output_string': output_string,
        'input_count': input_count,
        'output_count': output_count,
    })
    results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,input_text,prompt,generated_sent,rouge1_precision,rouge3_precision,rouge5_precision,rouge7_precision,rouge9_precision,rougeL_precision,output_string,input_count,output_count
0,"As shown in figures 2-5, the treatment assembl...",Your task is to divide a given sentence into s...,As shown in figures 2-5,1.0,1.0,1.0,0.0,0.0,1.0,"[As shown in figures 2-5, the treatment assemb...",143,41
1,"As shown in figures 2-5, the treatment assembl...",Your task is to divide a given sentence into s...,the treatment assembly 20 of the medical devic...,1.0,1.0,1.0,1.0,1.0,1.0,"[As shown in figures 2-5, the treatment assemb...",143,41
2,"As shown in figures 2-5, the treatment assembl...",Your task is to divide a given sentence into s...,that supports the electrode array 28 thereon.,1.0,1.0,1.0,1.0,0.0,1.0,"[As shown in figures 2-5, the treatment assemb...",143,41


# Sentence Splitting + Sentence Classification

In [18]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set path to checkpoint
# Each fine-tuned checkpoint is paired with the base model name it is loaded on
# top of; change `checkpoint_name` to switch between them.
base_model_by_checkpoint = {
    'bert-large-uncased_train_10_4': "bert-large-uncased",
    'bert-for-patents_train_10_4': "anferico/bert-for-patents",
}
checkpoint_name = 'bert-for-patents_train_10_4'
model_name = base_model_by_checkpoint[checkpoint_name]
checkpoint_path = f"/home/fantoni/patent-sentence-classification/models/finetuning/{checkpoint_name}.ckpt"

# Load Base Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
print('\nBase Tokenizer loaded succesfully.')

# Load Base Model (four output labels)
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)
print('\nBase model loaded succesfully.')

# Load Finetuned Model: the checkpoint is restored around the base model and tokenizer
load_kwargs = {'model': base_model, 'tokenizer': bert_tokenizer}
model = PatentSentenceClassifier.load_from_checkpoint(checkpoint_path, **load_kwargs)

# Switch to evaluation mode and move the model to the selected device
model.eval()
model.to(device)
print(f"\nFinetuned model loaded succesfully. Using: '{checkpoint_name}'")

# Define Finetuned Tokenizer
tokenizer = model.tokenizer

Using device: cpu

Base Tokenizer loaded succesfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at anferico/bert-for-patents and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Base model loaded succesfully.

Finetuned model loaded succesfully. Using: 'bert-for-patents_train_10_4'


In [None]:
# Initialize ROUGE scorer
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL')
rouge = ROUGEScore(rouge_keys=rouge_keys)

# Result columns holding the precision of each ROUGE variant
precision_columns = [f'{key}_precision' for key in rouge_keys]

# Import Sentences
test_df = pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
test_df = test_df.query("agreement==True & sent_tag_mc == 'MIX'")
test_df = test_df.head(1) # take a subset for testing

# Select Models
chatgpt_model = 'gpt-3.5-turbo'

# Initialize results list
results = []

for i, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Sentences"):
    start_time = time.time()

    # Reset the per-sentence state before the try block so the error handler
    # below never reads values left over from a previous sentence or from an
    # earlier cell (and never hits an unbound name on a fresh kernel).
    original_sent = row['sent']
    formatted_prompt = None
    output_string = None

    try:
        # Split Sentence
        output_string, formatted_prompt, input_count, output_count = prompt_chatgpt(original_sent, splitting_prompt, chatgpt_model)

        # Validate Output Format
        if not output_string:
            raise ValueError("Output is empty.")
        try:
            # Convert string to list; newlines are removed first so a list
            # printed over several lines is parsed as one literal
            parsed_output = ast.literal_eval(output_string.replace('\n', ''))
        except (SyntaxError, ValueError, TypeError) as e:
            raise ValueError(f"Output not in list format: {e}") from e

        # literal_eval also accepts a bare string, tuple, number, ...; iterating
        # a string would classify single characters, so require a list of strings
        if not isinstance(parsed_output, list):
            raise ValueError(f"Output not in list format: parsed a {type(parsed_output).__name__}")
        if not all(isinstance(item, str) for item in parsed_output):
            raise ValueError("Output not in list format: list items are not all strings")
        output_string = parsed_output

        # Iterate over the generated sentences
        for generated_sent in output_string:

            # Classify Sub-Sentences
            pre_class, probs = classify_text(model, generated_sent, device)

            # Compute ROUGE Score (generated sub-sentence first, original sentence second)
            score = rouge(generated_sent, original_sent)

            # Append Results; key order defines the column order of the DataFrame
            result = {
                'sent_id': row['sent_id'],
                'original_sent_class': row['sent_tag_mc'],
                'original_sent': original_sent,
                'prompt': formatted_prompt,
                'generated_sent': generated_sent,
                'pred_sent_class': pre_class,
                'probs': probs,
            }
            result.update({col: round(score[col].item(), 3) for col in precision_columns})
            result.update({
                'output_string': output_string,
                'input_count': input_count,
                'output_count': output_count,
                'errors': None,
                # Measured from the start of this original sentence, so later
                # sub-sentences of the same sentence show larger values
                'elapsed_time_sec': time.time() - start_time
            })
            results.append(result)

    except Exception as e:
        print(f"Error processing sentence {row['sent_id']}: {str(e)}")

        # One error row per failed sentence. `prompt` and `output_string` are
        # None when the model call itself failed; `output_string` is the raw
        # reply when only the parsing/validation failed.
        error_result = {
            'sent_id': row['sent_id'],
            'original_sent_class': row['sent_tag_mc'],
            'original_sent': original_sent,
            'prompt': formatted_prompt,
            'generated_sent': None,
            'pred_sent_class': None,
            'probs': None,
        }
        error_result.update({col: None for col in precision_columns})
        error_result.update({
            'output_string': output_string,
            'input_count': None,
            'output_count': None,
            'errors': str(e),
            'elapsed_time_sec': time.time() - start_time
        })
        results.append(error_result)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to Excel
results_df.to_excel("/home/fantoni/patent-sentence-classification/results/mix_disambiguation/mix_disambiguation_new.xlsx", index=False)

# Display the results (only the last expression of a cell is rendered)
results_df

Processing Sentences: 100%|██████████| 1/1 [00:08<00:00,  8.49s/it]


Unnamed: 0,sent_id,original_sent_class,original_sent,prompt,generated_sent,pred_sent_class,probs,rouge1_precision,rouge3_precision,rouge5_precision,rouge7_precision,rouge9_precision,rougeL_precision,output_string,input_count,output_count,errors,elapsed_time_sec
0,174364,MIX,"A rotatable element, for moving the coupling e...",Your task is to divide a given sentence into s...,A rotatable element is coupled to the coupling...,STR,"[0.01, 0.97, 0.01, 0.0]",1.0,1.0,0.8,0.667,0.0,1.0,[A rotatable element is coupled to the couplin...,149,48,,5.048832
1,174364,MIX,"A rotatable element, for moving the coupling e...",Your task is to divide a given sentence into s...,The rotatable element is for moving the coupli...,FUN,"[0.77, 0.03, 0.2, 0.0]",1.0,0.8,0.692,0.636,0.556,0.882,[A rotatable element is coupled to the couplin...,149,48,,6.003935
2,174364,MIX,"A rotatable element, for moving the coupling e...",Your task is to divide a given sentence into s...,The boreholes and the respective rods prevent ...,FUN,"[0.77, 0.12, 0.1, 0.01]",1.0,1.0,1.0,1.0,1.0,1.0,[A rotatable element is coupled to the couplin...,149,48,,8.49202
