# Installations and Imports

In [98]:
import torch
import os
import json
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import tiktoken
from dotenv import load_dotenv
import time
import ast
import warnings
warnings.filterwarnings('ignore')

# LangChain Import
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Finetuned Model Import
from transformers import BertTokenizer, BertForSequenceClassification
from src.model import PatentSentenceClassifier

# Load OpenAI API key
# load_dotenv() reads a local .env file (if one exists) into the process environment.
# NOTE(review): OPENAI_API_KEY is not referenced again in the cells shown here;
# presumably the OpenAI client picks the key up from the environment itself — confirm.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # None when the variable is not set

# LLM Sentence Splitting

In [99]:
# Prompt Definition
# Instruction part of the prompt. It ends with a newline so that, once it is
# combined with the input/output scaffold below, a blank line separates the two.
task = (
    "Divide the given sentence into sub-sentences by inserting periods. "
    "Maintain the original words without any changes. Do not use pronouns. "
    "Repeat the original subjects as needed.\n"
    "Input Format: A single sentence.\n"
    'Output Format: A list of sub-sentences, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).\n'
)

# Scaffold holding the `{text}` placeholder that is filled in per sentence.
output_format = "Input: '{text}' \nOutput:"

# Final template: instructions, one newline, then the scaffold.
prompt = f"{task}\n{output_format}"
print(prompt)

Divide the given sentence into sub-sentences by inserting periods. Maintain the original words without any changes. Do not use pronouns. Repeat the original subjects as needed.
Input Format: A single sentence.
Output Format: A list of sub-sentences, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Input: '{text}' 
Output:


In [100]:
def count_tokens(text, model=None):
    """Return the number of tiktoken tokens in ``text``.

    Args:
        text: String to tokenize.
        model: Optional OpenAI model name. When given, the encoding that
            tiktoken registers for that model is used. When omitted, or when
            the installed tiktoken does not know the model, the count falls
            back to the ``cl100k_base`` encoding.

    Returns:
        int: Length of the encoded token sequence.
    """
    encoding = None
    if model is not None:
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # tiktoken has no mapping for this model name; use the fallback below
            encoding = None
    if encoding is None:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def split_sentence_chatgpt(input_text, prompt, model="gpt-4o"):
    """Ask an OpenAI chat model to split one sentence into sub-sentences.

    Args:
        input_text: Sentence substituted for the ``{text}`` placeholder.
        prompt: Template string containing a ``{text}`` placeholder.
        model: Name of the OpenAI chat model to query.

    Returns:
        tuple: ``(output_string, input_count, output_count)`` where
        ``output_string`` is the model reply with surrounding whitespace
        stripped, ``input_count`` is the token count of the formatted prompt
        and ``output_count`` is the token count of the reply. Both counts are
        computed locally with tiktoken on the plain text, using the encoding
        that matches ``model`` when tiktoken knows it.
    """
    # Template for the sentence-splitting request
    prompt_template = PromptTemplate.from_template(prompt)

    # temperature=0 keeps the split as repeatable as the API allows
    llm = ChatOpenAI(
        model=model,
        temperature=0,
        max_retries=2,
        max_tokens=1000
    )

    # prompt -> chat model -> plain string
    split_sentence_chain = prompt_template | llm | StrOutputParser()

    # Perform the sentence splitting
    output_string = split_sentence_chain.invoke({"text": input_text}).strip()

    # The formatted prompt is rebuilt here only so its tokens can be counted
    formatted_prompt = prompt_template.format(text=input_text)
    input_count = count_tokens(formatted_prompt, model)
    output_count = count_tokens(output_string, model)

    return output_string, input_count, output_count

# Test Sentence Splitting

In [101]:
# Import Sentences
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# directory exists; consider a configurable data directory.
test_df = pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
# Keep only rows whose `agreement` column is True and whose `sent_tag_mc` tag is 'MIX'
# (presumably sentences the annotators agreed are mixed — confirm against the dataset notes).
test_df = test_df.query("agreement==True & sent_tag_mc == 'MIX'")

# Select a Sentence by Index
# `idx` is a position within the filtered frame (iloc), not an original row label.
idx = 3
input_text = test_df['sent'].iloc[idx]

# Select Model
# Overrides the "gpt-4o" default of split_sentence_chatgpt when passed to it.
chatgpt_model ='gpt-3.5-turbo'

In [102]:
# Perform Sentence Splitting
output_string, input_count, output_count = split_sentence_chatgpt(input_text, prompt, chatgpt_model)

# Validate Output Format
if not output_string:
    raise ValueError("Output is empty.")
try:
    # literal_eval only evaluates Python literals, so the reply is never executed as code.
    # From here on `output_string` holds the parsed value rather than the raw reply.
    output_string = ast.literal_eval(output_string)
except (SyntaxError, ValueError, TypeError) as exc:
    raise ValueError("Output not in list format.") from exc

# A reply such as '"a", "b"' (tuple) or '"a"' (string) is a valid literal but not
# a list; accepting it would make later cells iterate over the wrong items.
if not isinstance(output_string, list) or not all(isinstance(item, str) for item in output_string):
    raise ValueError("Output not in list format.")
print("Output is in list format.")

Output is in list format.


In [89]:
# Initialize ROUGE scorer
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore()

# Pair the original sentence (reference) with every generated sub-sentence
ref_generated_sents = [(input_text, sub_sentence) for sub_sentence in output_string]

# Score each pair; the generated sub-sentence is passed first and the original
# sentence second, and only the three precision values are kept.
data = []
for ref, gen in ref_generated_sents:
    score = rouge(gen, ref)
    rouge1_precision, rouge2_precision, rougeL_precision = (
        score[metric].item()
        for metric in ("rouge1_precision", "rouge2_precision", "rougeL_precision")
    )
    data.append((ref, gen, rouge1_precision, rouge2_precision, rougeL_precision))

# One row per (reference, generated) pair
df = pd.DataFrame(
    data,
    columns=["reference", "generated", "ROUGE-1 Precision", "ROUGE-2 Precision", "ROUGE-L Precision"],
)
df

Unnamed: 0,reference,generated,ROUGE-1 Precision,ROUGE-2 Precision,ROUGE-L Precision
0,"As shown in figures 2-5, the treatment assembl...",As shown in figures 2-5.,1.0,1.0,1.0
1,"As shown in figures 2-5, the treatment assembl...",The treatment assembly 20 of the medical devic...,1.0,1.0,1.0
2,"As shown in figures 2-5, the treatment assembl...",The carrier assembly 36 supports the electrode...,1.0,0.777778,1.0


# Load Sentence Classification Model

In [80]:
# Set device
# Use the GPU when torch reports one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set path to checkpoint
# NOTE(review): the first assignment is immediately overwritten by the second, so
# only the bert-for-patents checkpoint is ever loaded; the pair of lines acts as a
# manual toggle between the two checkpoints.
checkpoint_name = 'bert-large-uncased_train_10_7'; model_name = "bert-large-uncased"
checkpoint_name = 'bert-for-patents_train_10_7'; model_name = "anferico/bert-for-patents" # https://huggingface.co/anferico/bert-for-patents
# NOTE(review): absolute, machine-specific path.
checkpoint_path = f"/home/fantoni/patent-sentence-classification/models/finetuning/{checkpoint_name}.ckpt"

# Load Base Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
print('\nBase Tokenizer loaded succesfully.')

# Load Base Model
# num_labels=4 matches the four classes (FUN, STR, MIX, OTH) used by classify_text.
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)
print('\nBase model loaded succesfully.')

# Load Finetuned Model
# The base model and tokenizer are handed to the project wrapper as keyword arguments.
# Presumably the checkpoint then supplies the fine-tuned weights, including the
# classifier head that the base load reports as newly initialized — TODO confirm.
model = PatentSentenceClassifier.load_from_checkpoint(
    checkpoint_path,
    model=base_model,
    tokenizer=bert_tokenizer)

# Switch to evaluation mode and move the model to the selected device
model.eval()
model.to(device)
print('\nFinetuned model loaded succesfully.')

# Define Finetuned Tokenizer
# Convenience alias; classify_text reads model.tokenizer itself.
tokenizer = model.tokenizer

Using device: cpu

Base Tokenizer loaded succesfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at anferico/bert-for-patents and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Base model loaded succesfully.

Finetuned model loaded succesfully.


# Classify Sentences

In [103]:
def classify_text(model, input_text, device='cpu'):
    """Predict the sentence class of ``input_text`` with a fine-tuned model.

    Args:
        model: Callable model exposing a ``tokenizer`` attribute; calling it
            with the tokenized inputs must return an object with ``logits``.
        input_text: Text to classify.
        device: Device the tokenized tensors are moved to before inference.

    Returns:
        tuple: ``(label, probabilities)`` where ``label`` is one of
        ``'FUN'``, ``'STR'``, ``'MIX'``, ``'OTH'`` and ``probabilities`` is the
        list of softmax scores for the first item of the batch, rounded to two
        decimals and ordered like the labels.
    """
    # Class names indexed by the model's output position
    labels = ('FUN', 'STR', 'MIX', 'OTH')

    # Tokenize with the tokenizer carried by the model, then move every tensor to `device`
    encoded = model.tokenizer(input_text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference without gradient tracking; only the first row of the batch is used
    with torch.no_grad():
        logits = model(**batch).logits
        first_probs = torch.softmax(logits, dim=1)[0]

    best = int(first_probs.argmax())
    rounded = [round(p, 2) for p in first_probs.tolist()]
    return labels[best], rounded

In [None]:
# Initialize ROUGE scorer
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore()

# Import Sentences
# NOTE(review): absolute, machine-specific path.
test_df = pd.read_excel("/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx")
test_df = test_df.query("agreement==True & sent_tag_mc == 'MIX'")
test_df = test_df.head(2)  # small sample to keep the run short

# Select Models
chatgpt_model ='gpt-3.5-turbo'

# Single schema shared by success rows and error rows, so both kinds of record
# land in the same columns of the final DataFrame.
RESULT_COLUMNS = [
    'sent_id', 'original_sent', 'original_sent_class', 'generated_sent',
    'rouge1_precision', 'rouge2_precision', 'rougeL_precision',
    'pred_sent_class', 'probs', 'input_count', 'output_count',
    'errors', 'elapsed_time_sec',
]


def _parse_sub_sentences(raw_output):
    """Turn the raw LLM reply into a non-empty list of sub-sentence strings.

    Raises:
        ValueError: If the reply is empty, is not a Python literal, or does
            not evaluate to a list of strings.
    """
    if not raw_output:
        raise ValueError("Output is empty.")
    try:
        # literal_eval only evaluates literals; the reply is never executed as code
        parsed = ast.literal_eval(raw_output)
    except (SyntaxError, ValueError, TypeError) as exc:
        raise ValueError("Output not in list format.") from exc
    if not isinstance(parsed, list) or not all(isinstance(item, str) for item in parsed):
        raise ValueError("Output not in list format.")
    if not parsed:
        # An empty list would otherwise drop the sentence from the results without a trace
        raise ValueError("Output is empty.")
    return parsed


def _new_record(row, **fields):
    """Build one result row: all columns default to None, then `fields` are applied."""
    unknown = set(fields) - set(RESULT_COLUMNS)
    if unknown:
        raise KeyError(f"Unknown result columns: {sorted(unknown)}")
    record = dict.fromkeys(RESULT_COLUMNS)
    record.update(
        sent_id=row['sent_id'],
        original_sent=row['sent'],
        original_sent_class=row['sent_tag_mc'],
    )
    record.update(fields)
    return record


# Initialize results list
results = []

for _idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Sentences"):
    start_time = time.perf_counter()
    try:
        original_sent = row['sent']

        # Split Sentence
        raw_output, input_count, output_count = split_sentence_chatgpt(original_sent, prompt, chatgpt_model)

        # Validate Output Format and convert it to a list of strings
        sub_sentences = _parse_sub_sentences(raw_output)

        # Iterate over the generated sentences
        for generated_sent in sub_sentences:

            # Classify Sub-Sentences
            pred_class, probs = classify_text(model, generated_sent, device)

            # Compute ROUGE Score (generated sub-sentence against the original sentence)
            score = rouge(generated_sent, original_sent)

            # Append Results
            # elapsed_time_sec is measured from the start of the sentence, so it
            # grows across the sub-sentences of the same original sentence.
            results.append(_new_record(
                row,
                generated_sent=generated_sent,
                rouge1_precision=score['rouge1_precision'].item(),
                rouge2_precision=score['rouge2_precision'].item(),
                rougeL_precision=score['rougeL_precision'].item(),
                pred_sent_class=pred_class,
                probs=probs,
                input_count=input_count,
                output_count=output_count,
                elapsed_time_sec=time.perf_counter() - start_time,
            ))

    except Exception as e:
        # Best-effort batch run: record the failure and continue with the next sentence.
        # Rows already appended for earlier sub-sentences of this sentence are kept.
        print(f"Error processing sentence {row['sent_id']}: {str(e)}")
        results.append(_new_record(
            row,
            errors=str(e),
            elapsed_time_sec=time.perf_counter() - start_time,
        ))

# Convert results to DataFrame (explicit columns keep the schema even when there are no rows)
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
results_df

# Save results to Excel
#results_df.to_excel("rouge_results.xlsx", index=False)

Processing Sentences:   0%|          | 0/2 [00:00<?, ?it/s]

['A rotatable element is coupled to the coupling element.', 'A rotatable element is for moving the coupling element along a rotation axis of the rotatable element.', 'The boreholes and the respective rods prevent rotation of the coupling element.']
Output is in list format.
STR
FUN


Processing Sentences:  50%|█████     | 1/2 [00:08<00:08,  8.15s/it]

FUN
['In the case of seed which has been treated at different points in time with an active compound combination according to the invention.', 'The individual substances may be present on the seed in different layers.']
Output is in list format.
STR


Processing Sentences: 100%|██████████| 2/2 [00:12<00:00,  6.19s/it]

STR





Unnamed: 0,sent_id,original_sent,original_sent_class,generated_sent,rouge1_precision,rouge2_precision,rougeL_precision,pred_sent_class,probs,input_count,output_count,errors,elapsed_time_sec
0,174364,"A rotatable element, for moving the coupling e...",MIX,A rotatable element is coupled to the coupling...,1.0,1.0,1.0,STR,"[0.01, 0.96, 0.02, 0.0]",128,48,,4.935974
1,174364,"A rotatable element, for moving the coupling e...",MIX,A rotatable element is for moving the coupling...,1.0,0.9375,0.941176,FUN,"[0.75, 0.02, 0.22, 0.01]",128,48,,5.897938
2,174364,"A rotatable element, for moving the coupling e...",MIX,The boreholes and the respective rods prevent ...,1.0,1.0,1.0,FUN,"[0.74, 0.09, 0.15, 0.01]",128,48,,8.154483
3,11901,In the case of seed which has been treated at ...,MIX,In the case of seed which has been treated at ...,1.0,1.0,1.0,STR,"[0.35, 0.57, 0.04, 0.04]",122,39,,1.676124
4,11901,In the case of seed which has been treated at ...,MIX,The individual substances may be present on th...,1.0,1.0,1.0,STR,"[0.05, 0.89, 0.02, 0.05]",122,39,,4.219322
