# Installations and Imports

In [2]:
import torch
import os
import json
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import tiktoken
from dotenv import load_dotenv
import time
import ast
import re
import warnings
warnings.filterwarnings('ignore')

# LangChain Import
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Finetuned Model Import
from transformers import BertTokenizer, BertForSequenceClassification
from src.model import PatentSentenceClassifier

# Load OpenAI API key
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Utils

In [None]:
def count_tokens(text):
    """Return how many tokens `text` occupies under the `cl100k_base` tiktoken encoding."""
    token_ids = tiktoken.get_encoding("cl100k_base").encode(text)
    return len(token_ids)

def prompt_chatgpt(input_text, input_context, prompt, model="gpt-4o", temperature=0, top_p=1):
    """
    Fill a prompt template, send it to an OpenAI chat model and return the reply.

    Args:
        input_text (str): Value substituted for the `{input_text}` placeholder.
        input_context (str | None): Value substituted for the `{input_context}`
            placeholder. May be None or empty; if the template still declares
            `{input_context}`, an empty string is substituted instead of failing.
        prompt (str): Template string with `{input_text}` and, optionally,
            `{input_context}` placeholders.
        model (str): Name of the chat model to call.
        temperature (float): Sampling temperature passed to the model.
        top_p (float): Nucleus-sampling parameter passed to the model.

    Returns:
        tuple[str, str]: The stripped model output and the fully formatted prompt
        that was sent.
    """
    # Define a prompt template for classification
    prompt_template = PromptTemplate.from_template(prompt)

    # Create an OpenAI LLM instance
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        top_p=top_p,
        max_retries=1,
        max_tokens=1000,
    )

    # Create a runnable sequence
    chain = prompt_template | llm | StrOutputParser()

    # Prepare inputs
    inputs = {"input_text": input_text}
    if input_context:
        inputs["input_context"] = input_context
    elif "input_context" in prompt_template.input_variables:
        # The template expects a context but none was given (e.g. a root-level
        # sentence without parents): use an empty context rather than raising
        # a KeyError while formatting.
        inputs["input_context"] = ""

    # Format prompt (returned to the caller for inspection/logging)
    formatted_prompt = prompt_template.format(**inputs)

    # Invoke Chain
    output_string = chain.invoke(inputs).strip()

    return output_string, formatted_prompt


def classify_text(model, input_text, device='cpu'):
    """
    Classify a text with the given model and report the class probabilities.

    Args:
        model: Callable model exposing a `tokenizer` attribute; calling it with the
            tokenized inputs must return an object with a `logits` attribute.
        input_text (str): Text to classify.
        device: Device the tokenized inputs are moved to before inference.

    Returns:
        tuple[str, list[float]]: The predicted label ('FUN', 'STR', 'MIX' or 'OTH')
        and the softmax probabilities of the first item, rounded to two decimals.
    """
    # Class index -> label name
    label_names = dict(enumerate(('FUN', 'STR', 'MIX', 'OTH')))

    # Tokenize with the tokenizer carried by the model
    encoded = model.tokenizer(
        input_text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )

    # Put every input tensor on the requested device
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)[0]
        best_index = torch.argmax(probabilities).item()
        predicted_label = label_names[best_index]

    rounded = [round(value, 2) for value in probabilities.tolist()]
    return predicted_label, rounded


def build_tree_with_parents(text):
    """
    Build a flat list of lines with their ancestor chain from tab-indented text.

    The number of leading tab characters of a line determines its depth. A line
    becomes a child of the closest preceding line with a strictly smaller
    indentation, so jumps of more than one tab are tolerated.

    Args:
        text (str): A multiline string with tab-indented lines representing a hierarchy.

    Returns:
        List[Dict[str, Any]]: A list where each element is a dictionary with:
            - 'line': The content of the line, stripped of surrounding whitespace.
            - 'parents': A list of parent lines (outermost first) leading to that line.
    """
    tree = []
    stack = []  # Contents of the currently open ancestors
    indent_stack = []  # Raw indentation of each entry in `stack`

    # Do NOT strip the whole text first: that would remove the leading tabs of
    # the first line and make its siblings look like children. Blank lines are
    # skipped individually below.
    for line in text.split('\n'):
        raw_indent = len(line) - len(line.lstrip('\t'))
        content = line.strip()

        if not content:
            continue  # Skip empty lines

        # Close every open node that is at the same depth or deeper
        while indent_stack and raw_indent <= indent_stack[-1]:
            stack.pop()
            indent_stack.pop()

        # Whatever is still open is the ancestor chain of this line
        tree.append({
            'line': content,
            'parents': stack.copy()
        })

        # The current line becomes a potential parent of the following lines
        stack.append(content)
        indent_stack.append(raw_indent)

    return tree


def create_hierarchy(text):
    """
    Convert '>'-indented text into a DataFrame of hierarchically indexed sentences.

    Each line is expected to start with one '>' for the root level plus one extra
    '>' per nesting level. Lines without any leading '>' are treated as root-level
    lines and keep their full content.

    Arguments:
        text (str): A multiline string where each line begins with one or more '>' characters to indicate hierarchy.

    Returns:
        pd.DataFrame: A DataFrame with the following columns:
            - 'index': Hierarchical index (e.g., '1', '1.1', '1.1.1')
            - 'sentence': The textual content of the line
            - 'parent_indices': List of parent index strings
            - 'parents': List of parent content strings
        The columns are present even when `text` contains no usable line.
    """
    columns = ["index", "sentence", "parent_indices", "parents"]
    counters = []  # counters[i] is the running number at nesting level i
    index_sentence_dict = {}  # hierarchical index -> sentence, used to resolve parents
    rows = []

    for line in text.strip().splitlines():
        # Remove the single '>' that denotes the root level. Only do so when it is
        # actually there, otherwise the first character of the content would be lost.
        if line.startswith('>'):
            line = line[1:]

        # Determine level by counting the remaining leading '>' characters
        without_markers = line.lstrip('>')
        level = len(line) - len(without_markers)
        content = without_markers.strip()
        if not content:
            continue

        # Adjust counters for current level
        if len(counters) <= level:
            # Going deeper: open the missing levels, each starting at 1
            counters += [1] * (level + 1 - len(counters))
        else:
            # Same level or shallower: drop deeper levels and advance this one
            counters = counters[:level + 1]
            counters[level] += 1

        # Build Index
        index = ".".join(map(str, counters[:level + 1]))
        index_sentence_dict[index] = content

        # Parent indices are the prefixes of the current index; contents are
        # looked up only for parents that actually occurred in the text.
        parent_indices = [".".join(map(str, counters[:i])) for i in range(1, level + 1)]
        parent_contents = [index_sentence_dict[pidx] for pidx in parent_indices if pidx in index_sentence_dict]

        rows.append({
            "index": index,
            "sentence": content,
            "parent_indices": parent_indices,
            "parents": parent_contents
        })

    return pd.DataFrame(rows, columns=columns)

# Prompts Definition

In [None]:
# =========================================================================================
# Prompt to indent claim
# Template with a single `{input_text}` placeholder; the model is asked to mark
# each logical block of the claim with leading ">" characters.
indenting_prompt = """Your task is to format the following patent claim by indenting each logical block of information.
Use  ">" characters to indent the beginning of each block. 

\"{input_text}\"
"""
# Show the template in the notebook output for a quick visual check
print(indenting_prompt)

# =========================================================================================
# Prompt to rephrase a text using its context
# Template with `{input_context}` and `{input_text}` placeholders; asks for an
# SVO rephrasing of the input with pronouns resolved from the context.
rephrasing_with_context_prompt = """Your task is to rephrase the given text into Subject-Verb-Object (SVO) structure.
Avoid using pronouns. Instead, repeat the original subject explicitly where needed.

Use the provided context (if any) to resolve references and pronouns in the main text.

Context Format: Supplementary information providing background for the main text.
Input Format: The main text that is to be rephrased.

Context: \"{input_context}\"
Input: \"{input_text}\"
Output:""" 

# =========================================================================================
# Prompt to split a text into sub-sentences using its context
# Template with `{input_context}` and `{input_text}` placeholders; the requested
# output is a list literal of double-quoted sub-sentences.
splitting_with_context_prompt = """Your task is to split the given text into sub-sentences, ensuring that:
1. Each sub-sentence must contain only one predicate.
2. Avoid using pronouns. Instead, repeat the original subject explicitly where needed.

Use the provided context (if any) to resolve references and pronouns in the main text.

Context Format: Supplementary information providing background for the main text.
Input Format: The main text that is to be split.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Context: \"{input_context}\"
Input: \"{input_text}\"
Output:
"""

Your task is to format the following patent claim by indenting each logical block of information.
Use  ">" characters to indent the beginning of each block. 

"{input_text}"



## Prompt Trash

In [None]:
# =========================================================================================
# Earlier, context-free prompt variants kept for reference ("Prompt Trash").
# NOTE(review): the names `rephrasing_prompt` and `splitting_prompt` are rebound
# later in the notebook by the processing loop, so these templates do not survive
# that cell - confirm whether they are still needed.
##task = """Your task is to generate a summary of a given text. Maintain the original words without any changes.\n""" # summarize
##task = """Your task is to rephrase a text. Maintain the original words without any changes.\n""" # rephrase
# Context-free rephrasing template with a single `{input_text}` placeholder
rephrasing_prompt = """Your task is to rephrase the following text while keeping the original words unchanged.
Follow these steps in sequence:
1. Rephrase the text into Subject-Verb-Object (SVO) structure.
2. Do not use any past participle verbs in your rephrased version.

Input: \"{input_text}\"
Output:"""

# =========================================================================================
# Prompt to split into sub-sentences
# Context-free splitting template with a single `{input_text}` placeholder; the
# requested output is a list literal of double-quoted sub-sentences.
## Maintain the original words without any changes. 
splitting_prompt = """Your task is to divide a given sentence into sub-sentences.
Insert periods to divide the sentence into meaningful sub-sentences. 
Do not use pronouns; instead, repeat the original subjects as needed.

Input Format: A single sentence.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Input: \"{input_text}\"
Output:"""

# Load Classification Model

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint / base-model pairs; the last pair assigned is the one actually used
checkpoint_name = 'bert-large-uncased_train_10_4'
model_name = "bert-large-uncased"
checkpoint_name = 'bert-for-patents_train_10_4'
model_name = "anferico/bert-for-patents"
checkpoint_path = f"/home/fantoni/patent-sentence-classification/models/finetuning/{checkpoint_name}.ckpt"

# Tokenizer of the base model
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
print('\nBase Tokenizer loaded succesfully.')

# Base model with a 4-class classification head
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)
print('\nBase model loaded succesfully.')

# Restore the fine-tuned classifier from the checkpoint, wrapping base model and tokenizer
model = PatentSentenceClassifier.load_from_checkpoint(
    checkpoint_path, model=base_model, tokenizer=bert_tokenizer
)

# Inference mode, on the selected device
model.eval()
model.to(device)
print(f"\nFinetuned model loaded succesfully. Using: '{checkpoint_name}'")

# Tokenizer as exposed by the fine-tuned model
tokenizer = model.tokenizer

# Import Data

In [49]:
# 1. Select one of my own patents (keep exactly one assignment line uncommented)
#patent_id = 'US9468782B2'; IPC = 'A62B23' 
#patent_id = 'US11673469B2'; IPC = 'B60K37'
#patent_id = 'US20200074811A1'; IPC = 'G07F17'

patent_id = 'US8695121B2'; IPC = 'A42B3'; input_text = """1. A helmet system for removing condensation from a user's field of vision, comprising: a helmet shell having an anterior section, a posterior section, and a venting passage, wherein the helmet shell defines an internal cavity that is in fluid communication with a front portion of the venting passage, and wherein the internal cavity is configured to receive the user's head; a visor coupled with the helmet shell, wherein at least part of the visor defines part of the internal cavity; a humidity sensor positioned within the internal cavity of the helmet shell; and a ventilation system comprising: a base coupled with the helmet shell, wherein the base has a first venting aperture in fluid communication with a rear portion of the venting passage, a base cover coupled with the base, wherein the base cover has a second venting aperture, an air movement assembly disposed between the base and the base cover, wherein the air movement assembly provides fluid communication between the first venting aperture and the second venting aperture, a switch, a power source, and a circuit card comprising: a first input configured to receive a signal from the humidity sensor, a second input configured to receive a second signal from the switch, a first module configured to determine a first instruction for the air movement assembly based on the signal received from the humidity sensor, a second module configured to determine a second instruction for the air movement assembly based on the second signal received from the switch, a first output configured to transmit the first instruction to the air movement assembly, wherein activation of the air movement assembly based on the first instruction operates to remove condensation from the user's field of vision by moving a volume of air of the internal cavity of the helmet shell through the venting passage and moving the volume of air through the second venting aperture of the base 
cover between the anterior section and the posterior section, and a second output configured to transmit the second instruction to the air movement assembly, wherein the second instruction controls whether power is delivered to the air movement assembly and controls whether the air movement assembly directs air flow from the first venting aperture toward the second venting aperture or from the second venting aperture toward the first venting aperture."""
#patent_id = 'US11133720B2'; IPC = 'H02K3'; input_text = """1. A power tool comprising: a housing; a motor housed inside the housing, the motor having a stator assembly and a rotor rotatably arranged inside the stator, the stator assembly comprising: a stator core defining a plurality of poles and having an outer surface that is substantially cylindrical formed around a longitudinal axis of the motor; at least one magnet wire wound on the plurality of poles forming a plurality of phases in a delta or a wye configuration; and a bus bar comprising: a non-conductive mount arranged on the outer surface of the lamination stack, and a plurality of conductive terminals arranged to receive electric power from a power source and supply electric power to the plurality of phases, wherein each conductive terminal includes: a main portion mounted on the non-conductive mount substantially parallel to the longitudinal axis of the motor, a tang portion folded over the main portion from a first longitudinal end of the main portion, and a connection tab at a second longitudinal end of the main portion, wherein at least a contact portion of the at least one magnet wire is wrapped around the tang portion and fused to make an electric connection to the conductive terminal, and the connection tab is arranged to makes electric contact with a wire supplying electric power to the motor."""

In [84]:
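# 2. Select a patent from the Pavanello set. NOTE: several assignments below are uncommented, so the last active one (IT-201900008253-A1) overrides all the others.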
IPC = 'F16D65-12'; patent_id = "WO-2019021161-A1" ; input_text = """1. Method for making a brake disc, comprising the following operating steps: a) arranging a brake disc, comprising a braking band (2) provided with two opposed braking surfaces (2a, 2b) , each of which defines at least partially one of two main sides of the disc, the braking band being made of aluminium or aluminium alloy or being made of grey cast iron or steel; b) depositing on the disc a layer of chromium carbide (Cr3C2) and nickel-chromium (NiCr) in particle form by HVOF (High Velocity Oxygen Fuel) technique or by HVAF (High Velocity Air Fuel) technique or by KM (Kinetic Metallization) technique, forming a base protective coating (30) covering at least one of the two braking surfaces of the braking band in direct contact therewith; and c) depositing on said base protective coating (30) a material in particle form consisting of tungsten carbide (WC) , iron (Fe), chromium (Cr) and aluminium (Al) by HVOF (High Velocity Oxygen Fuel) technique or by HVAF (High Velocity Air Fuel) technique or by KM (Kinetic Metallization) technique, forming a surface protective coating (3), consisting of tungsten carbide (WC) and iron (Fe), chromium (Cr) and aluminium (Al) and covering at least one of the two braking surfaces of the braking band."""
IPC = 'F16D55-288'; patent_id = "WO-2019243958-A1"; input_text = """1. A spring (22) for friction pads (1 1 ) associable with a caliper (2) of a disc brake (1 ) for elastically biasing the friction pads (1 1 ) away from a brake disc (3) of the disc brake (1 ), said spring (22) comprising a traverse elongated plate (25) folded so as to form a central stretch (26) and two opposite transverse stretches (27), extending from the central stretch (26) in two opposite transverse directions with respect to a longitudinal median plane (28) of the spring (22), said opposite transverse stretches (27) each forming a supporting stretch (29), a resting stretch (30) and a wing stretch (31 ) extending between the supporting stretch (29) and the resting stretch (30), wherein the supporting stretches (29) border on the central stretch (26) and oriented so that both supporting stretches (29) lie on a same supporting plane (32) transverse, possibly orthogonal, to the longitudinal median plane (28), wherein the wing stretch (31 ) borders on the respective supporting stretch (29) and comprises: - an ascending wing stretch (33) extending from the supporting stretch (29) away from the supporting plane (32) towards an upper side (34) of the spring (22) and away from the longitudinal median plane (28) to an upper apical point (35), - a descending wing stretch (36) extended from the upper apical point (35) further away from the longitudinal median plane (28) and towards a lower side (37) of the spring (22) to a folding line (38) which connects the wing stretch (31 ) to the resting stretch (30), wherein the spring (22) further comprises one or more fixing stretches (39) connected to the central stretch (26) and forming at least two opposite fixing stretches (40) for an elastic snap fixing to corresponding fixing seats (41 ) of the caliper (2), and wherein the resting stretch (30) forms: - a plate-shaped contact portion (44) for a free support, with possibility of sliding, against a 
corresponding contact surface (45) of the friction pad (1 1 ), - a plate-shaped intermediate portion (46) extending between the wing stretch (31 ) and the contact portion (44), wherein, with the spring (22) undeformed, the intermediate portion (46) extends transversely with respect to the supporting plane (32) towards the lower side (37) of the spring (22) and the contact portion (44) extends, starting from a folding edge (48) formed between the contact portion (44) and the intermediate portion (46), so as to diverge with respect to the supporting plane (32) and towards the longitudinal median plane (28)."""
IPC = 'B6078-1706'; patent_id = "WO-2020058819-A1"; input_text = """1. A braking system (4) for motorcycles comprising - a first manual actuator device (8), operable by means of a lever and / or a pedal, selectively connectable to at least a first braking device (12) placed on a front axle (16) of the vehicle, and/or at least a second braking device (20) placed on said front axle (16) or on a rear axle (22) of the motorcycle, each braking device (4) acting on a relative brake disc or drum (14), - the first manual actuator device (8) being provided with a hydraulic supply circuit (24) that can be selectively connected to a hydraulic input circuit (28) of at least one of said braking devices (12,20) by means of a control valve (32), said control valve (32) being positioned in an operating position, in which it hydraulically disconnects the first manual actuator device (8) from the braking devices (12,20), and in an electrical fault position, in which it hydraulically connects the first manual actuator device (8) with at least one of said braking devices (12,20), at least one electro-hydraulic automatic actuator device (36) fluidly connectable to the hydraulic input circuit (28) of at least one of said braking devices (12, 20) for the respective hydraulic actuation thereof, at least one electromechanical automatic actuator device (40) associated with at least one of said braking devices (12, 20) not provided with a hydraulic input circuit (28), - the electro-hydraulic automatic actuator devices (36) being associated with the front axle (16) and the electromechanical automatic actuator devices (40) being associated with the rear axle (22) of the motorcycle or vice versa, - a single control unit (44) operatively connected to the control valve (32), to the at least one electro-hydraulic automatic actuator device (36), to the at least one electromechanical automatic actuator device (40) and to the first manual actuator device (8) so as to operate said electro-hydraulic 
(36) and electromechanical (40) automatic actuator devices and said control valve (32) according to the position or configuration of the first manual actuator device (8) and/or according to the dynamics of the motorcycle."""
#IPC = 'B6078-261'; patent_id = "WO-2022144719-A1"; input_text = """1. A braking system (4) for a motorcycle (8) comprising: - at least one first brake (12) associated with a front wheel of said motorcycle (8), - at least one first electro-hydraulic or electromechanical actuator (16), operatively connected to said first brake (12), - at least one first manual actuation command (20), associated with and corresponding to said at least one first brake (12), to send a braking request from a user, - at least a second brake (24) associated with a rear wheel of said motorcycle (8), - at least a second electro-hydraulic or electromechanical actuator (28) operatively connected to said second brake (24), - at least a second manual actuation command (32), associated with and corresponding to said at least one second brake (24), to send a braking request from a user, - a control unit (36) operatively connected to the first manual actuation command (20), the second manual actuation command (32) and said first and second electro- hydraulic or electro-mechanical actuators (16,28), - wherein said control unit (36) is programmed to: - receive a braking request from the user following actuation of at least one of said manual actuation commands (20,32), - interpreting the braking request as a function of which or how many actuation commands have actually been actuated, and/or the intensity of such actuation given by a stroke and/or actuation force or pressure of the corresponding manual actuation command (20,32), - activating at least one of said electro-hydraulic or electro-mechanical actuators (16,28), irrespective of the effective actuation of the corresponding manual actuation command (20,32), so as to obtain a deceleration of the motorcycle (8) as a function of said braking request."""
#IPC = 'C08J5-0405'; patent_id = "WO-2017216367-A1"; input_text = """1. A process to make a flexible composite material comprising flexible ceramic nanofibers and a polymer, the process of making flexible ceramic nanofibers comprising the steps of: a. Preparing a ceramic fibers' precursor solution, the precursor solution comprising (i) a dissolved metal's precursor for ceramic selected from the group consisting of metallic ions and metal containing polymer, where the metals are preferably selected from the group consisting of Si4+, Zr4+, ΤΊ4+, Y3+, Al3+, Zn2+, Mg2+, Pb4+ , Ni2+, Sr2+, Ca2+, La3+ ; (ii) a polymer to increase the precursor solution's viscosity, with the solid content of the precursor solution (polymer plus precursor) being above 5% by weight, preferably 15% by weight, in order to obtain the required deposition, and (iii) solvent capable of providing the precursor solution giving a sufficiently high evaporation rate; b. Allowing the dissolved metal precursors for ceramic to form a final metal oxide also known as ceramic; c. Maintaining the precursor solution's viscosity between 0.01 and 1000 Pascal- second (Pa-s), preferably between 0.01 and 5000 Pascal-second (Pa-s), at a shear rate of 0,01 to 1 s"1, preferably 0.1 s"1, in order to spin usable fibers; d. Spinning the precursor solution by using a spinning process selected from the group consisting of for example forcespinning, electrospinning and blowspinning wherein the spinning parameters are tunable so that the spinning step can result in polymeric fibers and with the spinning parameters being adaptable to each precursor solution; e. Annealing the polymeric fibers obtained from the spinning process, the polymeric fibers comprising the metals precursors for ceramic, until all the organic content is burned out and the metallic ion oxidizes to form a ceramic; f. 
Tuning and calibrating annealing parameters, the annealing parameters comprising heating and cooling rates, annealing temperature and dwell time consistent with preferably a trapezium shaped thermal profile so a crystallinity comprising a crystal size of 1 to 100 nm and a smoothness of 0.05 to 5 nm of Rq of the resulting 20 to 10000 nm thick fibers is obtained, the annealing parameters being distinct and specific with respect to each material composition; g. Setting the annealing temperature above the ceramic fiber's crystallization point resulting in the formation of ceramic material; and h. Setting a dwell time from 0 to 5 hours or even more."""
#IPC = 'G06F30-30'; patent_id = "US-10733341-B1"; input_text = """1. A computer-implemented method comprising: retrieving, by a network node in a plurality of distributed network nodes, code differentials between successive iterations of an integrated circuit design in a hardware description language; iteratively generating, by the network node, a first plurality of digital blocks for a first chain in a first direction within a two-dimensional distributed digital ledger hosted by the plurality of distributed network nodes, the first plurality of digital blocks containing hashes of corresponding code differentials generated by the network node using a first hashing protocol associated with a first level of security; retrieving, by the network node, a plurality of simulation data records generated through successive simulation operations on the integrated circuit design; iteratively generating, by the network node, a second plurality of digital blocks for a second chain in a second direction within the two-dimensional distributed ledger, the second plurality of digital blocks containing hashes of corresponding simulation data records generated by the network node using a second hashing protocol using a second level of security, such that the second chain is cryptographically separate from the first chain; associating, by the network node, the first plurality of digital blocks and the second plurality of digital blocks to a physically unclonable function in an integrated circuit fabricated from the integrated circuit design; in response to the network node receiving a first query containing the physically unclonable function from a first user with first level security credentials: displaying, by the network node, the code differentials of the integrated circuit design; and in response to the network node receiving a second query containing the physically unclonable function from a second user with second level security credentials: displaying, by the network node, the 
plurality of simulation data records of the integrated circuit design."""
IPC = 'B65G1-023'; patent_id = "IT-201900008253-A1"; input_text = """1. A system (10) comprising: - a roller box (15) which includes: ● two parallel sides (25), ● at least one horizontal shelf (R) defined by a plurality of idle rollers (20) placed side by side along a side-by-side direction (A) in which each roller (20) is supported at opposite ends by a respective side (25) received in a seat through made in the respective side (25), at least one first end (30) of the ends of each roller (20) presenting a prismatic seat (70), in which the first ends (30) of the rollers (20) all protrude from the same side (25), and ● a connection assembly configured to pivotally connect each seat to the respective end of a roller (20), the system (10), furthermore, comprising: - a drive device (P) which comprises: or a frame (T), either a horizontal bar (65) sliding vertically on the frame (T), or a plurality of mandrels (M) rotatably associated with the bar (65) with respect to a horizontal axis of revolution, in which each mandrel (M) can be inserted in a seat prismatic (70) of a respective roller (20), in which each spindle (M) comprises: ● a prismatic gripping head (75) placed at the free end of a shaft (80), and ● a bushing (85) equipped with an external fluted jacket inserted by coaxial interference on the end opposite the free end of the shaft (80), characterized in that said bushing (85) is made of a rigid material and the shaft (80) is flexible."""

# Extract Hierarchy

In [92]:
# Indent text with prompt
# prompt_chatgpt returns (output_string, formatted_prompt); only the model output is needed here
output_string, _ = prompt_chatgpt(input_text=input_text, input_context=None, prompt=indenting_prompt, model='gpt-3.5-turbo')
print(output_string)

>1. A system (10) comprising:
>>- a roller box (15) which includes:
>>>- two parallel sides (25),
>>>- at least one horizontal shelf (R) defined by a plurality of idle rollers (20) placed side by side along a side-by-side direction (A) in which each roller (20) is supported at opposite ends by a respective side (25) received in a seat through made in the respective side (25), at least one first end (30) of the ends of each roller (20) presenting a prismatic seat (70), in which the first ends (30) of the rollers (20) all protrude from the same side (25), and
>>>- a connection assembly configured to pivotally connect each seat to the respective end of a roller (20), the system (10), furthermore, comprising:
>>>>- a drive device (P) which comprises: or a frame (T), either a horizontal bar (65) sliding vertically on the frame (T), or a plurality of mandrels (M) rotatably associated with the bar (65) with respect to a horizontal axis of revolution, in which each mandrel (M) can be inserted 

In [93]:
# Parse the '>'-indented model output into a DataFrame of indexed sentences with their parents
df = create_hierarchy(output_string)
df  # display the resulting table in the notebook

Unnamed: 0,index,sentence,parent_indices,parents
0,1,1. A system (10) comprising:,[],[]
1,1.1,- a roller box (15) which includes:,[1],[1. A system (10) comprising:]
2,1.1.1,"- two parallel sides (25),","[1, 1.1]","[1. A system (10) comprising:, - a roller box ..."
3,1.1.2,- at least one horizontal shelf (R) defined by...,"[1, 1.1]","[1. A system (10) comprising:, - a roller box ..."
4,1.1.3,- a connection assembly configured to pivotall...,"[1, 1.1]","[1. A system (10) comprising:, - a roller box ..."
5,1.1.3.1,- a drive device (P) which comprises: or a fra...,"[1, 1.1, 1.1.3]","[1. A system (10) comprising:, - a roller box ..."
6,1.1.3.1.1,- a prismatic gripping head (75) placed at the...,"[1, 1.1, 1.1.3, 1.1.3.1]","[1. A system (10) comprising:, - a roller box ..."
7,1.1.3.1.2,- a bushing (85) equipped with an external flu...,"[1, 1.1, 1.1.3, 1.1.3.1]","[1. A system (10) comprising:, - a roller box ..."


# Rephrasing + Splitting + Classification

Note: the ChatGPT web app and the OpenAI API can return different results for the same prompt, see https://community.openai.com/t/different-results-same-prompt-on-openai-api-vs-chatgpt/1062995

In [None]:
# Initialize ROUGE scorer with various n-gram options
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
# Scores n-gram overlap for n = 1, 3, 5, 7, 9 plus the longest-common-subsequence variant
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

# Model configuration
# The second assignment overrides the first: 'gpt-3.5-turbo' is the model actually used
CHATGPT_MODEL ='gpt-4o'
CHATGPT_MODEL ='gpt-3.5-turbo'
# Sampling parameters forwarded to prompt_chatgpt
TEMPERATURE = 0
TOP_P = 1

# Accumulates one dict per classified sub-sentence
results = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Claim Texts"):  
    try:
        # 1. Rephrase text using context ==========================================================================================================================
        input_text = row['text']
        if row['parents']:
            input_context = ' '.join(row['parents']) 
            rephrased_text, rephrasing_prompt = prompt_chatgpt(input_text, input_context, rephrasing_with_context_prompt, CHATGPT_MODEL, TEMPERATURE, TOP_P)
        else 'None'
        
        # 2. Split text into sub sentences ==========================================================================================================================
        split_text, splitting_prompt = prompt_chatgpt(rephrased_text, input_context, splitting_with_context_prompt, CHATGPT_MODEL, TEMPERATURE, TOP_P)

        # Validate output format
        if not split_text:  
            raise ValueError(f"Output is empty.")
        try:
            split_text = ast.literal_eval(split_text)  
            #print(f"Output is in list format.") 
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Output not in list format: {e}")
        
        
        # 3. Classify sub sentences ==========================================================================================================================
        for sent in split_text:
            
            # Perfrom Classification
            pred_class, probs = classify_text(model, sent, device)
            
            # Calculate rouge score
            score = rouge(sent, input_text)

            # Append results
            results.append({
                'text_id': row['text_id'],
                'text': input_text,
                'context': input_context,
                'rephrasing_prompt': rephrasing_prompt,
                'rephrased_text': rephrased_text,
                'splitting_prompt': splitting_prompt,
                'sentence': sent,
                'pred_class': pred_class,
                'probs': probs,
                'rouge1_precision': round(score['rouge1_precision'].item(), 3),
                'rouge3_precision': round(score['rouge3_precision'].item(), 3),
                'rouge5_precision': round(score['rouge5_precision'].item(), 3),
                'rouge7_precision': round(score['rouge7_precision'].item(), 3),
                'rouge9_precision': round(score['rouge9_precision'].item(), 3),
                'rougeL_precision': round(score['rougeL_precision'].item(), 3),
                'reph_input_count': reph_input_count,
                'reph_output_count': reph_output_count,
                'split_input_count': split_input_count,
                'split_output_count': split_output_count,
                'errors': None,
            })

    # Process Errors =======================================================================================
    except Exception as e:
        print(f"Error processing sentence {row['text_id']}: {str(e)}")
        results.append({
            'text_id': row['text_id'],
            'text': input_text,
            'context': input_context,
            'rephrasing_prompt': rephrasing_prompt,
            'rephrased_text': None,
            'splitting_prompt': None,
            'sentence': None,
            'pred_class': None,
            'probs': None,
            'rouge1_precision': None,
            'rouge3_precision': None,
            'rouge5_precision': None,
            'rouge7_precision': None,
            'rouge9_precision': None,
            'rougeL_precision': None,
            'reph_input_count': None,
            'reph_output_count': None,
            'split_input_count': None,
            'split_output_count': None,
            'errors': str(e),
            #'elapsed_time_sec': time.time() - start_time
        })

results_df = pd.DataFrame(results)
results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/claim_simplification/first_claim_{patent_id}_{IPC}_{chatgpt_model}.xlsx", index=False)   
results_df

Processing Claim Texts: 100%|██████████| 9/9 [01:37<00:00, 10.86s/it]


Unnamed: 0,text_id,text,context,rephrasing_prompt,rephrased_text,splitting_prompt,sentence,pred_class,probs,rouge1_precision,rouge3_precision,rouge5_precision,rouge7_precision,rouge9_precision,rougeL_precision,reph_input_count,reph_output_count,split_input_count,split_output_count,errors
0,1,1. A system (10) comprising:,,Your task is to rephrase the given text into S...,"""A system (10) comprises:""",Your task is to split the given text into sub-...,A system (10) comprises,STR,"[0.01, 0.82, 0.01, 0.16]",0.75,0.5,0.0,0.0,0.0,0.75,101,7,148,8,
1,2,a roller box (15) which includes:,1. A system (10) comprising:,Your task is to rephrase the given text into S...,The roller box (15) includes.,Your task is to split the given text into sub-...,The roller box (15) includes.,STR,"[0.01, 0.97, 0.01, 0.01]",0.8,0.333,0.0,0.0,0.0,0.8,108,8,155,9,
2,3,"two parallel sides (25),",1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The roller box (15) includes two parallel side...,Your task is to split the given text into sub-...,The roller box (15) includes two parallel side...,STR,"[0.0, 0.98, 0.01, 0.0]",0.444,0.286,0.0,0.0,0.0,0.444,115,13,170,15,
3,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,The system (10) comprises a roller box (15),STR,"[0.0, 0.97, 0.01, 0.01]",0.375,0.0,0.0,0.0,0.0,0.25,222,145,302,154,
4,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,The roller box (15) includes at least one hori...,STR,"[0.0, 0.98, 0.01, 0.0]",0.727,0.444,0.286,0.0,0.0,0.545,222,145,302,154,
5,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,The horizontal shelf (R) is defined by a plura...,STR,"[0.01, 0.97, 0.02, 0.01]",1.0,0.636,0.444,0.286,0.0,0.846,222,145,302,154,
6,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,The idle rollers (20) are placed side by side ...,STR,"[0.0, 0.98, 0.01, 0.0]",0.938,0.714,0.583,0.5,0.375,0.875,222,145,302,154,
7,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,Each roller (20) is supported at opposite ends...,STR,"[0.21, 0.46, 0.31, 0.02]",1.0,1.0,1.0,1.0,1.0,1.0,222,145,302,154,
8,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,Each side (25) receives a seat through made in...,STR,"[0.02, 0.88, 0.09, 0.01]",0.923,0.636,0.556,0.429,0.2,0.923,222,145,302,154,
9,4,at least one horizontal shelf (R) defined by a...,1. A system (10) comprising: a roller box (15)...,Your task is to rephrase the given text into S...,The system (10) comprises a roller box (15). T...,Your task is to split the given text into sub-...,At least one first end (30) of the ends of eac...,STR,"[0.0, 0.98, 0.01, 0.01]",0.944,0.812,0.643,0.583,0.5,0.944,222,145,302,154,


# Prova Codice

In [None]:
# Experimental cell: split each text of `df` into sub-sentences with ChatGPT, classify
# every sub-sentence, and re-split (once) any sub-sentence classified as 'MIX'.
# One result row is stored per final sub-sentence; a row that fails gets one error row.
# Relies on names defined in other cells: df, model, device, splitting_prompt,
# patent_id, IPC, prompt_chatgpt, classify_text, count_tokens.

# Initialize ROUGE scorer with various n-gram options
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

# Model configuration
#chatgpt_model ='gpt-4o'
chatgpt_model ='gpt-3.5-turbo'
temperature = 0
top_p = 1

# Result columns holding the ROUGE precision of a sub-sentence against the original text
PRECISION_COLUMNS = (
    'rouge1_precision',
    'rouge3_precision',
    'rouge5_precision',
    'rouge7_precision',
    'rouge9_precision',
    'rougeL_precision',
)


def _split_with_chatgpt(text_to_split):
    """Send one splitting request and return (raw_output, formatted_prompt, input_count, output_count)."""
    # prompt_chatgpt takes (input_text, input_context, prompt, model, temperature, top_p)
    # and returns (output, formatted_prompt); token counts are computed here.
    # NOTE(review): no context is passed, which assumes `splitting_prompt` only uses
    # the {input_text} placeholder - confirm against the template.
    raw_output, prompt_text = prompt_chatgpt(
        text_to_split, None, splitting_prompt, chatgpt_model, temperature, top_p
    )
    return raw_output, prompt_text, count_tokens(prompt_text), count_tokens(raw_output)


def _parse_sentence_list(raw_output):
    """Parse the model answer as a Python list literal; raise ValueError if it is not one."""
    if not raw_output:
        raise ValueError("Output is empty.")
    try:
        parsed = ast.literal_eval(raw_output)
    except (SyntaxError, ValueError) as e:
        raise ValueError(f"Output not in list format: {e}") from e
    # literal_eval also accepts a bare string/number; iterating a string would
    # classify single characters, so reject anything that is not a sequence.
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"Output not in list format: got {type(parsed).__name__}")
    print("Output is in list format.")
    return parsed


def _result_row(row, text, prompt_text, sentence, pred_class, probs,
                sentences, input_count, output_count, start_time):
    """Build the result record for one classified sub-sentence, including ROUGE precisions."""
    score = rouge(sentence, text)
    return {
        'text_id': row['text_id'],
        'text': text,
        'prompt': prompt_text,
        'generated_sent': sentence,
        'pred_sent_class': pred_class,
        'probs': probs,
        **{column: round(score[column].item(), 3) for column in PRECISION_COLUMNS},
        'output_string': sentences,
        'input_count': input_count,
        'output_count': output_count,
        'errors': None,
        'elapsed_time_sec': time.time() - start_time,
    }


results = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Sentences"):
    start_time = time.time()
    # Reset per-row state so the error handler never reports values left over from a
    # previous row. `split_output` is kept separate from the notebook-level
    # `output_string`, which other cells pass to create_hierarchy / build_tree_with_parents.
    text = None
    formatted_prompt = None
    split_output = None
    try:
        # ==========================================================================================================================
        # Split text into sub-sentences
        text = row['text']
        split_output, formatted_prompt, input_count, output_count = _split_with_chatgpt(text)

        # Validate output format (replaces the raw string with the parsed list)
        split_output = _parse_sentence_list(split_output)

        for generated_sent in split_output:

            # Classify the text
            pred_class, probs = classify_text(model, generated_sent, device)

            # =========================================================================================================================
            # If mixed class, retry Splitting and Classification on that sub-sentence
            if pred_class == 'MIX':
                print('Found MIX sentence, retry splitting and classification ...')
                new_output, new_formatted_prompt, new_input_count, new_output_count = _split_with_chatgpt(generated_sent)

                # Validate output format
                new_output = _parse_sentence_list(new_output)

                for new_generated_sent in new_output:
                    # Classify the text; ROUGE is still measured against the original text
                    new_pred_class, new_probs = classify_text(model, new_generated_sent, device)
                    results.append(_result_row(
                        row, text, new_formatted_prompt, new_generated_sent,
                        new_pred_class, new_probs, new_output,
                        new_input_count, new_output_count, start_time,
                    ))
            # =========================================================================================================================
            # Process non-MIX class directly
            else:
                results.append(_result_row(
                    row, text, formatted_prompt, generated_sent,
                    pred_class, probs, split_output,
                    input_count, output_count, start_time,
                ))

    # Process Errors =======================================================================================
    # Deliberately broad: one failing row (API error, bad output format, ...) must not
    # abort the whole run; the failure is printed and recorded in the 'errors' column.
    except Exception as e:
        print(f"Error processing sentence {row['text_id']}: {str(e)}")
        results.append({
            'text_id': row['text_id'],
            'text': text,
            'prompt': formatted_prompt,
            'generated_sent': None,
            'pred_sent_class': None,
            'probs': None,
            **dict.fromkeys(PRECISION_COLUMNS),
            'output_string': split_output,
            'input_count': None,
            'output_count': None,
            'errors': str(e),
            'elapsed_time_sec': time.time() - start_time
        })

results_df = pd.DataFrame(results)
results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/mix_disambiguation/first_claim_{patent_id}_{IPC}_{chatgpt_model}.xlsx", index=False)
##results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/first_claim_{patent_id}_{IPC}_{chatgpt_model}_temp_{temperature}_top_{top_p}_asis.xlsx", index=False)

Processing Sentences:   0%|          | 0/11 [00:00<?, ?it/s]

Output is in list format.


Processing Sentences:   9%|▉         | 1/11 [00:09<01:31,  9.11s/it]

Output is in list format.


Processing Sentences:  18%|█▊        | 2/11 [00:19<01:27,  9.76s/it]

Output is in list format.


Processing Sentences:  27%|██▋       | 3/11 [00:28<01:16,  9.51s/it]

Output is in list format.


Processing Sentences:  36%|███▋      | 4/11 [00:36<01:00,  8.70s/it]

Output is in list format.


Processing Sentences:  45%|████▌     | 5/11 [00:42<00:47,  7.89s/it]

Output is in list format.


Processing Sentences:  55%|█████▍    | 6/11 [00:47<00:34,  6.96s/it]

Output is in list format.


Processing Sentences:  64%|██████▎   | 7/11 [00:54<00:27,  6.80s/it]

Output is in list format.


Processing Sentences:  73%|███████▎  | 8/11 [01:00<00:19,  6.56s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 8: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences:  82%|████████▏ | 9/11 [01:06<00:13,  6.63s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 9: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences:  91%|█████████ | 10/11 [01:14<00:06,  6.78s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 10: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences: 100%|██████████| 11/11 [01:22<00:00,  7.46s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 11: name 'split_sentence_chatgpt' is not defined





In [None]:
# Build the tree of claim lines from `output_string`
# (build_tree_with_parents is defined elsewhere in the notebook).
tree = build_tree_with_parents(output_string)

# Flatten the tree into a dataframe: one row per node, numbered from 1, carrying the
# node's own line as 'text' and its parent lines joined by spaces as 'context'.
df = pd.DataFrame(
    [
        {'text_id': text_id, 'text': node['line'], 'context': ' '.join(node['parents'])}
        for text_id, node in enumerate(tree, start=1)
    ]
)

df