# Installations and Imports

In [1]:
import torch
import os
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import tiktoken
from dotenv import load_dotenv
import time
import ast
import re
import warnings
warnings.filterwarnings('ignore')

# LangChain Import
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Finetuned Model Import
from transformers import BertTokenizer, BertForSequenceClassification
from src.model import PatentSentenceClassifier

# Load OpenaAI API key
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Utils

In [39]:
def count_tokens(text):
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def calculate_rouge_scores_precision(text1, text2, rouge_scorer):
    """
    Calculate ROUGE scores precision between two texts.
    
    Parameters:
    - text1: First text for comparison (typically the generated or processed text)
    - text2: Second text for comparison (typically the original reference text)
    - rouge_scorer: Initialized ROUGEScore object
    
    Returns:
    - Dictionary containing rounded ROUGE precision scores
    """
    score = rouge_scorer(text1, text2)
    
    return {
        'rouge1_precision': round(score['rouge1_precision'].item(), 3),
        'rouge3_precision': round(score['rouge3_precision'].item(), 3),
        'rouge5_precision': round(score['rouge5_precision'].item(), 3),
        'rouge7_precision': round(score['rouge7_precision'].item(), 3),
        'rouge9_precision': round(score['rouge9_precision'].item(), 3),
        'rougeL_precision': round(score['rougeL_precision'].item(), 3),
    }

def prompt_chatgpt(input_text, input_context, prompt, model="gpt-4o", temperature=0, top_p=1):
    
    # Define a prompt template for classification
    prompt_template = PromptTemplate.from_template(prompt)

    # Create an OpenAI LLM instance
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        top_p=top_p,
        max_retries=1,
        max_tokens=1000 
    )

    # Create a runnable sequence
    chain = prompt_template | llm | StrOutputParser()

    # Prepare inputs
    inputs = {"input_text": input_text}
    if input_context:
        inputs["input_context"] = input_context

    # Format prompt
    formatted_prompt = prompt_template.format(**inputs)
    ##print(f"Generated Prompt:\n{formatted_prompt}") # Debugging statement

    # Invoke Chain
    output_string = chain.invoke(inputs).strip()

    # Calculate token count
    #input_count = count_tokens(formatted_prompt)
    #output_count = count_tokens(output_string)

    #print(f"Using: model = '{model}'; temperature = {temperature}; top_p = {top_p}") # Debugging statement

    return output_string, formatted_prompt


def classify_text(model, input_text, device='cpu'):
    
    # Tokenize input
    tokenizer = model.tokenizer  # Assuming tokenizer is part of the model
    inputs = tokenizer(input_text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    
    # Move input to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Define label mapping
    int_to_label = {0: 'FUN', 1: 'STR', 2: 'MIX', 3: 'OTH'}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
        pred_idx = torch.argmax(probs).item()
        pred_class = int_to_label[pred_idx]

    return pred_class, [round(p, 2) for p in probs.tolist()]


def create_hierarchy(text):
    """
    Arguments:
        text (str): A multiline string where each line begins with one or more '>' characters to indicate hierarchy.

    Returns:
        pd.DataFrame: A DataFrame with the following columns:
            - 'index': Hierarchical index (e.g., '1', '1.1', '1.1.1')
            - 'sentence': The textual content of the line
            - 'parent_indices': List of parent index strings
            - 'parents': List of parent content strings
    """
    
    lines = text.strip().splitlines()
    counters = []
    index_sentence_dict = {}
    rows = []

    for line in lines:
        # Remove the leading '>' used to denote the root node level
        line = line[1:]  # The first '>' is always present, even for root-level items
        
        # Determine level by counting leading '>' characters
        level = len(line) - len(line.lstrip('>'))
        content = line.lstrip('>').strip()
        if not content:
            continue

        # Adjust counters for current level
        if len(counters) <= level:
            counters += [1] * (level + 1 - len(counters))
        else:
            counters = counters[:level + 1]
            counters[level] += 1

        # Build Index
        index = ".".join(map(str, counters[:level + 1]))
        index_sentence_dict[index] = content

        # Generate parent indices and content inline
        parent_indices = [".".join(map(str, counters[:i])) for i in range(1, level + 1)]
        parent_contents = [index_sentence_dict[pidx] for pidx in parent_indices if pidx in index_sentence_dict]

        rows.append({
            "index": index,
            "text": content,
            "parent_indices": parent_indices,
            "parents": parent_contents
        })

    return pd.DataFrame(rows)


def append_result_to_list(results, index, input_text, context=None, rephrasing_prompt=None, 
                         rephrased_text=None, splitting_prompt=None, sentence=None, 
                         pred_class=None, probs=None, rouge_scores=None, error=None):
    """
    Append a single result entry to the results list.
    
    Parameters:
    - results: The list to append results to
    - index: Index level of the hyerarchy
    - input_text: Original claim text
    - context: Parent claim context if available
    - rephrasing_prompt: Prompt used for rephrasing
    - rephrased_text: Text after rephrasing
    - splitting_prompt: Prompt used for splitting
    - sentence: Current sentence being processed
    - pred_class: Predicted classification
    - probs: Classification probabilities
    - rouge_scores: Dict of ROUGE scores (keys: rouge1_precision, rouge3_precision, etc.)
    - error: Error message if processing failed
    
    Returns:
    - The updated results list
    """
    # Create base result dictionary
    result = {
        'index': index,
        'text': input_text,
        'context': context,
        'rephrasing_prompt': rephrasing_prompt,
        'rephrased_text': rephrased_text,
        'splitting_prompt': splitting_prompt,
        'sentence': sentence,
        'pred_class': pred_class,
        'probs': probs,
        'rouge1_precision': None,
        'rouge3_precision': None,
        'rouge5_precision': None,
        'rouge7_precision': None,
        'rouge9_precision': None,
        'rougeL_precision': None,
        'errors': error
    }
    
    # Update with ROUGE scores if provided
    if rouge_scores and not error:
        result.update(rouge_scores)
    
    # Append to results list
    results.append(result)
    
    return results

# Prompts Definition

In [3]:
# =========================================================================================
# Prompt to indent claim
indenting_prompt = """Your task is to format the following patent claim by indenting each logical block of information.
Use  ">" characters to indent the beginning of each block. 

\"{input_text}\"
"""
print(indenting_prompt)

# =========================================================================================
# Prompt to rephrase a text using its context
rephrasing_with_context_prompt = """Your task is to rephrase the given text into Subject-Verb-Object (SVO) structure.
Avoid using pronouns. Instead, repeat the original subject explicitly where needed.

Use the provided context (if any) to resolve references and pronouns in the main text.

Context Format: Supplementary information providing background for the main text.
Input Format: The main text that is to be rephrased.

Context: \"{input_context}\"
Input: \"{input_text}\"
Output:""" 

# =========================================================================================
# Prompt to split a text into sub-sentences using its context
splitting_with_context_prompt = """Your task is to split the given text into sub-sentences, ensuring that:
1. Each sub-sentence must contain only one predicate.
2. Avoid using pronouns. Instead, repeat the original subject explicitly where needed.
3. Do not split inline lists; treat item lists as a single unit.

Use the provided context (if any) to resolve references and pronouns in the main text.

Context Format: Supplementary information providing background for the main text.
Input Format: The main text that is to be split.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Context: \"{input_context}\"
Input: \"{input_text}\"
Output:
"""
print(splitting_with_context_prompt)


Your task is to format the following patent claim by indenting each logical block of information.
Use  ">" characters to indent the beginning of each block. 

"{input_text}"

Your task is to split the given text into sub-sentences, ensuring that:
1. Each sub-sentence must contain only one predicate.
2. Avoid using pronouns. Instead, repeat the original subject explicitly where needed.
3. Do not split inline lists; treat item lists as a single unit.

Use the provided context (if any) to resolve references and pronouns in the main text.

Context Format: Supplementary information providing background for the main text.
Input Format: The main text that is to be split.
Output Format: A list of sub-sentences enclosed in double quotes, separated by commas (e.g., ["sub-sentence1", "sub-sentence2", "sub-sentence3"]).

Context: "{input_context}"
Input: "{input_text}"
Output:



# Load Classification Model

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set path to checkpoint
checkpoint_name = 'bert-large-uncased_train_10_4'; model_name = "bert-large-uncased"
checkpoint_name = 'bert-for-patents_train_10_4'; model_name = "anferico/bert-for-patents" 
checkpoint_path = f"/home/fantoni/patent-sentence-classification/models/finetuning/{checkpoint_name}.ckpt"

# Load Base Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
print('\nBase Tokenizer loaded succesfully.')

# Load Base Model
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)
print('\nBase model loaded succesfully.')

# Load Finetuned Model
model = PatentSentenceClassifier.load_from_checkpoint(
    checkpoint_path,
    model=base_model,
    tokenizer=bert_tokenizer)

model.eval()
model.to(device)
print(f"\nFinetuned model loaded succesfully. Using: '{checkpoint_name}'")

# Define Finetuned Tokenizer
tokenizer = model.tokenizer

Using device: cpu

Base Tokenizer loaded succesfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at anferico/bert-for-patents and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Base model loaded succesfully.

Finetuned model loaded succesfully. Using: 'bert-for-patents_train_10_4'


# Import Data

In [51]:
# Patents
filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/US8695121B2_A42B3.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/US11133720B2_H02K3.txt"

# Pavanello Patents
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/IT-201900008253-A1_B65G1-023.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/US-10733341-B1_G06F30-30.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/WO-2017216367-A1_C08J5-0405.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/WO-2019021161-A1_F16D65-12.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/WO-2019243958-A1_F16D55-288.txt"
#filepath ="/home/fantoni/patent-sentence-classification/data/claim_simplification/WO-2020058819-A1_B6078-1706.txt"
#filepath = "/home/fantoni/patent-sentence-classification/data/claim_simplification/WO-2022144719-A1_B6078-261.txt"

with open(filepath, "r") as file:
    input_text = file.read()
    print(input_text)

# Set file name
filename = os.path.splitext(os.path.basename(filepath))[0]

1. A helmet system for removing condensation from a user's field of vision, comprising: a helmet shell having an anterior section, a posterior section, and a venting passage, wherein the helmet shell defines an internal cavity that is in fluid communication with a front portion of the venting passage, and wherein the internal cavity is configured to receive the user's head; a visor coupled with the helmet shell, wherein at least part of the visor defines part of the internal cavity; a humidity sensor positioned within the internal cavity of the helmet shell; and a ventilation system comprising: a base coupled with the helmet shell, wherein the base has a first venting aperture in fluid communication with a rear portion of the venting passage, a base cover coupled with the base, wherein the base cover has a second venting aperture, an air movement assembly disposed between the base and the base cover, wherein the air movement assembly provides fluid communication between the first venti

# Extract Hierarchy

In [52]:
# Indent text with prompt
output_string, _ = prompt_chatgpt(input_text=input_text, input_context=None, prompt=indenting_prompt, model='gpt-3.5-turbo')
print(output_string)

df = create_hierarchy(output_string)
df

>1. A helmet system for removing condensation from a user's field of vision, comprising:
>>a helmet shell having an anterior section, a posterior section, and a venting passage, wherein the helmet shell defines an internal cavity that is in fluid communication with a front portion of the venting passage, and wherein the internal cavity is configured to receive the user's head;
>>a visor coupled with the helmet shell, wherein at least part of the visor defines part of the internal cavity;
>>a humidity sensor positioned within the internal cavity of the helmet shell; and
>>a ventilation system comprising:
>>>a base coupled with the helmet shell, wherein the base has a first venting aperture in fluid communication with a rear portion of the venting passage,
>>>a base cover coupled with the base, wherein the base cover has a second venting aperture,
>>>an air movement assembly disposed between the base and the base cover, wherein the air movement assembly provides fluid communication betwe

Unnamed: 0,index,text,parent_indices,parents
0,1,1. A helmet system for removing condensation f...,[],[]
1,1.1,"a helmet shell having an anterior section, a p...",[1],[1. A helmet system for removing condensation ...
2,1.2,"a visor coupled with the helmet shell, wherein...",[1],[1. A helmet system for removing condensation ...
3,1.3,a humidity sensor positioned within the intern...,[1],[1. A helmet system for removing condensation ...
4,1.4,a ventilation system comprising:,[1],[1. A helmet system for removing condensation ...
5,1.4.1,"a base coupled with the helmet shell, wherein ...","[1, 1.4]",[1. A helmet system for removing condensation ...
6,1.4.2,"a base cover coupled with the base, wherein th...","[1, 1.4]",[1. A helmet system for removing condensation ...
7,1.4.3,an air movement assembly disposed between the ...,"[1, 1.4]",[1. A helmet system for removing condensation ...
8,1.4.4,"a switch,","[1, 1.4]",[1. A helmet system for removing condensation ...
9,1.4.5,"a power source, and","[1, 1.4]",[1. A helmet system for removing condensation ...


# Rephrasing + Splitting + Classification

different results ChatGPT and API ChatGPT : https://community.openai.com/t/different-results-same-prompt-on-openai-api-vs-chatgpt/1062995

In [53]:
import pandas as pd
import re
import ast
from tqdm import tqdm
from torchmetrics.text.rouge import ROUGEScore

# Initialize ROUGE scorer with various n-gram options
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
rouge_scorer = ROUGEScore(rouge_keys=('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

# Model configuration
CHATGPT_MODEL = 'gpt-4o'
#CHATGPT_MODEL = 'gpt-3.5-turbo'
TEMPERATURE = 0
TOP_P = 1

# Main processing loop
results = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Claim Texts"):  
    try:
        input_text = row['text']
        index = row['index']
        word_count = len(re.findall(r'\b\w+\b', input_text))
        input_context = None
        rephrasing_prompt = None
        splitting_prompt = None

        # Case 1: Short text ending with colon - classification only
        # Rephrasing such sentences using contextual information can cause errors in the BOM hierarchy, 
        # so they are classified without rephrasing.
        if input_text.endswith(':') and word_count <= 20:
            
            # Perform Classification
            pred_class, probs = classify_text(model, input_text, device)
            
            # Calculate rouge scores
            rouge_scores = calculate_rouge_scores_precision(input_text, input_text, rouge_scorer)
            
            # Append results using our new function
            append_result_to_list(
                results, 
                index, 
                input_text, 
                sentence=input_text, 
                pred_class=pred_class, 
                probs=probs, 
                rouge_scores=rouge_scores
            )

        # Case 2: Text with parents or long text - rephrasing, splitting and classification
        elif row['parents'] or word_count >= 20:
            
            # 1. Get context from parent claims if available
            n_parents = 1  # Number of parent claims to include
            input_context = ' '.join(row['parents'][-n_parents:]) if row['parents'] else ' '
            
            # 2. Rephrase text using context
            rephrased_text, rephrasing_prompt = prompt_chatgpt(input_text, input_context, rephrasing_with_context_prompt, CHATGPT_MODEL, TEMPERATURE, TOP_P)
            
            # 3. Split text into sub-sentences
            split_text, splitting_prompt = prompt_chatgpt(rephrased_text, input_context, splitting_with_context_prompt, CHATGPT_MODEL, TEMPERATURE, TOP_P)
            
            # Validate output format
            if not split_text:
                raise ValueError("Output is empty.")
    
            try:
                split_text = ast.literal_eval(split_text)
            except (SyntaxError, ValueError) as e:
                raise ValueError(f"Output not in list format: {e}")
            
            # 4. Process each sub-sentence
            for sent in split_text:
                # Classify sub-sentence
                pred_class, probs = classify_text(model, sent, device)
                
                # Calculate rouge scores
                rouge_scores = calculate_rouge_scores_precision(sent, input_text, rouge_scorer)
                
                # Append results using our new function
                append_result_to_list(
                    results, 
                    index, 
                    input_text, 
                    context=input_context,
                    rephrasing_prompt=rephrasing_prompt,
                    rephrased_text=rephrased_text,
                    splitting_prompt=splitting_prompt,
                    sentence=sent,
                    pred_class=pred_class, 
                    probs=probs, 
                    rouge_scores=rouge_scores
                )

        # Case 3: Simple sentences without parents (root sentence) - classification only
        else:
            # Perform Classification
            pred_class, probs = classify_text(model, input_text, device)
            
            # Calculate rouge scores
            rouge_scores = calculate_rouge_scores_precision(input_text, input_text, rouge_scorer)
            
            # Append results using our new function
            append_result_to_list(
                results, 
                index, 
                input_text, 
                sentence=input_text, 
                pred_class=pred_class, 
                probs=probs, 
                rouge_scores=rouge_scores
            )

    # Handle errors
    except Exception as e:
        print(f"Error processing claim {row['index']}: {str(e)}")
        append_result_to_list(
            results,
            index,
            input_text,
            error=str(e)
        )

# Create DataFrame from results and save to Excel
df_results = pd.DataFrame(results)
df_results.to_excel(f"/home/fantoni/patent-sentence-classification/results/claim_simplification/{filename}_{CHATGPT_MODEL}.xlsx", index=False)   
print(f"Results saved to /home/fantoni/patent-sentence-classification/results/claim_simplification/{filename}_{CHATGPT_MODEL}.xlsx")

Processing Claim Texts:   0%|          | 0/17 [00:00<?, ?it/s]

Processing Claim Texts: 100%|██████████| 17/17 [01:50<00:00,  6.49s/it]

Results saved to /home/fantoni/patent-sentence-classification/results/claim_simplification/US8695121B2_A42B3_gpt-4o.xlsx





# Prova Codice

In [None]:
# Initialize ROUGE scorer with various n-gram options
# reference: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore(rouge_keys = ('rouge1', 'rouge3', 'rouge5', 'rouge7', 'rouge9', 'rougeL'))

# Model configuration
#chatgpt_model ='gpt-4o'
chatgpt_model ='gpt-3.5-turbo'
temperature = 0
top_p = 1

results = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Sentences"):  
    start_time = time.time()              
    try:
        # ==========================================================================================================================
        # Split text into sub-sentences
        text = row['text']
        output_string, formatted_prompt, input_count, output_count = prompt_chatgpt(text, splitting_prompt, chatgpt_model, temperature, top_p)
        
        # Validate output format
        if not output_string:  
            raise ValueError(f"Output is empty.")
        try:
            output_string = ast.literal_eval(output_string)  
            print(f"Output is in list format.") 
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Output not in list format: {e}")

        for generated_sent in output_string:
            
            # Classify the text
            pred_class, probs = classify_text(model, generated_sent, device)
            
            # =========================================================================================================================
            # If mixed class, retry Splitting  and Classification 
            if pred_class == 'MIX':
                print('Found MIX sentence, retry splitting and classification ...')
                new_output_string, new_formatted_prompt, new_input_count, new_output_count = prompt_chatgpt(generated_sent, splitting_prompt, chatgpt_model, temperature, top_p)
                
                # Validate output format
                if not new_output_string:  
                    raise ValueError(f"Output is empty.")
                try:
                    new_output_string = ast.literal_eval(new_output_string)  
                    print(f"Output is in list format.") 
                except (SyntaxError, ValueError) as e:
                    raise ValueError(f"Output not in list format: {e}")
                
                for new_generated_sent in new_output_string:
                    # Classify the text
                    new_pred_class, new_probs = classify_text(model, new_generated_sent, device)

                    score = rouge(new_generated_sent, text)
                
                    results.append({
                        'text_id': row['text_id'],
                        'text': text,
                        'prompt': new_formatted_prompt,
                        'generated_sent': new_generated_sent,
                        'pred_sent_class': new_pred_class,
                        'probs': new_probs,
                        'rouge1_precision': round(score['rouge1_precision'].item(), 3),
                        'rouge3_precision': round(score['rouge3_precision'].item(), 3),
                        'rouge5_precision': round(score['rouge5_precision'].item(), 3),
                        'rouge7_precision': round(score['rouge7_precision'].item(), 3),
                        'rouge9_precision': round(score['rouge9_precision'].item(), 3),
                        'rougeL_precision': round(score['rougeL_precision'].item(), 3),
                        'output_string': new_output_string,
                        'input_count': new_input_count,
                        'output_count': new_output_count,
                        'errors': None,
                        'elapsed_time_sec': time.time() - start_time
                    })
            # =========================================================================================================================
            # Process non-MIX class directly
            else:
                score = rouge(generated_sent, text)
                
                results.append({
                    'text_id': row['text_id'],
                    'text': text,
                    'prompt': formatted_prompt,
                    'generated_sent': generated_sent,
                    'pred_sent_class': pred_class,
                    'probs': probs,
                    'rouge1_precision': round(score['rouge1_precision'].item(), 3),
                    'rouge3_precision': round(score['rouge3_precision'].item(), 3),
                    'rouge5_precision': round(score['rouge5_precision'].item(), 3),
                    'rouge7_precision': round(score['rouge7_precision'].item(), 3),
                    'rouge9_precision': round(score['rouge9_precision'].item(), 3),
                    'rougeL_precision': round(score['rougeL_precision'].item(), 3),
                    'output_string': output_string,
                    'input_count': input_count,
                    'output_count': output_count,
                    'errors': None,
                    'elapsed_time_sec': time.time() - start_time
                })

    # Process Errors =======================================================================================
    except Exception as e:
        print(f"Error processing sentence {row['text_id']}: {str(e)}")
        results.append({
            'text_id': row['text_id'],
            'text': text,
            'prompt': formatted_prompt,
            'generated_sent': None,
            'pred_sent_class': None,
            'probs': None,
            'rouge1_precision': None,
            'rouge3_precision': None,
            'rouge5_precision': None,
            'rouge7_precision': None,
            'rouge9_precision': None,
            'rougeL_precision': None,
            'output_string': output_string,
            'input_count': None,
            'output_count': None,
            'errors': str(e),
            'elapsed_time_sec': time.time() - start_time
        })

results_df = pd.DataFrame(results)
results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/mix_disambiguation/first_claim_{patent_id}_{IPC}_{chatgpt_model}.xlsx", index=False)
##results_df.to_excel(f"/home/fantoni/patent-sentence-classification/results/first_claim_{patent_id}_{IPC}_{chatgpt_model}_temp_{temperature}_top_{top_p}_asis.xlsx", index=False)

Processing Sentences:   0%|          | 0/11 [00:00<?, ?it/s]

Output is in list format.


Processing Sentences:   9%|▉         | 1/11 [00:09<01:31,  9.11s/it]

Output is in list format.


Processing Sentences:  18%|█▊        | 2/11 [00:19<01:27,  9.76s/it]

Output is in list format.


Processing Sentences:  27%|██▋       | 3/11 [00:28<01:16,  9.51s/it]

Output is in list format.


Processing Sentences:  36%|███▋      | 4/11 [00:36<01:00,  8.70s/it]

Output is in list format.


Processing Sentences:  45%|████▌     | 5/11 [00:42<00:47,  7.89s/it]

Output is in list format.


Processing Sentences:  55%|█████▍    | 6/11 [00:47<00:34,  6.96s/it]

Output is in list format.


Processing Sentences:  64%|██████▎   | 7/11 [00:54<00:27,  6.80s/it]

Output is in list format.


Processing Sentences:  73%|███████▎  | 8/11 [01:00<00:19,  6.56s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 8: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences:  82%|████████▏ | 9/11 [01:06<00:13,  6.63s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 9: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences:  91%|█████████ | 10/11 [01:14<00:06,  6.78s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 10: name 'split_sentence_chatgpt' is not defined
Output is in list format.


Processing Sentences: 100%|██████████| 11/11 [01:22<00:00,  7.46s/it]

Found MIX sentence, retry splitting and classification ...
Error processing sentence 11: name 'split_sentence_chatgpt' is not defined





In [None]:
# Generate Tree
tree = build_tree_with_parents(output_string)

# Create Dataframe from Tree
df = pd.DataFrame([
    {
        'text_id': idx + 1,
        'text': node['line'],
        'context': ' '.join(node['parents'])
    }
    for idx, node in enumerate(tree)
])

df