In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

import torch

print(torch.cuda.is_available())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
from natsort import natsorted
from lib.utils.promptLoader import PromptLoader

prompt = PromptLoader("resources/prompts/lightfootcat_prompt.yaml")
image_path = "resources/images/lightfootcat/images/cropped"
images = natsorted([os.path.join(image_path, i) for i in os.listdir(image_path)])
sample_images = images[:5]#np.random.choice(images, 5)

def plot_images(images):
    """Display a sequence of images side by side in a single row.

    Args:
        images: Iterable whose items are either a file path (``str``) that is
            read with ``cv2.imread`` or an already-loaded image array.
            Two-dimensional arrays are drawn as grayscale; any other array is
            converted from OpenCV's BGR channel order to RGB before display.

    Raises:
        FileNotFoundError: If a path cannot be read by ``cv2.imread``.
    """
    images = list(images)
    if not images:
        return

    # One axis per image (4 inches wide each, i.e. 20x5 for five images);
    # squeeze=False keeps `axes` two-dimensional even for a single image.
    fig, axes = plt.subplots(1, len(images), figsize=(4 * len(images), 5), squeeze=False)
    for ax, image in zip(axes[0], images):
        img = cv2.imread(image) if isinstance(image, str) else image
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {image}")
        if img.ndim == 2:
            # Single-channel input: COLOR_BGR2RGB rejects it, so draw it with
            # a gray colormap instead.
            ax.imshow(img, cmap="gray")
        else:
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.axis('off')
    plt.show()


In [None]:

plot_images(sample_images)

In [None]:
from lib.model.ocr_model import OCRModel

ocr = OCRModel(prompt=prompt)

In [None]:
# Run the OCR model on every raw sample page, one page per call.
sample_outs_without_preprocessing = [ocr([image_file]) for image_file in sample_images]

# Show each page followed by the text that was extracted from it.
for page_number, (image_file, ocr_output) in enumerate(
        zip(sample_images, sample_outs_without_preprocessing), start=1):
    rgb_page = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_page)
    plt.axis('off')
    plt.title(f"Image {page_number}")
    plt.show()
    print(f"Image {page_number} OCR Output:")
    print(ocr_output)
    print("\n" + "="*50 + "\n")

In [None]:
### Preprocessing the images
# Each step consumes the list produced by the previous step and plots the
# result, so the steps must run in this order. The intermediate lists are
# kept as globals.

print("Initial Images")
plot_images(sample_images)

print("Step 1: Grayscale Conversion")
# Read each page from disk and reduce it to a single-channel image.
gray_images = [cv2.cvtColor(cv2.imread(image), cv2.COLOR_BGR2GRAY) for image in sample_images]
plot_images(gray_images)

print("Step 2: Remove shadows")
# Gaussian adaptive threshold (block size 31, constant 10) binarises each
# pixel relative to its neighbourhood rather than to a global level.
thresh = lambda x: cv2.adaptiveThreshold(x, 255,
                               cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 31, 10)

# 3x3 median blur applied to the thresholded image.
median_filter = lambda x: cv2.medianBlur(x, 3)
shadows_removes = [median_filter(thresh(image)) for image in gray_images]
plot_images(shadows_removes)


print("Step 3: Noise Reductiion")
# Bilateral filter: diameter 9, sigmaColor 75, sigmaSpace 75.
denoised_images = [cv2.bilateralFilter(image, 9, 75, 75) for image in shadows_removes]
plot_images(denoised_images)

print("Setp 4: Binarization (black and White) via Otsu's method")
# cv2.threshold returns (threshold_value, image); only the image ([1]) is kept.
binarized_images = [cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] for image in denoised_images]
plot_images(binarized_images)

print("Step 5: Deskewing")
print("Skipping deskewing for now")
# Deskewing is disabled: the binarised images are passed through unchanged.
deskewed_images = binarized_images
#deskewed_images = [deskew(image) for image in binarized_images]
#plot_images(deskewed_images)

print("Step 6: Morphological opening (erosion followed by dilation)")
# NOTE(review): a 1x1 structuring element makes the opening an identity
# operation; a larger kernel is needed for this step to remove any specks.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,1))#np.ones((1, 1), np.uint8)
opened = lambda x: cv2.morphologyEx(x, cv2.MORPH_OPEN,
                                    kernel, iterations=1)
opened_images = [opened(image) for image in deskewed_images]
plot_images(opened_images)

print("Step 7: Optional dilation to thicken strokes")
# NOTE(review): dilation with a 1x1 kernel also leaves the image unchanged.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
processed = [cv2.dilate(i, kernel2, iterations=1) for i in opened_images]
plot_images(processed)

# Write the fully preprocessed pages to disk and remember their paths.
temp_dir = "temp/"
# cv2.imwrite does not create missing folders; it just returns False.
os.makedirs(temp_dir, exist_ok=True)
processod_images = []
for ind, i in enumerate(processed):
    fname = os.path.join(temp_dir, str(ind) + ".png")
    if not cv2.imwrite(fname, i):
        # Surface the failure instead of recording a path that does not exist.
        raise IOError(f"Failed to write preprocessed image to {fname}")
    processod_images.append(fname)
    print(fname)
    print("\n" + "="*50 + "\n")

In [None]:
# Run OCR on every preprocessed page (in-memory arrays), without the model's
# own text cleaning.
# NOTE(review): the raw pages were passed to `ocr(...)` as file paths, while
# arrays are passed to `ocr.extract_text(...)` here - confirm the model
# accepts both input kinds.
sample_outs_with_preprocessing = [ocr.extract_text([image], clean=False) for image in processed]

for i, image in enumerate(processed):
    # The preprocessed pages are single-channel; COLOR_BGR2RGB rejects such
    # input, so pick the conversion that matches the array's channel layout.
    if image.ndim == 2:
        img = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.axis('off')
    plt.title(f"Image {i+1}")
    plt.show()
    print(f"Image {i+1} OCR Output:")
    print(sample_outs_with_preprocessing[i])
    print("\n" + "="*50 + "\n")

In [None]:

index = 2
print(sample_outs_with_preprocessing[index])

In [None]:
import re

def clean_text(text):
    """Apply regex clean-up rules to the OCR text of a single page.

    The rules, in order:
      1. Strip surrounding whitespace.
      2. Remove every hyphen that is directly followed by line breaks, which
         re-joins words hyphenated across lines.
      3. Drop a leading running-header line (two patterns, matched
         case-insensitively and only at the very start of the text).
      4. Normalise a capitalised word followed by ``EAE``/``FAE``/``EAF`` so
         that it ends in ``EAE``.
      5. Collapse runs of two or more newlines into exactly one blank line.

    Args:
        text (str): Raw OCR text.

    Returns:
        str: The cleaned text.
    """
    text = text.strip()
    # Clean line breaks. This already covers letter-hyphen-newline-letter
    # word splits, so no separate rule for them is needed afterwards.
    text = re.sub(r"-\n+", "", text)
    # `flags=` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.I silently disables the flag.
    text = re.sub(r"^(\d*|[a-z]{0,3})\s?(Joh|Cat).*\n", "", text, flags=re.I)
    text = re.sub(r"^.*(l?o?gue|ghtfoot|foot)\s?(\d*)?\n", "", text, flags=re.I)
    text = re.sub(r"([A-Z]+)\s*(EAE|FAE|EAF)", r"\1EAE", text)

    # Add a newline before any indexing patterns like 1. or i. or a., but only if not part of a word ending
    # Ensure the pattern is preceded by whitespace or start of line, and not a letter (to avoid word endings)
    # exclusions = r'(?:e\.g\.|i\.e\.|etc\.|cf\.|vs\.)'
    # text = re.sub(rf'(?<![a-zA-Z0-9])\s+(?!{exclusions})(\d+\.)\n?', r'\n\1', text)
    # text = re.sub(rf'(?<![a-zA-Z0-9])\s+(?!{exclusions})([ivxlc]+\.)\n?', r'\n\1', text)
    # text = re.sub(rf'(?<![a-zA-Z0-9])\s+(?!{exclusions})([a-z]\.)\n?', r'\n\1', text)

    text = re.sub(r"\n\n+", "\n\n", text)
    return text


def full_clean_text(text_list: list[str]) -> list[str]:
    """Clean every entry of ``text_list`` with ``clean_text``.

    An entry may be a single string or a list of strings. Fragments made up
    only of newlines (or empty) are skipped; the remaining fragments are
    cleaned and concatenated without a separator.

    Returns:
        One cleaned string per entry, in input order.
    """
    newline_only = re.compile(r"^\n*$")
    results = []

    for entry in text_list:
        fragments = entry if isinstance(entry, list) else [entry]
        kept = [clean_text(piece) for piece in fragments if not newline_only.match(piece)]
        results.append("".join(kept))

    return results

In [None]:

# sample = sample_outs_with_preprocessing[index]
# sample = sample.split("\n\n\n\n\n")
# out = " ".join(full_clean_text(sample))
# print("Sample OCR Output:")
# print(sample)
# print("Cleaned OCR Output:")
# print(out)

In [None]:
def clean_n(x):
    """Replace each occurrence of five consecutive newlines with a blank line."""
    return x.replace("\n\n\n\n\n", "\n\n")


# Normalise the page-internal separators first, then apply the regex clean-up.
# This overwrites the raw OCR outputs in place.
sample_outs_with_preprocessing = [clean_text(clean_n(page)) for page in sample_outs_with_preprocessing]

In [None]:
# Join the pages with a line break: an empty separator would fuse the last
# word of one page with the first word of the next, while a single newline
# still lets a paragraph continue across a page boundary.
text = "\n".join(sample_outs_with_preprocessing)
overlap = 50            # number of trailing words repeated at the start of the next chunk
max_chunk_size = 2000   # maximum chunk length in characters (before overlap is added)
def _split_long_paragraph(paragraph, max_chunk_size):
    """Split one oversized paragraph into pieces of at most ``max_chunk_size`` characters.

    Lines are packed greedily in their original order. A single line that is
    itself longer than the limit is cut into consecutive slices so that no
    text is dropped.
    """
    pieces = []
    current_lines = []
    current_size = 0  # length of "\n".join(current_lines)

    for line in paragraph.split("\n"):
        # Hard-wrap only when a line cannot fit into a chunk on its own.
        segments = [line[start:start + max_chunk_size]
                    for start in range(0, len(line), max_chunk_size)] or [""]
        for segment in segments:
            needed = len(segment) + (1 if current_lines else 0)  # +1 for the "\n" joiner
            if current_lines and current_size + needed > max_chunk_size:
                pieces.append("\n".join(current_lines))
                current_lines, current_size = [], 0
                needed = len(segment)
            current_lines.append(segment)
            current_size += needed

    if current_lines:
        pieces.append("\n".join(current_lines))
    return pieces


def chunker(text, overlap=50, max_chunk_size=2000, add_overlap=True):
    """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

    Paragraphs (separated by blank lines) are packed greedily into chunks and
    joined with a blank line. A paragraph longer than ``max_chunk_size`` is
    split on line boundaries, in order, with each piece emitted as its own
    chunk. Whitespace-only paragraphs are skipped.

    Args:
        text (str): The text to split.
        overlap (int): Number of trailing whitespace-separated words of the
            previous chunk to prepend to each following chunk. Values <= 0
            disable the overlap.
        max_chunk_size (int): Maximum chunk length in characters, measured
            before the overlap prefix is added.
        add_overlap (bool): When False, chunks are returned without overlap.

    Returns:
        list[str]: The chunks in document order.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    separator = "\n\n"
    chunks = []
    pending = []       # paragraphs collected for the chunk being built
    pending_size = 0   # length of separator.join(pending)

    def flush():
        nonlocal pending, pending_size
        if pending:
            chunks.append(separator.join(pending))
        pending, pending_size = [], 0

    for paragraph in re.split(r"\n\s*\n", text):
        if not paragraph.strip():
            continue

        if len(paragraph) > max_chunk_size:
            # Too big for any chunk: close the current chunk and split by lines.
            flush()
            chunks.extend(_split_long_paragraph(paragraph, max_chunk_size))
            continue

        needed = len(paragraph) + (len(separator) if pending else 0)
        if pending_size + needed > max_chunk_size:
            flush()
            needed = len(paragraph)
        pending.append(paragraph)
        pending_size += needed

    flush()

    if not add_overlap or overlap <= 0:
        return chunks

    # Take the overlap from the un-prefixed chunks so prefixes never compound.
    tails = [" ".join(chunk.split()[-overlap:]) for chunk in chunks[:-1]]
    return chunks[:1] + [tail + " " + chunk for tail, chunk in zip(tails, chunks[1:])]


In [None]:
chunks = chunker(text, overlap=overlap, max_chunk_size=max_chunk_size, add_overlap=True)

In [None]:
chunks 

In [None]:
# System prompt for the OCR clean-up pass. The adjacent string literals are
# concatenated into one message with four numbered instruction groups:
# fix OCR errors, keep the structure, keep the flow, return only the text.
OCR_CLEANING_SYSTEM_PROMPT = (
    "You are an expert in cleaning OCR induced errors in the text. \n"
    "Follow the instructions below to clean the text, ensuring the text flows coherently with the previous context:\n"
    "1. Fix OCR induced typographical errors, such as incorrect characters or spacing.\n"
    "- Use provided context and common sense to identify and correct errors.\n"
    "- For example, 'l' and '1' or 'o' and '0' are often confused.\n"
    "- Ensure that the text is grammatically correct and coherent.\n"
    "- Remove any unnecessary line breaks or extra spaces.\n"
    "- Identify and correct word splits and line breaks.\n"
    "- Only fix clear OCR errors. DO NOT ALTER THE CONTEXT OR MEANING of the text.\n"
    "- DO NOT add any generated text, punctuation, or capitalization.\n"
    "2. Ensure structure is maintained.\n"
    "- Maintain original structure, including paragraphs and line breaks.\n"
    "- Preserve the original content. \n"
    "- Keep all importatnt information intact.\n"
    "- DO NOT add any new text not present in the text. \n"
    "3. Ensure flow and coherence.\n"
    "- Ensure the text flows naturally and coherently.\n"
    "- Use provided context to ensure the text makes sense.\n"
    "- HANDLE text that starts or ends mid-sentence correctly. \n\n"
    "4. Return ONLY the cleaned text.\n"
    "- Do not add any additional information, explanations, or thoughts.\n"
    "- Do not include your thoughts, explanations, or steps.\n"
    "- Do not add any new text not present in the text.\n"
)
# Builds the user message for one chunk: the previous context first, then the
# text to clean, ending with a "Cleaned text:" cue.
OCR_CLEANING_PROMPT = lambda context, text: (
    # "IMPORTATANT: RETURN ONLY THE CLEANED TEXT. Preserve the orignial structure and content. Do not add anything else. Do not include your thoughts, explantions or steps.\n\n"
    f"Previous context:\n {context}\n\n"
    f"Text to clean:\n {text}\n\n"
    "Cleaned text:\n"
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# `temperature` is a generation-time sampling setting, not a model-loading
# option, and 0.0 is not a usable sampling temperature. Decoding options
# belong on the `generate()` call instead.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

In [None]:
def ocr_clean(system_message, message, model, tokenizer):
    """Send one system + user exchange to a chat model and return its reply.

    Args:
        system_message (str): Content of the system turn.
        message (str): Content of the user turn.
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``batch_decode``.

    Returns:
        str: The decoded text generated for the first (only) sequence, with
        the prompt tokens removed.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": message},
    ]

    # add_generation_prompt makes templates that use an assistant header end
    # the prompt with it, so the model answers instead of continuing the user turn.
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    print("Inputs generated")
    print("Input size:", inputs.input_ids.shape)
    # Allow as many new tokens as the prompt has; the prompt contains the text
    # to clean, so the cleaned text fits within that budget.
    max_tokens = inputs.input_ids.shape[1]
    print("Max tokens:", max_tokens)

    inputs = inputs.to(model.device)

    # Greedy decoding: the clean-up should be deterministic.
    outputs = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)
    print("Outputs generated")

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(inputs.input_ids, outputs)
    ]
    final_out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print("Final Output Generated")

    return final_out[0]

In [None]:
# Clean the chunks one after another; the tail of each cleaned chunk is fed
# to the next request as "previous context".
outs = []
context = ""
total_chunks = len(chunks)
for chunk_number, chunk in enumerate(chunks, start=1):
    print("========================================================================")
    print(f"Chunk {chunk_number}/{total_chunks}")
    message = OCR_CLEANING_PROMPT(context, chunk)
    print("Message generated")
    out = ocr_clean(OCR_CLEANING_SYSTEM_PROMPT, message, model, tokenizer)
    print("Output generated")
    outs.append(out)
    # Slicing already returns the whole string when it is 500 characters or shorter.
    context = out[-500:]
    print("========================================================================")
    gc.collect()

In [None]:
from fuzzywuzzy import fuzz, process

In [None]:
import re
import string
from fuzzywuzzy import fuzz

def extract_tokens_and_spans(text: str) -> tuple[list[str], list[tuple[int, int]]]:
    """
    Tokenise ``text`` on whitespace and report where each token came from.

    Every run of non-whitespace characters is a candidate token. Leading and
    trailing ASCII punctuation is stripped from it; candidates that become
    empty (pure punctuation) are discarded. The span recorded for a token is
    the position of the *unstripped* run in ``text``.

    Args:
        text (str): The input text.

    Returns:
        tuple[list[str], list[tuple[int, int]]]: The stripped tokens and, in
        the same order, the (start, end) character span of each original run.
    """
    candidates = [
        (found.group().strip(string.punctuation), found.span())
        for found in re.finditer(r'\S+', text)
    ]
    kept = [(token, span) for token, span in candidates if token]
    tokens = [token for token, _ in kept]
    spans = [span for _, span in kept]
    return tokens, spans


def merge_chunks_fuzzy(chunk_a, chunk_b,
                       overlap_words=50,
                       window_size=10,
                       threshold=90):
    """
    Join two chunks, dropping the start of ``chunk_b`` when it repeats the
    end of ``chunk_a``.

    Window sizes from ``int(overlap_words * 1.2 + window_size)`` down to 1 are
    tried. For each size ``w`` the last ``w`` cleaned tokens of ``chunk_a``
    are compared with the first ``w + 1`` cleaned tokens of ``chunk_b`` using
    ``fuzz.partial_ratio``; the largest window that reaches the highest ratio
    wins. If that ratio is at least ``threshold``, the first ``w`` tokens of
    ``chunk_b`` are cut away on the ORIGINAL string (so whitespace and
    punctuation after the cut are preserved); otherwise ``chunk_b`` is kept
    whole. Either way the two parts are joined with a blank line.
    """
    left_tokens = extract_tokens_and_spans(chunk_a)[0]
    right_tokens, right_spans = extract_tokens_and_spans(chunk_b)

    largest_window = int((overlap_words * 1.2) + window_size)
    best_ratio = 0
    best_i = 0
    for window in range(largest_window, 0, -1):
        tail = " ".join(left_tokens[-window:])
        head = " ".join(right_tokens[:window + 1])
        ratio = fuzz.partial_ratio(tail, head)
        # Strict comparison: on ties the larger (earlier) window is kept.
        if ratio > best_ratio:
            best_ratio, best_i = ratio, window

    if best_ratio < threshold:
        # No confident overlap: concatenate in full.
        return chunk_a + "\n\n" + chunk_b

    # Translate the token index into a character offset in chunk_b; a window
    # reaching past the last token means all of chunk_b is overlap.
    if best_i < len(right_spans):
        cut_pos = right_spans[best_i][0]
    else:
        cut_pos = len(chunk_b)
    return chunk_a + "\n\n" + chunk_b[cut_pos:]


def merge_sentences(sents, overlap_words=50, window_size=10, threshold=90):
    """Merge a sequence of overlapping chunks into one text, left to right.

    Each chunk is fuzzy-merged onto the text accumulated so far with
    ``merge_chunks_fuzzy``.

    Args:
        sents: Non-empty sequence of text chunks in document order.
        overlap_words (int): Expected overlap size, forwarded to ``merge_chunks_fuzzy``.
        window_size (int): Extra search margin, forwarded to ``merge_chunks_fuzzy``.
        threshold (int): Minimum fuzzy ratio for an overlap to be removed,
            forwarded to ``merge_chunks_fuzzy``.

    Returns:
        str: The merged text (the single chunk itself when only one is given).

    Raises:
        AssertionError: If ``sents`` is empty.
    """
    assert len(sents) > 0, "No sentences to merge"

    merged = sents[0]
    for chunk in sents[1:]:
        # Forward the caller's settings rather than fixed defaults.
        merged = merge_chunks_fuzzy(
            merged,
            chunk,
            overlap_words=overlap_words,
            window_size=window_size,
            threshold=threshold,
        )
    return merged

In [None]:
"Hello"[0:]
"Hello"[:1]

In [None]:
final_merge = merge_sentences(outs, overlap_words=overlap, window_size=10, threshold=90)

In [None]:
print(final_merge)

In [None]:
from lib.data_processing.text_processing import TextProcessor
divisions = ["Dicotyledones", "Monocotyledones", "Pteridophyta", "Hepaticae", "Algae"]
text_processor = TextProcessor()
#text_blocks = text_processor.make_text_blocks(text_structure)

In [None]:
c_text = text_processor.preprocess_text(final_merge, divisions[0])
print(c_text)

In [None]:
div_split = text_processor.split_by_divisions(c_text, divisions)
print(div_split)

In [None]:
# Split the "Dicotyledones" division into one record per family heading:
# each record holds the heading and the text up to the next heading.
sample = div_split["Dicotyledones"]

finds = re.finditer(text_processor.family_regex, sample)

find_matches = list(finds)
text_chunks = []

for idx, family_match in enumerate(find_matches):
    # Reduce the matched heading to its run of capital letters.
    match = re.sub(r"[.\n\t,]*\s*([A-Z]+)\s*[.\n\t,]*", r"\1", family_match.group())
    start = family_match.end()
    end = find_matches[idx + 1].start() if idx + 1 < len(find_matches) else None
    # A falsy `end` means "until the end of the division text".
    text_chunk = sample[start:(end or None)]
    text_chunks.append({"family": match, "text": text_chunk})

In [None]:
text_chunks

In [None]:
for i in text_chunks:
    print("Family: {0} ==> {1}".format(i["family"], len(i["text"])))

In [None]:
# print("="*50)  # Separator for readability

# family_chunk_overlap = 100
# family_chunk_size = 2000
# for i in range(len(text_chunks)):

#     if len(text_chunks[i]['text']) < max_chunk_size:
#         text_chunks[i]['chunks'] = [text_chunks[i]['text']]
#         print("Chunk is smaller than max_chunk_size, skipping chunking.")
#         print("="*50)
#         continue
    
#     print("Found chunk larger than max_chunk_size: {0} characters".format(len(text_chunks[i]['text'])))
#     print(f"Family: {text_chunks[i]['family']}")
#     print(f"Text: {text_chunks[i]['text'][:100]}...")  # Print first 100 characters
#     print("="*50)  # Separator for readability

#     new_chunks = chunker(text_chunks[i]['text'], overlap=family_chunk_overlap, max_chunk_size=family_chunk_size, add_overlap=False)

#     text_chunks[i]['chunks'] = new_chunks

In [None]:
text_chunks

In [None]:
# # MARKDOWN_SYSTEM_PROMPT = (
# #     "You are an expert in converting text to markdown format. \n"
# #     "Follow the instructions below to convert the text to markdown format:\n"
# #     "1. Convert the text to markdown format.\n"
# #     "- Preserve original content, heading. Add a blank line before and after each heading.\n"
# #     "- If headers are capitalised, keep them capitalised.\n"
# #     "- Use appropriate markdown syntax for headings, lists, and other elements.\n"
# #     "- Ensure that the text is properly formatted and easy to read.\n"
# #     "- Use appropriate markdown syntax for links, images, and other elements.\n"
# #     "2. Remove any content that may have been added by the LLM and was not present in the original text.\n"
# #     "- Remove any unnecessary line breaks or extra spaces.\n"
# #     "- Identify and correct word splits and line breaks.\n"
# #     "3. Preserve all original content. \n"
# #     "4. Ensure structure is maintained.\n"
# #     "- Maintain original structure, including paragraphs and line breaks.\n"
# #     "- Preserve the original content. \n"
# #     "- Keep all important information intact.\n"
# #     "- DO NOT add any new text not present in the text. \n\n"
# #     )

# MARKDOWN_SYSTEM_PROMPT = (
#     "You are an expert in converting text to JSON Lines.\n"
#     "I have a botanical catalogue where each record starts with a species name line, then a number of folders, then items.\n"
#     "Please parse the following into a JSON Lines (NDJSON) stream. Each object should have:\n"
#     "- `species`: the species name (the first line of the record)\n"
#     "- `folders`: a list of folder names (the lines after the species name and before the items)\n"
#     "Ensure the following instructions are followed:\n"
#     "1. Parse into JSON Lines.\n"
#     "- Ensure each record is a valid JSON object.\n"
#     "- Each record should have the keys `species` and `folders`.\n"
#     "- The `species` key should contain the species name.\n"
#     "- The `folders` key should lines of text under the species.\n"
#     "- Do not add any new text not present in the text.\n"
#     "- Collect the lines of text under the species name as a list of strings and store in `folders`.\n"
#     "- Use common sesne and provided context to ensure the text makes sense.\n\n"
#     "2. Ensure flow and coherence.\n"
#     "- Ensure the text flows naturally and coherently.\n"
#     "- Use provided context to ensure the text makes sense.\n"
#     "- HANDLE text that starts or ends mid-sentence correctly. \n\n"
#     "3. Preserve all original content. \n"
#     "- Preserve original content, headings, and structure.\n"
#     "- Do not add any new text not present in the text.\n"
#     "- Keep all important information intact.\n"
#     "- DO NOT add any new text not present in the text. \n\n"
#     "4. ONLY return the JSON Lines output.\n"
#     "- Do not add any additional information, explanations, or thoughts.\n"
#     "- Do not include your thoughts, explanations, or steps.\n"
#     "- Do not add any new text not present in the text.\n"
#     )

# MARKDOWN_PROMPT = lambda text: (
#     f"Parse into JSON lines:\n {text}\n\n"
# )

In [None]:
# for i, chunk in enumerate(text_chunks):
#     print("========================================================================")
#     print(f"Processing chunk for family: {chunk['family']}")

#     for j, sub_chunk in enumerate(chunk['chunks']):
#         print(f"Processing chunk {j+1}/{len(chunk['chunks'])}")
#         message = MARKDOWN_PROMPT(sub_chunk)
#         print("Message generated")
#         out = ocr_clean(MARKDOWN_SYSTEM_PROMPT, message, model, tokenizer)
#         print("Output generated")

#         if 'chunks_cleaned' not in text_chunks[i]:
#             text_chunks[i]['chunks_cleaned'] = []
#         text_chunks[i]['chunks_cleaned'].append(out)
#     print("========================================================================")
#     gc.collect()

In [None]:
text_chunks

In [None]:
index = 6

In [None]:
#!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.0/en_core_eco_md-1.0.2.tar.gz

In [None]:
from taxonerd import TaxoNERD

class SpeciesChunker:
    """Split catalogue text into per-species records and group them by size.

    Species names are detected with a TaxoNERD pipeline; a line that starts
    with one of the detected names (optionally preceded by an index such as
    ``12.`` or a dash) opens a new record.
    """

    # Shape a detected entity must have to count as a species name: a
    # capitalised word, a lower-case word, then optional trailing text.
    SPECIES_REGEX = r"([A-Z][a-z]+(?: [a-z]+)\s?(?:[a-zA-Z\[\]\(\)\.\s\,]+)?)"

    def __init__(self, threshold=90):
        """Store the settings; the NER pipeline itself is loaded lazily.

        Args:
            threshold: Value forwarded to ``TaxoNERD.load(threshold=...)``.
                NOTE(review): confirm the scale TaxoNERD expects here.
        """
        self.threshold = threshold
        self.nlp = None

    def load(self):
        """Load the ``en_core_eco_md`` TaxoNERD pipeline into ``self.nlp``.

        Raises:
            RuntimeError: If the pipeline has already been loaded.
        """
        if self.nlp is not None:
            raise RuntimeError("Chunker is already loaded. Please create a new instance to load again.")

        taxonerd = TaxoNERD(prefer_gpu=False)
        self.nlp = taxonerd.load("en_core_eco_md", exclude=[], threshold=self.threshold)

    def chunk_species(self, text: str, verbose: bool = False) -> list[str]:
        """
        Chunk the text into species records using TaxoNERD.

        Args:
            text: Text to split; records are detected line by line.
            verbose: When True, print each record as it is completed.

        Returns:
            A list of strings, one per record. Each record is its non-empty,
            stripped lines joined by newlines. Lines before the first
            detected species form a record of their own. When no valid
            species name is detected, all text is returned as one record.
        """
        if self.nlp is None:
            print("Chunker is not loaded. Loading Chunker...")
            self.load()

        doc = self.nlp(text)

        # De-duplicate while keeping first-seen order.
        valid_names = list(dict.fromkeys(
            ent.text for ent in doc.ents if re.match(self.SPECIES_REGEX, ent.text)
        ))

        # With no names the alternation would be empty and match every line,
        # so in that case no line is treated as a record start.
        record_start = None
        if valid_names:
            alternation = "|".join(re.escape(name) for name in valid_names)
            record_start = re.compile(rf"^(?:[0-9]+\.\s?)?(?:\s+|-)?(?:{alternation})")

        records = []
        current_lines = []

        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                continue
            if record_start is not None and record_start.match(line):
                if current_lines:
                    records.append("\n".join(current_lines))
                    if verbose:
                        print("Chunk added:\n", records[-1])
                        print("=" * 50)
                current_lines = [stripped]
            else:
                current_lines.append(stripped)

        if current_lines:
            records.append("\n".join(current_lines))

        return records

    def group_into_major_chunks(self, chunks: list[str], max_chunk_size: int = 2000) -> list[str]:
        """Pack consecutive records into groups of at most ``max_chunk_size`` characters.

        Records are stripped, joined with a blank line and never split: a
        single record longer than ``max_chunk_size`` becomes a group of its
        own. Empty records are ignored, so no empty group is ever produced.

        Args:
            chunks: Records in document order.
            max_chunk_size: Maximum length of a joined group, in characters.

        Returns:
            The groups in document order.
        """
        separator = "\n\n"
        major_chunks = []
        pending = []
        pending_size = 0  # length of separator.join(pending)

        for chunk in chunks:
            piece = chunk.strip()
            if not piece:
                continue
            needed = len(piece) + (len(separator) if pending else 0)
            if pending and pending_size + needed > max_chunk_size:
                major_chunks.append(separator.join(pending))
                pending, pending_size = [], 0
                needed = len(piece)
            pending.append(piece)
            pending_size += needed

        if pending:
            major_chunks.append(separator.join(pending))

        return major_chunks


In [None]:
chunker = SpeciesChunker(threshold=70)

chunker.load()

In [None]:
for i, text_chunk in enumerate(text_chunks):

    chunks = chunker.chunk_species(text_chunk["text"])
    text_chunks[i]["species_chunks"] = chunker.group_into_major_chunks(chunks, max_chunk_size=2000)

In [None]:
text_chunks

In [1]:
import spacy

In [5]:
nlp = spacy.load("en_core_eco_md")

In [6]:
a = """
Dicotyledones
ACERACRAE
Acer campestre L.
1 folder. Acer campestre [TA]
Acer pseudoplatanus L.
2 folders. ot
Folder 1. Acer Pseudo-Platanus
[G]. i. “Maple. Bulls: [Bulstrode]

Park” [JL]
Folder 2. Acer Pseudo-Platanus
[TA].
"""

In [8]:
doc = nlp(a)

In [28]:
doc.get_lca_matrix().shape

(64, 64)

In [31]:
doc.ents[0].text

'Dicotyledones'