In [1]:
import math

input_text = """Butterflies are winged insects from the lepidopteran suborder Rhopalocera, characterized by large, often brightly coloured wings that often fold together when at rest, and a conspicuous, fluttering flight. The group comprises the superfamilies Hedyloidea (moth-butterflies in the Americas) and Papilionoidea (all others). The oldest butterfly fossils have been dated to the Paleocene, about 56 million years ago, though they likely originated in the Late Cretaceous, about 101 million years ago.[1]

Butterflies have a four-stage life cycle, and like other holometabolous insects they undergo complete metamorphosis.[2] Winged adults lay eggs on the food plant on which their larvae, known as caterpillars, will feed. The caterpillars grow, sometimes very rapidly, and when fully developed, pupate in a chrysalis. When metamorphosis is complete, the pupal skin splits, the adult insect climbs out, expands its wings to dry, and flies off.

Some butterflies, especially in the tropics, have several generations in a year, while others have a single generation, and a few in cold locations may take several years to pass through their entire life cycle.[3]

Butterflies are often polymorphic, and many species make use of camouflage, mimicry, and aposematism to evade their predators.[4] Some, like the monarch and the painted lady, migrate over long distances. Many butterflies are attacked by parasites or parasitoids, including wasps, protozoans, flies, and other invertebrates, or are preyed upon by other organisms. Some species are pests because in their larval stages they can damage domestic crops or trees; other species are agents of pollination of some plants. Larvae of a few butterflies (e.g., harvesters) eat harmful insects, and a few are predators of ants, while others live as mutualists in association with ants. Culturally, butterflies are a popular motif in the visual and literary arts. The Smithsonian Institution says "butterflies are certainly one of the most appealing creatures in nature".[5] """

In [2]:
# import nltk
# nltk.download('punkt_tab')

In [3]:
max_chunck_length = 400
import nltk.data

pkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = pkt_tokenizer.tokenize(input_text)

merged_sentences = []

for i in range(len(sentences)):
    sentence = sentences[i]
    if i >= 1 and len(merged_sentences[-1]) + len(sentence) <= max_chunck_length:
        merged_sentences[-1] += " " + sentence
    else:
        if i >= 1:
            merged_sentences.append(sentences[i - 1])
        merged_sentences.append(sentence)

merged_sentences

['Butterflies are winged insects from the lepidopteran suborder Rhopalocera, characterized by large, often brightly coloured wings that often fold together when at rest, and a conspicuous, fluttering flight. The group comprises the superfamilies Hedyloidea (moth-butterflies in the Americas) and Papilionoidea (all others).',
 'The group comprises the superfamilies Hedyloidea (moth-butterflies in the Americas) and Papilionoidea (all others).',
 'The oldest butterfly fossils have been dated to the Paleocene, about 56 million years ago, though they likely originated in the Late Cretaceous, about 101 million years ago. [1]\n\nButterflies have a four-stage life cycle, and like other holometabolous insects they undergo complete metamorphosis. [2] Winged adults lay eggs on the food plant on which their larvae, known as caterpillars, will feed.',
 '[2] Winged adults lay eggs on the food plant on which their larvae, known as caterpillars, will feed.',
 'The caterpillars grow, sometimes very rapi

In [4]:
if True:
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="Gemma2/gemma2_2b_context",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2060. Max memory: 6.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [5]:
from dataclasses import dataclass
import re

# alpaca_prompt = You MUST copy from above!

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Extract the most confident information in the sentence below as much as possible, and express the relationships in RDF Triples that complement the existing RDF triples. Do not use information from common sense.
### Existing RDF triples:
{}
### Input:
{}
### Response:
{}"""


@dataclass
class Triple:
    subject: str
    predicate: str
    object: str
    id: int

    def __eq__(self, other: "Triple"):  # Python type hinting sucks
        return self.subject == other.subject and self.predicate == other.predicate and self.object == other.object

    def __str__(self):
        return f"<T>{self.subject}<R>{self.predicate}<S>{self.object}"


def generate_rdf(context: list[Triple], text: str, id: int) -> list[Triple]:
    # context pre-processing
    if len(context) == 0:
        context_str = "None"
    else:
        context_str = " ".join(str(con) for con in context)

    inputs = tokenizer(
        [
            prompt.format(
                context_str,
                text,  # input
                "",  # output - leave this blank for generation!
            )
        ], return_tensors="pt").to("cuda")
    print(f"Processing \"{text[:50]}...{text[-10:]}\"")
    outputs = model.generate(**inputs, max_new_tokens=max(len(text) + 100, 700), use_cache=True)
    response = tokenizer.batch_decode(outputs)
    response = response[0].replace('\n', '')
    rdf_string = response.split("### Response:")[1]
    print(f"Done!")
    print(rdf_string)
    # convert rdf string to list
    rdfs = []
    rdf_string = rdf_string.removeprefix("<bos>").removesuffix("<eos>")
    for _triple in rdf_string.split("<T>"):
        print(_triple)
        try:
            if _triple == "":
                continue
            split = re.split("<S>|<R>", _triple)
            subject = split[0]
            predicate = split[1]
            _object = split[2]

            new_triple = Triple(subject, predicate, _object, id)

            if not (any(con == new_triple for con in context) or any(con == new_triple for con in rdfs)):
                rdfs.append(new_triple)

        except Exception as e:
            print(f"NON-STANDARD TRIPLE {_triple} ({e})")
            continue
    print("DONE")
    return rdfs


In [6]:
triples = []
for idx, m_sentence in enumerate(merged_sentences):
    print(f"PROCESSING ({idx + 1}/{len(merged_sentences)})")
    triples += generate_rdf(triples, m_sentence, idx)
    # print(triples)

triples

PROCESSING (1/11)
Processing "Butterflies are winged insects from the lepidopter...l others)."
Done!
<T>Butterflies<R>order<S>Lepidoptera<T>Lepidoptera<R>class<S>Insecta<T>Butterflies<R>order<S>Rhopalocera<T>Rhopalocera<R>characteristic<S>Large_wings<T>Butterflies<R>order<S>Papilionoidea<T>Butterflies<R>class<S>Insecta<T>Butterflies<R>order<S>Hedyloidea<eos>

Butterflies<R>order<S>Lepidoptera
Lepidoptera<R>class<S>Insecta
Butterflies<R>order<S>Rhopalocera
Rhopalocera<R>characteristic<S>Large_wings
Butterflies<R>order<S>Papilionoidea
Butterflies<R>class<S>Insecta
Butterflies<R>order<S>Hedyloidea
DONE
PROCESSING (2/11)
Processing "The group comprises the superfamilies Hedyloidea (...l others)."
Done!
<T>Butterflies<R>superfamily<S>Papilionoidea<T>Butterflies<R>order<S>Hedyloidea<T>Hedyloidea<R>region<S>Americas<eos>

Butterflies<R>superfamily<S>Papilionoidea
Butterflies<R>order<S>Hedyloidea
Hedyloidea<R>region<S>Americas
DONE
PROCESSING (3/11)
Processing "The oldest butterfly fossils hav

[Triple(subject='Butterflies', predicate='order', object='Lepidoptera', id=0),
 Triple(subject='Lepidoptera', predicate='class', object='Insecta', id=0),
 Triple(subject='Butterflies', predicate='order', object='Rhopalocera', id=0),
 Triple(subject='Rhopalocera', predicate='characteristic', object='Large_wings', id=0),
 Triple(subject='Butterflies', predicate='order', object='Papilionoidea', id=0),
 Triple(subject='Butterflies', predicate='class', object='Insecta', id=0),
 Triple(subject='Butterflies', predicate='order', object='Hedyloidea', id=0),
 Triple(subject='Butterflies', predicate='superfamily', object='Papilionoidea', id=1),
 Triple(subject='Hedyloidea', predicate='region', object='Americas', id=1),
 Triple(subject='Butterflies', predicate='lifeCycleStage', object='Egg', id=2),
 Triple(subject='Butterflies', predicate='order', object='Holometabola', id=2),
 Triple(subject='Butterflies', predicate='lifeCycleStage', object='Caterpillar', id=2),
 Triple(subject='Butterflies', pre

In [9]:
from pyvis.network import Network

net = Network(bgcolor="#222222", font_color="white", notebook=True, directed=True)


# Parse rdf_strings

def add_triples(rdf: Triple, color: str):
    net.add_node(rdf.subject, color=color)
    net.add_node(rdf.object, color=color)
    # if not any(edge['from'] == rdf.subject and edge['to'] == rdf.object and edge['title'] == rdf.predicate for edge in net.edges): # should be deprecated later
    net.add_edge(rdf.subject, rdf.object, title=rdf.predicate, color=color)


import random

r = lambda: random.randint(0, 255)
net.toggle_physics(True)

colors = {}

for idx, triple in enumerate(triples):
    if triple.id in colors:
        color = colors[triple.id]
    else:
        color = '#%02X%02X%02X' % (r(), r(), r())
        colors[triple.id] = color

    add_triples(triple, color)
    # net.show(f"{idx}.html", notebook=False)

print(net.nodes)

[{'color': '#B7CA30', 'id': 'Butterflies', 'label': 'Butterflies', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Lepidoptera', 'label': 'Lepidoptera', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Insecta', 'label': 'Insecta', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Rhopalocera', 'label': 'Rhopalocera', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Large_wings', 'label': 'Large_wings', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Papilionoidea', 'label': 'Papilionoidea', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#B7CA30', 'id': 'Hedyloidea', 'label': 'Hedyloidea', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#3B7AD8', 'id': 'Americas', 'label': 'Americas', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#35B56C', 'id': 'Egg', 'label': 'Egg', 'shape': 'dot', 'font': {'color': 'white'}}, {'color': '#35B56C', 'id': '

In [10]:
net.toggle_physics(True)
#save the HTML instead of show the html

# from IPython.core.display import display
net.show("network.html")

network.html


# Dev: check similarity

In [14]:
node_names = [n["id"] for n in net.nodes]
node_names

['Butterflies',
 'Lepidoptera',
 'Insecta',
 'Rhopalocera',
 'Large_wings',
 'Papilionoidea',
 'Hedyloidea',
 'Americas',
 'Egg',
 'Holometabola',
 'Caterpillar',
 'Paleozoic',
 'Cretaceous_period',
 'Winged_insect',
 'Food_plant',
 'Pupation',
 '"rapidly"',
 'Adult',
 'Adult_insect',
 'Wings',
 'Climbing',
 'Complete_metamorphosis',
 'Tropical_rainforest',
 '"Several generations in a year, or a single generation, depending on the location"',
 '"Several years in cold locations"',
 'Aposematism',
 'Mimicry',
 'True',
 'None',
 '"None"',
 'Parasitoid',
 'Wasp',
 'Flies',
 'Other_invertebrates',
 'Protozoan',
 'Domestic_crop',
 'Other_insect',
 'Tree',
 'Plant',
 'Crop',
 'Coleoptera',
 'Arthropoda',
 'Diptera',
 'Hymenoptera',
 'Diplura',
 'Pollination',
 'Coleoptera ',
 'Ant',
 'Hymenoptera ',
 'Diplura ',
 'Diptera ',
 'Lepidoptera ',
 'Insecta ',
 'Arthropoda ',
 'Winged_insect ',
 'Papilionoidea ',
 'Rhopalocera ',
 'Paleozoic ',
 'Cretaceous_period ',
 'Holometabola ',
 'Diptera <']

In [20]:
# mydifflib.py
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest

def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return a list of the indexes of the best 
    "good enough" matches. word is a sequence for which close matches 
    are desired (typically a string).
    possibilities is a list of sequences against which to match word
    (typically a list of strings).
    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.
    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.
    """

    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for idx, x in enumerate(possibilities):
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), idx))

    # Move the best scorers to head of list
    result = _nlargest(n, result)

    # Strip scores for the best n matches
    return [x for score, x in result]

for idx, name in enumerate(node_names):
    # nodes_other = [x for x in node_names if x != name]
    indexes = get_close_matches_indexes(name, node_names)
    indexes.remove(idx)
    if indexes == []:
        continue
    print(f"{name}({idx}) | {[f'{node_names[i]}({i})' for i in indexes]}")

Lepidoptera(1) | ['Lepidoptera (51)', 'Diptera(42)']
Insecta(2) | ['Insecta (52)']
Rhopalocera(3) | ['Rhopalocera (56)']
Papilionoidea(5) | ['Papilionoidea (55)']
Holometabola(9) | ['Holometabola (59)']
Paleozoic(11) | ['Paleozoic (57)']
Cretaceous_period(12) | ['Cretaceous_period (58)']
Winged_insect(13) | ['Winged_insect (54)', 'Other_insect(36)']
Pupation(15) | ['Pollination(45)']
Adult_insect(18) | ['Other_insect(36)', 'Winged_insect(13)']
True(27) | ['Tree(37)']
None(28) | ['"None"(29)']
"None"(29) | ['None(28)']
Other_insect(36) | ['Adult_insect(18)', 'Other_invertebrates(33)']
Tree(37) | ['True(27)']
Coleoptera(40) | ['Coleoptera (46)', 'Hymenoptera(43)']
Arthropoda(41) | ['Arthropoda (53)']
Diptera(42) | ['Diptera (50)', 'Diptera <(60)']
Hymenoptera(43) | ['Hymenoptera (48)', 'Coleoptera(40)']
Diplura(44) | ['Diplura (49)', 'Diptera(42)']
Pollination(45) | ['Pupation(15)']
Coleoptera (46) | ['Coleoptera(40)', 'Lepidoptera (51)']
Hymenoptera (48) | ['Hymenoptera(43)', 'Coleopter