In [None]:
import math

# Sample passage (three paragraphs of English prose about Typhoon Yagi) that the
# cells below split into sentences and chunk before triple extraction.
input_text = """Typhoon Yagi, known in the Philippines as Severe Tropical Storm Enteng, was a powerful and destructive tropical cyclone which impacted the Philippines, China and Vietnam in early September 2024. Yagi, which means goat or the constellation of Capricornus in Japanese, is the eleventh named storm, the first violent typhoon and Category 5 storm of the annual typhoon season. It was one of the most intense typhoons ever to strike northern Vietnam, the strongest typhoon to strike Hainan during the meteorological autumn and the strongest since Rammasun in 2014. It is one of only four Category 5 super typhoons recorded in the South China Sea, alongside Pamela in 1954, Rammasun in 2014 and Rai in 2021.

Yagi originated from a low-pressure area that formed on August 30, approximately 540 km (330 mi) northwest of Palau. On September 1, the system was classified as a tropical storm and named Yagi by the Japan Meteorological Agency (JMA). After making landfall over Casiguran, Aurora in the Philippines, on September 2, Yagi weakened as it moved inland through the rugged terrain of the Cordillera Central of Luzon. It later emerged over the South China Sea and began merging with a secondary circulation west of Lingayen Gulf, with its deep convection starting to wrap and develop convective bands extending west and south. On September 5, the JMA reported that the storm reached its peak intensity with ten-minute sustained winds of 195 km/h (120 mph) and a central pressure of 915 hPa (27.02 inHg). It subsequently peaked as a Category 5-equivalent super typhoon on the Saffir-Simpson scale, with one-minute sustained winds of 260 km/h (160 mph). After weakening during an eyewall replacement cycle, Yagi slightly restrengthened before making landfall near Wenchang in China's Hainan Province on September 6. Yagi passed over northern Hainan and directly over Haikou, before briefly making landfall over Xuwen County in mainland Guangdong Province and moving into the open waters of the Gulf of Tonkin. It made landfall over Haiphong and Quang Ninh, Vietnam, on September 7 and moved southwestward inland until it was last noted on September 8.

The combination of Yagi and the southwest monsoon led to heavy rains over Luzon, causing widespread flash floods in various areas. The Hong Kong Observatory issued a Gale or Storm No. 8 for Hong Kong as Typhoon Yagi approached. Power outages and downed trees were reported in Hainan; in preparation for Typhoon Yagi, schools in areas in the trajectory of the storm were closed along with local transport services across the island province. In Vietnam, several structures including electric poles were uprooted, leading to power outages in various areas. In total, the typhoon caused at least 48 deaths, 321 injuries, and left 39 people missing, resulting in US$9.29 billion in damage across several countries. 
"""

In [None]:
# import nltk
# nltk.download('punkt_tab')

In [None]:
import nltk.data

# Upper bound, in characters, for one merged chunk of consecutive sentences.
max_chunck_length = 400

# NOTE(review): recent NLTK releases distribute the Punkt model as 'punkt_tab'
# (see the commented-out download cell above) -- confirm this pickle resource is
# still available in the target environment.
pkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = pkt_tokenizer.tokenize(input_text)

# Greedily pack consecutive sentences into chunks of at most `max_chunck_length`
# characters. A sentence that is longer than the limit on its own still becomes
# a (single-sentence) chunk.
merged_sentences = []

for sentence in sentences:
    # The +1 accounts for the space inserted when the sentence is joined on.
    fits_in_last_chunk = (
        bool(merged_sentences)
        and len(merged_sentences[-1]) + 1 + len(sentence) <= max_chunck_length
    )
    if fits_in_last_chunk:
        merged_sentences[-1] += " " + sentence
    else:
        # Start a new chunk with this sentence only. The previous sentence is
        # already part of the preceding chunk, so it must not be appended again
        # (doing so produced a duplicate one-sentence chunk).
        merged_sentences.append(sentence)

merged_sentences

In [1]:
# Load the fine-tuned checkpoint and its tokenizer through Unsloth.
# NOTE(review): the `if True:` wrapper looks like a leftover on/off toggle; it is
# kept because statements nested in a block are not echoed as the cell's output.
if True:
    from unsloth import FastLanguageModel

    # Local checkpoint directory, loaded with 4-bit weights and a 2048-token
    # maximum sequence length.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="./graph_2b_tokenfix",
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2060. Max memory: 6.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [8]:
# Probe how the tokenizer encodes a special-token string. The recorded output has
# two ids for the single "<bos>" string, so index 0 is presumably an automatically
# prepended BOS and index 1 the token itself (TRSLogits reads index 1) -- confirm.
tokenizer("<bos>")

{'input_ids': [2, 2], 'attention_mask': [1, 1]}

In [None]:
# Custom Logit Processors
import random
import torch
from transformers.generation import LogitsProcessor, LogitsProcessorList

def disable_tokens(scores, banned_tokens, row=0):
    """Set the scores of ``banned_tokens`` to ``-inf`` so they cannot be chosen.

    Args:
        scores: 2-D, row-indexable score table (``scores[row][token]``), e.g. the
            logits tensor handed to a logits processor. Modified in place.
        banned_tokens: iterable of token ids to disable.
        row: which row of ``scores`` to edit. Defaults to ``0`` (the original
            behaviour); pass ``None`` to apply the ban to every row.

    Returns:
        The same ``scores`` object, after the in-place edit.
    """
    row_indices = range(len(scores)) if row is None else (row,)
    for row_index in row_indices:
        for token in banned_tokens:
            scores[row_index][token] = -math.inf
    return scores


def find_largest_index(lst: list, value):
    """Return the index of the last occurrence of ``value`` in ``lst``.

    Returns ``-1`` when ``value`` does not occur (including for an empty list).
    Only the "not found" ``ValueError`` is translated to ``-1``; any other error
    (for example one raised by a custom ``__eq__``) propagates to the caller.
    """
    try:
        # Search a reversed copy, then map the hit back to an index into `lst`.
        return len(lst) - 1 - lst[::-1].index(value)
    except ValueError:
        return -1  # not found

# Enforces <T><R><S> Structure
class TRSLogits(LogitsProcessor):
    """Logits processor that forces generation into ``<T>..<R>..<S>..`` segments.

    ``<unused0>``, ``<unused1>`` and ``<unused2>`` act as the ``<T>``, ``<R>`` and
    ``<S>`` markers. At every step the most recent marker in each sequence decides
    which marker / EOS tokens are disabled for the next token:

    * a marker was just emitted -> all markers and EOS are banned, so at least
      one content token has to follow;
    * last marker is ``<T>``      -> ``<T>``, ``<S>``, EOS banned (``<R>`` allowed);
    * last marker is ``<R>``      -> ``<T>``, ``<R>``, EOS banned (``<S>`` allowed);
    * last marker is ``<S>``      -> ``<R>``, ``<S>`` banned (``<T>`` or EOS allowed).

    A non-empty sequence containing no marker at all is handled like the ``<T>``
    case.
    """

    def __init__(self, _tokenizer):
        self.tokenizer = _tokenizer
        # Index 1 of `input_ids` is used for each lookup; index 0 is presumably a
        # BOS id the tokenizer prepends automatically -- confirm for this tokenizer.
        self.t_token = _tokenizer("<unused0>")['input_ids'][1] # <T>
        self.r_token = _tokenizer("<unused1>")['input_ids'][1] # <R>
        self.s_token = _tokenizer("<unused2>")['input_ids'][1] # <S>

        self.eos_token = _tokenizer("<eos>")['input_ids'][1] # <EOS>

    def _banned_tokens(self, ids_list):
        """Return the token ids that must not follow the sequence ``ids_list``."""
        t_near_pos = find_largest_index(ids_list, self.t_token)
        r_near_pos = find_largest_index(ids_list, self.r_token)
        s_near_pos = find_largest_index(ids_list, self.s_token)

        # Position of the most recent marker of any kind (-1 if there is none).
        near_pos = max(t_near_pos, r_near_pos, s_near_pos)

        if near_pos == len(ids_list) - 1:
            # A marker was just generated: force content (no special tokens).
            return [self.t_token, self.r_token, self.s_token, self.eos_token]
        if near_pos == t_near_pos:
            # Inside <T>: only <R> may come next among the special tokens.
            return [self.t_token, self.s_token, self.eos_token]
        if near_pos == r_near_pos:
            # Inside <R>: only <S> may come next among the special tokens.
            return [self.t_token, self.r_token, self.eos_token]
        # near_pos is the max of the three positions, so the only case left is
        # <S>: a new <T> or the end of the sequence may follow.
        return [self.r_token, self.s_token]

    def __call__(self, input_ids, scores) -> torch.FloatTensor:
        """Mask the disallowed special tokens in ``scores`` and return it.

        Every sequence in the batch is handled with its own state, so the
        constraint also holds when several rows (batched prompts or beams) are
        scored at once.
        """
        for row, ids_list in enumerate(input_ids.tolist()):
            # `scores[row:row + 1]` is a one-row slice of `scores`; masking its
            # row 0 edits the matching row of `scores` in place.
            disable_tokens(scores[row:row + 1], self._banned_tokens(ids_list))
        return scores
    

In [None]:
from dataclasses import dataclass
import re

# alpaca_prompt = You MUST copy from above!
# NOTE(review): presumably this template has to match the one used when the
# model was fine-tuned -- confirm before editing any of its text.

# Prompt template with three `{}` placeholders, filled in order by generate_rdf:
# the existing triples (context), the input text, and the response prefix
# (left empty for generation). The template itself ends on the <T> marker.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Extract the most confident information in the sentence below as much as possible, and express the relationships in RDF Triples that complement the existing RDF triples. Do not use information from common sense.
### Existing RDF triples:
{}
### Input:
{}
### Response:
<unused0>{}""" # start with <T>


@dataclass
class Triple:
    """One subject / predicate / object statement.

    Equality compares ``subject``, ``predicate`` and ``object`` only; ``id`` is a
    caller-supplied tag and is ignored, so the same statement carrying two
    different ids counts as a duplicate.
    """

    subject: str
    predicate: str
    object: str
    id: int  # ignored by __eq__

    def __eq__(self, other: object):
        # Defer to the other operand for foreign types instead of failing on a
        # missing attribute; Python then falls back to "not equal".
        if not isinstance(other, Triple):
            return NotImplemented
        return self.subject == other.subject and self.predicate == other.predicate and self.object == other.object

    def __str__(self):
        """Serialise in the marker format used in the prompt and the model output."""
        return f"<unused0>{self.subject}<unused1>{self.predicate}<unused2>{self.object}"


def _parse_triples(rdf_string: str, context: list[Triple], id: int) -> list[Triple]:
    """Parse ``<unused0>s<unused1>p<unused2>o...`` text into new, unique triples.

    Fragments with fewer than three parts are reported and skipped. A triple is
    kept only if it equals neither an entry of ``context`` nor one parsed earlier
    from the same string. Every fragment is printed as it is visited.
    """
    rdfs = []
    rdf_string = rdf_string.removeprefix("<bos>").removesuffix("<eos>")
    for _triple in rdf_string.split("<unused0>"):
        print(_triple)
        if _triple == "":
            continue
        try:
            split = re.split("<unused1>|<unused2>", _triple)
            subject = split[0]
            predicate = split[1]
            _object = split[2]
        except IndexError as e:
            # Only a fragment with a missing part is "non-standard"; any other
            # error is a real bug and is allowed to propagate.
            print(f"NON-STANDARD TRIPLE {_triple} ({e})")
            continue

        new_triple = Triple(subject, predicate, _object, id)
        if new_triple not in context and new_triple not in rdfs:
            rdfs.append(new_triple)
    return rdfs


def generate_rdf(context: list[Triple], text: str, id: int) -> list[Triple]:
    """Ask the model for RDF triples describing ``text`` and return the new ones.

    Args:
        context: triples already known. At most 10 of them, sampled at random,
            are serialised into the prompt; all of them are used for
            de-duplication.
        text: the passage to extract triples from.
        id: value stored on every returned triple's ``id`` field.

    Returns:
        Triples parsed from the model response that are neither in ``context``
        nor repeated within the response.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``prompt``.
    """
    # context pre-processing
    if len(context) == 0:
        context_str = "None"
    else:
        context_str = " ".join(str(con) for con in random.sample(context, min(len(context), 10)))

    rendered_prompt = prompt.format(
        context_str,
        text,  # input
        "",  # output - leave this blank for generation!
    )
    inputs = tokenizer([rendered_prompt], return_tensors="pt").to("cuda")
    print(f"Processing \"{text[:50]}...{text[-10:]}\"")

    # NOTE(review): no max_new_tokens / max_length is passed, so the output
    # length falls back to the model's generation config -- confirm it is large
    # enough for a full list of triples.
    # NOTE(review): sampling and beam-search options were previously tried here
    # and disabled; an earlier remark said beam search breaks without the cache.
    outputs = model.generate(
        **inputs,
        logits_processor=LogitsProcessorList([TRSLogits(tokenizer)]),
        use_cache=True,
    )

    # The decoded text echoes the prompt; keep what follows the response header.
    response = tokenizer.batch_decode(outputs)[0].replace('\n', '')
    rdf_string = response.split("### Response:")[1]

    print("Done!")
    rdfs = _parse_triples(rdf_string, context, id)
    print("DONE")
    return rdfs


In [None]:
# Run extraction chunk by chunk. Everything collected so far is handed back in
# as context, and the chunk index is stored on each new triple as its id.
triples = []
for idx, m_sentence in enumerate(merged_sentences):
    print(f"PROCESSING ({idx + 1}/{len(merged_sentences)})")
    triples.extend(generate_rdf(triples, m_sentence, idx))

triples

In [None]:
from pyvis.network import Network

# Directed, dark-themed pyvis graph configured for notebook rendering.
net = Network(notebook=True, directed=True, bgcolor="#222222", font_color="white")


def add_triples(rdf: Triple, color: str):
    """Add one triple to `net`: subject and object nodes plus a connecting edge.

    The edge runs from subject to object and carries the predicate as its title;
    nodes and edge all use `color`. Edges are added unconditionally (an earlier
    duplicate-edge guard was disabled).
    """
    for endpoint in (rdf.subject, rdf.object):
        net.add_node(endpoint, color=color)
    net.add_edge(rdf.subject, rdf.object, title=rdf.predicate, color=color)


import random

def r():
    """Return a random colour channel value between 0 and 255 inclusive."""
    return random.randint(0, 255)


net.toggle_physics(True)

# One random hex colour per triple id, created on first sight, so that triples
# sharing an id are drawn in the same colour.
colors = {}

for triple in triples:
    if triple.id not in colors:
        colors[triple.id] = '#%02X%02X%02X' % (r(), r(), r())

    add_triples(triple, colors[triple.id])
    # net.show(f"{idx}.html", notebook=False)

print(net.nodes)

In [None]:
net.toggle_physics(True)
# Write the graph to network.html.
# NOTE(review): since `net` was created with notebook=True, show() presumably also
# returns an inline frame that becomes this cell's output -- confirm.

# from IPython.core.display import display
net.show("network.html")

# Dev: check similarity

In [None]:
# Ids of all nodes currently in the graph, in the order pyvis stores them.
node_names = [node["id"] for node in net.nodes]
node_names

In [None]:
# mydifflib.py
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest

def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Return the positions of the entries in `possibilities` closest to `word`.

    Similarity is measured with difflib's SequenceMatcher.

    word          -- the sequence to look for (typically a string).
    possibilities -- list of candidate sequences (typically strings).
    n             -- maximum number of positions to return; must be > 0.
    cutoff        -- float in [0, 1]; candidates scoring below it are dropped.

    The result is ordered from best to worst score; candidates with an equal
    score are ordered by descending position.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher()
    matcher.set_seq2(word)

    scored = []
    for position, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        # Cheap upper bounds first; the full ratio is computed only for
        # candidates that survive both.
        if matcher.real_quick_ratio() < cutoff:
            continue
        if matcher.quick_ratio() < cutoff:
            continue
        similarity = matcher.ratio()
        if similarity < cutoff:
            continue
        scored.append((similarity, position))

    # Keep the n best (score, position) pairs, then drop the scores.
    best = _nlargest(n, scored)
    return [position for _, position in best]

# For every node name, list the other node names that look similar to it.
for idx, name in enumerate(node_names):
    indexes = get_close_matches_indexes(name, node_names)
    indexes.remove(idx)  # drop the name's match with itself
    if indexes:
        print(f"{name}({idx}) | {[f'{node_names[i]}({i})' for i in indexes]}")