# IMPORT


In [1]:
import os, json, pickle, re, math, torch, multiprocessing, nltk
from tqdm import tqdm
import numpy as np

import wikipedia
from joblib import Parallel, delayed 
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import networkx as nx
from pyvis.network import Network
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# OFFLINE WIKI CLASS

In [2]:
# IMPORTING
from wikiDump_cleaner import Cleaner
import bz2, os, re, json, pickle
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from tqdm import tqdm
from difflib import SequenceMatcher
# from joblib import Parallel, delayed 


class offline_Wiki():
    def __init__(self, 
                 wiki_index_file = None,
                 wikiDump_bz2_file = None,
                 index_folder = None,
                 verbose = False
                 ):
        self.verbose = verbose
        self.wiki_index_file = wiki_index_file
        self.wikiDump_bz2_file = wikiDump_bz2_file
        
        self.prefixx = "index_"
        self.suffixx = ".p" 
        self.files_index = []
        self.index_keys = []
        self.index_folder = index_folder

        if self.index_folder:
            self.files_index = os.listdir(index_folder)
            self.index_keys = sorted([".".join(os.path.basename(i).split(".")[:-1]).split("_")[1] for i in self.files_index], key=lambda x: x.lower())

        else:
            _choice = input("Wiki index folder for is not provided. Do you want to create index ? (1/0) : ").strip()
            if _choice in "10" and len(_choice) == 1 :
                if int(_choice):
                    self.index_folder = self.index_maker(wiki_index_file, wikiDump_bz2_file, verbose=self.verbose)
                    self.files_index = os.listdir(self.index_folder)
                    self.index_keys = sorted([".".join(os.path.basename(i).split(".")[:-1]).split("_")[1] for i in self.files_index], key=lambda x: x.lower())
                else:
                    print("Index folder not made ...")
    ###
    ### --- INDEX MAKER ---
    ###
                    
    def get_start_bytes_list(self, txt_wiki_dump_index_path, bz2_wiki_dump_path, verbose = False):
        """
        RETURNS THE SORTED, DE-DUPLICATED START BYTES FOUND IN THE INDEX FILE

        The first ":"-separated field of every index line is read as an int.
        One extra value (size of the bz2 file + 1) is appended before sorting
        so that consecutive values can be paired up as (start, end).
        """
        with open(txt_wiki_dump_index_path, "r", encoding="utf-8") as handle:
            index_lines = handle.readlines()

        # progress bar only when the caller asked for output
        line_source = tqdm(index_lines) if verbose else index_lines
        unique_offsets = {int(entry.split(":")[0]) for entry in line_source}

        start_bytes = list(unique_offsets)
        start_bytes.append(os.path.getsize(bz2_wiki_dump_path) + 1)
        start_bytes.sort()

        if verbose:
            print(f"GOT {len(start_bytes)} START BYTES")

        return start_bytes
    
    def acceptableWord(self, word, verbose = False):
        """
        RETURNS True WHEN THE TITLE SHOULD BE KEPT IN THE INDEX

        A title is rejected when it contains one of the namespace markers
        below anywhere in the string, or when it ends with one of the listed
        file extensions (the comparison is case-sensitive).
        """
        if verbose:
            print(f"Checking {word}", end="\r")

        namespace_markers = ("File:",
                             "Template:",
                             "Wikipedia:",
                             "Category:",
                             "Help:",
                             "Portal:",
                             "MediaWiki:",
                             "Draft:",
                             "Module:")
        for marker in namespace_markers:
            if marker in word:
                return False

        media_suffixes = (".jpg",
                          ".png",
                          ".gif",
                          ".zip",
                          ".ogg",
                          ".mp3",
                          ".mp4",
                          ".webp")
        # str.endswith accepts a tuple of candidates
        return not word.endswith(media_suffixes)

    def clean_filename(self, filename):
        """Replace every character that is not a word character, '-', '_', '.', '(', ')' or a space with a space."""
        not_allowed = r'[^\w\-_.() ]'
        return re.sub(not_allowed, ' ', filename)
    
    def save_pickle(self, dataa, filename):
        """Pickle `dataa` into `filename` (the file is created or overwritten)."""
        with open(filename, "wb") as sink:
            pickle.Pickler(sink).dump(dataa)

    def store_dictionary_in_bins(self, word_dictionary, binsize=10000, 
                                index_folder ="./indexes/", verbose = False):
        """
        SPLITS word_dictionary INTO SORTED BINS AND PICKLES EACH BIN

        Every bin holds at most `binsize` consecutive keys and is written to
        "<index_folder>/index_<cleaned first key>.p".

        Keys are ordered case-insensitively (ties broken by the raw key)
        because the bin names are later sorted and binary-searched on their
        lower-cased form; a case-sensitive order would place titles in bins
        the lookup never opens.

        A bin whose cleaned first key is shorter than 2 characters cannot give
        a usable file name. Instead of being dropped, its entries are folded
        into the previous bin so that no title is lost.
        """
        os.makedirs(index_folder, exist_ok=True)

        sorted_keys = sorted(word_dictionary.keys(), key=lambda k: (k.lower(), k))
        num_bins = len(sorted_keys) // binsize + (1 if len(sorted_keys) % binsize != 0 else 0)

        def _write_bin(name, data):
            filename = f"index_{name}.p"
            self.save_pickle(data, os.path.join(index_folder, filename))
            if verbose:
                print(f"Stored {len(data)} elements in {filename}")

        # bin waiting to be written; kept back so that a following bin with an
        # unusable name can still be merged into it
        pending_name = None
        pending_data = {}

        for i in tqdm(range(num_bins)):
            bin_keys = sorted_keys[i * binsize:(i + 1) * binsize]
            bin_data = {key: word_dictionary[key] for key in bin_keys}
            name = self.clean_filename(bin_keys[0]).strip()

            # The very first bin is always accepted, whatever its name is.
            if pending_name is not None and len(name) < 2:
                if verbose:
                    print(f"Bin {i} starts with {bin_keys[0]!r}; merged into index_{pending_name}.p")
                pending_data.update(bin_data)
                continue

            if pending_name is not None:
                _write_bin(pending_name, pending_data)
            pending_name, pending_data = name, bin_data

        if pending_name is not None:
            _write_bin(pending_name, pending_data)

    def index_maker(self, index_file, wikiDump, index_folder = "./indexes/", verbose = False):
        """
        BUILDS THE PICKLED BINS FROM THE INDEX FILE AND RETURNS index_folder

        Steps:
          1. collect the sorted start bytes (get_start_bytes_list)
          2. map every start byte to the next one -> {start: end}
          3. read the index file again and keep {title: (start, end)} for
             every title accepted by acceptableWord
          4. write the mapping to disk in bins of 20000 titles

        Falsy `index_file` / `wikiDump` fall back to the paths given to the
        constructor.
        """
        index_path = index_file if index_file else self.wiki_index_file
        dump_path = wikiDump if wikiDump else self.wikiDump_bz2_file

        offsets = self.get_start_bytes_list(index_path, dump_path, verbose=verbose)

        if verbose:
            print("making start - end bytes list")
        # consecutive offsets, paired as (start, end)
        start_end_list = list(zip(offsets, offsets[1:]))

        offsets.clear() # FREE THE MEMORY

        if verbose:
            print("start - end bytes list made")

        start_end_dict = dict(start_end_list)

        word_start_end_dict = {}
        if verbose:
            print(f"Opening {index_path}.")

        with open(index_path, "r", encoding = "utf-8") as handle:
            index_lines = handle.readlines()

        if verbose:
            print(f"length of index file : {len(index_lines)}")

        for line in tqdm(index_lines):
            fields = line.split(":")
            # fields[1] is not used afterwards, but a line without it is an error
            raw_start, _second_field = fields[0], fields[1]
            # a title may itself contain ":" -> glue the remaining fields back
            word = ":".join(fields[2:]).strip()
            start_byte = int(raw_start)
            if self.acceptableWord(word):
                word_start_end_dict[word] = (start_byte, start_end_dict[start_byte])

        if verbose:
            print(f"Length of word - (start, end) dict is : {len(word_start_end_dict)}")

        index_lines.clear() # FREE THE MEMORY

        if verbose:
            print("Making bins")
        self.store_dictionary_in_bins(word_start_end_dict, binsize=20000, index_folder=index_folder) # STORING IN BINS

        if verbose:
            print("Bins made")

        return index_folder

    ###
    ### --- INDEX READER ---
    ###

    def load_pickle(self, filename):
        """Return the object unpickled from `filename`."""
        # NOTE: unpickling can execute arbitrary code; only use files written by save_pickle
        with open(filename, "rb") as source:
            return pickle.load(source)
    
    ### ----- SIMILARITY FINDING FUNCTIONS | STARTS -----

    def search_closest_words(self, keywords, word):
        """
        RETURNS THE KEYWORDS THAT ENCLOSE `word` IN A SORTED KEYWORD LIST

        keywords : list of strings sorted case-insensitively
        word     : the string to locate (compared case-insensitively)

        Returns [k_i, k_i+1] where k_i is the right-most keyword that is
        <= word, or [k_last] when word sorts at/after the last keyword, or []
        when the list is empty or word sorts before the first keyword.
        """
        if not keywords:
            return []

        word_lower = word.lower()

        # Binary search for the first position whose keyword is > word.
        # Using "<=" (instead of a strict "<" on both sides) means a word that
        # is exactly equal to a keyword is found too, and keywords[mid + 1] is
        # never read, so the end of the list cannot be overrun.
        low, high = 0, len(keywords)
        while low < high:
            mid = (low + high) // 2
            if keywords[mid].lower() <= word_lower:
                low = mid + 1
            else:
                high = mid

        anchor = low - 1  # -1 -> word sorts before every keyword
        if anchor < 0:
            return []

        return list(keywords[anchor:anchor + 2])

    def jaccard_similarity_word(self, s1, s2):
        """Character-level Jaccard similarity of the two lower-cased strings (0 when both are empty)."""
        chars_a = set(s1.lower())
        chars_b = set(s2.lower())
        all_chars = chars_a | chars_b
        if len(all_chars) == 0:
            return 0
        return len(chars_a & chars_b) / len(all_chars)

    def jaccard_similarity(self, str1, str2):
        """
        Word-level Jaccard similarity.

        Both strings are lower-cased and split on whitespace; the result is
        |common words| / |all words|, or 0 when neither string has a word.
        """
        tokens_a = set(str1.lower().split())
        tokens_b = set(str2.lower().split())

        all_tokens = tokens_a.union(tokens_b)
        if not all_tokens:
            return 0

        shared_tokens = tokens_a.intersection(tokens_b)
        return len(shared_tokens) / len(all_tokens)

    def weighted_jaccard_similarity(self, s1, s2):
        """
        Jaccard similarity over the sets of lower-cased, whitespace-split words.

        Despite the name no weights are applied: every distinct word counts
        once, so the value is |common words| / |all words| (0 if both empty).
        """
        words_a = set(s1.lower().split())
        words_b = set(s2.lower().split())
        union_weight = len(words_a | words_b)
        if union_weight == 0:
            return 0
        intersection_weight = len(words_a & words_b)
        return intersection_weight / union_weight
        
    def is_fuzz_similar(self, string1, string2, threshold = 80, verbose = False):
        """Return True when fuzz.ratio(string1, string2) is at least `threshold`."""
        score = fuzz.ratio(string1, string2)
        if verbose:
            print(f"Fuzzy similarity between {string1} and {string2} is {score}")
        return True if score >= threshold else False

    ### ----- SIMILARITY FINDING FUNCTIONS | ENDS -----

    def find_similar_keys(self, word, dictionary, threshold=0.5, verbose=False):
        similar_keys = []

        if verbose:
            print(f"Finding {word}...")

        word_lower = word.lower()  # Convert word to lowercase

        for key in dictionary:
            # similarity = fuzz.ratio(word_lower, key)
            # similarity = self.jaccard_similarity_word(word_lower, key)  # Use lowercase word for comparison
            similarity = self.jaccard_similarity(word_lower, key)  # Use lowercase word for comparison
            # similarity = weighted_jaccard_similarity(word_lower, key)  # Using weighted Jaccard
            if similarity > threshold:
                similar_keys.append((key, similarity))

        similar_keys.sort(key=lambda x: x[1], reverse=True)  # Sort by similarity
        similar_keys = [key[0] for key in similar_keys]  # Extract keys only

        return similar_keys if similar_keys else None

    def fetch_word_from_list(self, target_word, similar_words, threshold = 0.7): 
        for word in similar_words:
            if word == target_word:
                return word
        for word in similar_words:    
            if word.lower() == target_word.lower():
                return word
        
        similarr = []
        word_lower = word.lower()
        for key in similar_words:
            # _similarity = fuzz.ratio(word_lower, key)
            _similarity = self.jaccard_similarity(word_lower, key)  # Use lowercase word for comparison

            # _similarity = self.weighted_jaccard_similarity(word_lower, key)  # Use lowercase word for comparison
            # similarity = weighted_jaccard_similarity(word_lower, key)  # Using weighted Jaccard
            if _similarity > threshold:
                similarr.append((key, _similarity))

        similarr.sort(key=lambda x: x[1], reverse=True)  # Sort by similarity
        similarr = [key[0] for key in similarr]  # Extract keys only
        
        return similarr[0] if similarr else None
        # return find_most_similar_word(target_word, similar_words, max_similarity=threshold)  
        # return None

    def find_most_similar_word(self, query, keywords, max_similarity = 0.7):
        """
        Return the keyword whose SequenceMatcher ratio with `query` is the
        highest, provided that ratio is strictly greater than `max_similarity`;
        otherwise None. The first keyword wins when several share the top ratio.
        """
        scored = [(SequenceMatcher(None, query, keyword).ratio(), keyword)
                  for keyword in keywords]
        if not scored:
            return None

        # max() keeps the first of equally good candidates
        top_ratio, top_keyword = max(scored, key=lambda pair: pair[0])
        if top_ratio > max_similarity:
            return top_keyword
        return None


    def page_cleaner(self, page_body, summaryOnly = False):
        """
        Run the page body through wikiDump_cleaner.Cleaner (clean_text, then
        build_links) and return (cleaned_text, links).

        With summaryOnly=True only the text in front of the first "==" is
        returned (presumably the lead section, as headings are written
        "== Heading ==" -- TODO confirm against the Cleaner output).
        """
        wiki_cleaner = Cleaner()
        text, links = wiki_cleaner.build_links(wiki_cleaner.clean_text(page_body))

        if not summaryOnly:
            return text, links

        return text.split("==")[0], links

    def extract_cleaned_page(self, page_soup, summaryOnly = False, verbose = False, wantLinks = False, wikibaseurl = "https://en.wikipedia.org/wiki/"):
        """
        RETURNS THE PAGE TITLE, URL, AND PAGE CONTENT
        YOU MAY WANT TO CHANGE THIS ACCORDING IT YOUR NEED

        page_soup   : one parsed <page> element (must contain <title> and <text>)
        summaryOnly : forwarded to page_cleaner
        wantLinks   : when True the links returned by page_cleaner are appended
        wikibaseurl : prefix used to build the page url

        Returns [title, url, body] or [title, url, body, links].
        For a page carrying a <redirect> element the url points at the
        redirect target, while the title stays the page's own title.
        """
        page_title = page_soup.find("title").text
        page_body = page_soup.find("text").text
        page_redirect = page_soup.find("redirect") 
        page_url = wikibaseurl+page_title.replace(' ', '_')

        if page_redirect:
            page_redirect = page_redirect['title']
            page_url = wikibaseurl+page_redirect.replace(' ', '_')
            
        page_body, page_links = self.page_cleaner(page_body, 
                                    summaryOnly=summaryOnly)
    
        if verbose:
            # built separately so that the links are really interpolated
            links_note = f"and Links : {page_links}" if wantLinks else ""
            print(f"Page cleaning done... \nGot Title : {page_title}, \nCleaned page body : {page_body} \nPage url : {page_url}, {links_note}\n")
        
        returning = [page_title, page_url, page_body]

        if wantLinks:
            returning.append(page_links)

        return returning

    def extract_pages(self, page_xml):
        """Parse `page_xml` with BeautifulSoup ("lxml" parser) and return every <page> element."""
        return BeautifulSoup(page_xml, "lxml").find_all("page")


    def decompress_xml(self, bz2_wiki_dump_path, start_byte, end_byte, verbose = False):
        """
        Decompress one slice of the bz2 dump and return its <page> elements.

        Reads (end_byte - start_byte - 1) bytes starting at start_byte, feeds
        them to a fresh BZ2Decompressor, decodes the result as text and hands
        it to extract_pages.
        """
        byte_count = end_byte - start_byte - 1
        decompressor = bz2.BZ2Decompressor()

        with open(bz2_wiki_dump_path, 'rb') as dump:
            dump.seek(start_byte)
            compressed = dump.read(byte_count)
            xml_text = decompressor.decompress(compressed).decode()

            pages = self.extract_pages(xml_text)

            if verbose:
                print(f"FOUND : {len(pages)} PAGES BETWEEN {start_byte} BYTE AND {end_byte} BYTE.")

        return pages 
    
    def word_match(self, word, verbose = False, summaryOnly = True):
        near_words = self.search_closest_words(self.index_keys, word)
        if verbose:
            print(f"near words similar to {word} : {near_words}")
        near_words_file_path = [self.index_folder + self.prefixx + word + self.suffixx for word in near_words]
        
        to_search_into = {}
        [to_search_into.update(self.load_pickle(file_pathh)) for file_pathh in near_words_file_path]
        similar_keywords = self.find_similar_keys(word, to_search_into)
        # similar_keywords = self.find_most_similar_keys(word, to_search_into)

        if verbose:
            print(f"Similar keywords found in Wiki : {similar_keywords}")
        
        if not similar_keywords:
            if verbose:
                print("No similar keyword found !!!")
            return None
  
        wanted = self.fetch_word_from_list(word, similar_keywords)
        
        if not wanted:
            wanted = similar_keywords[0]
        
        if verbose:
            print(f"Wanted : {wanted}")

        _start, _end = to_search_into[wanted]
        
        if verbose:
            print(f"Byte start : {_start}, Byte end : {_end}")
        
        decompressed_pages = self.decompress_xml(self.wikiDump_bz2_file, _start, _end)
        for page_xml in decompressed_pages:
            # print(page_xml)
            _page_title,_page_url, _page_summary = "", "", ""
            _page_title, _page_url, _page_summary = self.extract_cleaned_page(page_xml, summaryOnly=summaryOnly, verbose=verbose)
            if self.is_fuzz_similar(wanted, _page_title, threshold=90,verbose=verbose): 
                # offline_dict[_page_title] = {'title' : _page_title, 
                #                             'url' : _page_url, 
                #                             'summary' : _page_summary}
                return {'title' : _page_title, 
                        'url' : _page_url, 
                        'summary' : _page_summary}
        return None
        
                




# KB CLASS 

In [3]:
class KB():
    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """
        Add every relation of `kb2` to this KB through add_relation, using the
        article title / publish date stored in kb2.sources for the relation's
        first meta url. add_relation is called with its default options.
        """
        for relation in kb2.relations:
            first_url = list(relation["meta"])[0]
            origin = kb2.sources[first_url]
            title = origin["article_title"]
            published = origin["article_publish_date"]
            self.add_relation(relation, title, published)

    def are_relations_equal(self, r1, r2):
        """Two relations are equal when head, type and tail all agree (meta is ignored)."""
        for attr in ("head", "type", "tail"):
            if not r1[attr] == r2[attr]:
                return False
        return True

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        # if different article
        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            r1["meta"][article_url] = r2["meta"][article_url]

        # if existing article
        
        else:
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add
            

    def get_wikipedia_data(self, candidate_entity, useWiki = True, offline_wiki = None, verbose = False):
        # print("\n\n--- offline", offline_Wiki)
        entity_data = None
        stop_words = set(stopwords.words('english'))
        if len(candidate_entity.split()) > 4:
            word_tokens = word_tokenize(candidate_entity)
            candidate_entity = " ".join([w for w in word_tokens if not w.lower() in stop_words])

        try:
            if offline_wiki:
                if verbose:
                    print(f"Finding {candidate_entity} in offline Wiki")
                _entity_data = offline_wiki.word_match(candidate_entity, verbose = verbose)
                
                if verbose:
                    print(f"Got {_entity_data} after word_match from offline Wiki")

                if "REDIRECT".lower() in _entity_data["summary"][:10].lower():
                    # entity_data = _entity_data
                    _word = _entity_data["url"].split("/wiki/")[-1].strip()
                    if verbose:
                        print(f"REDIRECT found !!! Candidate entitiy {candidate_entity} === changed to ==> {_word}")

                    entity_data = self.get_wikipedia_data(_word, useWiki=useWiki, offline_wiki=offline_Wiki, verbose=verbose)
                else:                    
                    ratioo = fuzz.ratio(candidate_entity, _entity_data['title'])
                    if verbose:
                        print(f"Fuzz ration : {ratioo}")
                    if ratioo > 50 :
                        entity_data = _entity_data
                        if verbose:
                            print(f"Got {entity_data} from offline wiki with similarity ration = {ratioo}.")
            
            if useWiki and not entity_data:
                if verbose:
                    print(f"Finding {candidate_entity} in online Wiki")
                page = wikipedia.page(candidate_entity, auto_suggest=False)
                entity_data = {
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary
                }
            
            return entity_data
        except:
            return None

    def add_entity(self, e):
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date, 
                     useWiki = True, offlineWiki = None, verbose = False):
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        if verbose:
            print(f"Candidate entities : {candidate_entities}")
            
        # entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]
        
        # TRY 2
        entities = []
        if useWiki:
            entities = Parallel(n_jobs=N_JOB_COUNT)(delayed(self.get_wikipedia_data)(ent, useWiki, offlineWiki, verbose=verbose) for ent in candidate_entities)
            # entities = [self.get_wikipedia_data(ent, useWiki, offlineWiki) for ent in candidate_entities]

        else:
            entities = [{"title": ent,
                         "url": "",
                         "summary": ""
                        } for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

# FOR GPU

In [4]:
def get_cpu_count():
    """Return the CPU count reported by multiprocessing.cpu_count()."""
    return multiprocessing.cpu_count()

def check_gpu():
    """Print one line per CUDA device visible to torch (index and device name). Returns None."""
    device_total = torch.cuda.device_count()
    for index in range(device_total):
        gpu_name = torch.cuda.get_device_name(torch.device(f'cuda:{index}'))
        print(f'{index} device name:{gpu_name}')

def get_gpu():
    """Return the device strings ('cuda:0', 'cuda:1', ...) of all CUDA devices visible to torch."""
    device_names = []
    for index in range(torch.cuda.device_count()):
        device_names.append(f'cuda:{index}')
    return device_names

# Report the hardware this notebook runs on.
# check_gpu() prints the device lines itself and returns None, so the outer
# print() additionally emits a line reading "None".
print(check_gpu())

print(get_gpu())

print(get_cpu_count())

# N_JOB_COUNT is the n_jobs value handed to joblib.Parallel in KB.add_relation.
# The first assignment (half of the CPUs) is immediately overridden by the
# second one, so the value actually used is 1.
N_JOB_COUNT = get_cpu_count()//2
N_JOB_COUNT = 1

0 device name:NVIDIA GeForce RTX 3060 Laptop GPU
None
['cuda:0']
16


# HELPER FUNCTIONS

In [5]:
def loadJSON(filepathh):
    """Return the parsed JSON content of `filepathh`; print a notice and return {} when the file is missing."""
    if not os.path.exists(filepathh):
        print(f"{filepathh} does not exists...\n") 
        return {}

    with open(filepathh, "r", encoding="utf-8") as handle:
        return json.load(handle)

def loadTXT(filepathh):
    """Return the text content of `filepathh`; print a notice and return "" when the file is missing."""
    if not os.path.exists(filepathh):
        print(f"{filepathh} does not exists...\n") 
        return ""

    with open(filepathh, "r", encoding="utf-8") as handle:
        return handle.read()

def loadFILE(filepathh = ""):
    """
    Load a ".txt" file (via loadTXT) or a ".json" file (via loadJSON).

    Returns None, after printing a notice, when the file does not exist or
    has any other extension.
    """
    if not os.path.exists(filepathh):
        print(f"{filepathh} does not exists...\n") 
        return None

    if filepathh.endswith(".txt"):
        return loadTXT(filepathh)

    if filepathh.endswith(".json"):
        return loadJSON(filepathh)

    print("\n- Invalid File format 😐 !!!\n")
    return None

def remove_garbage(text):
    """
    Resolve literal backslash escapes (e.g. the 6 characters "\\u00e9") and
    then drop every character outside printable ASCII (0x20-0x7E). Line
    breaks and tabs are outside that range and are removed as well.
    """
    try:
        cleaned_text = text.encode().decode('unicode-escape')
    except UnicodeDecodeError:
        # A backslash that does not start a valid escape (a Windows path such
        # as C:\Users, a trailing "\", ...) makes the codec fail; the text is
        # then kept as it is instead of aborting the whole clean-up.
        cleaned_text = text
    # Remove any remaining non-printable characters
    cleaned_text = re.sub(r'[^\x20-\x7E]', '', cleaned_text)
    return cleaned_text

def clean_sentence(sentence):
    """
    Normalise one sentence:
      1. collapse every run of whitespace into one space
      2. drop all characters except letters, digits, @ # - _ . , ? ! ' " and space
      3. drop every word that contains '#' or 'pic.twitter.com'
    The remaining words are joined with single spaces.
    """
    # Remove extra white spaces
    cleaned_sentence = re.sub(r'\s+', ' ', sentence)
    # '#' is kept at this stage so that hashtag words can be recognised below
    cleaned_sentence = re.sub(r'[^a-zA-Z0-9@#\-_.,?!\'" ]', '', cleaned_sentence)
    # Dropped words are left out entirely; replacing them with ' ' would bring
    # back the repeated spaces that step 1 has just removed.
    kept_words = [word for word in cleaned_sentence.split()
                  if '#' not in word and 'pic.twitter.com' not in word]
    return ' '.join(kept_words)

def clean_document(document):
    """Return the cleaned sentences of `document`: remove_garbage, then sent_tokenize, then clean_sentence on each sentence."""
    readable_text = remove_garbage(document)
    return [clean_sentence(piece) for piece in sent_tokenize(readable_text)]

In [6]:
def save_network_html(kb, filename="network.html", 
                      verbose = False, 
                      physics = False,
                      show = False):
    """
    Draw the KB as an interactive pyvis network and write it to `filename`.

    kb       : object with `entities` (iterated for the nodes) and `relations`
               (dicts with head / type / tail, used for the labelled edges)
    verbose  : print progress messages
    physics  : apply the repulsion layout settings to the pyvis network
    show     : not used by this function

    A networkx graph with the same nodes and edges is built alongside and is
    given 'size' (10 * degree) and 'labels' attributes.
    NOTE(review): that graph, and the spring layout computed from it, are
    never handed to pyvis in the code below.
    """
    def report(message):
        if verbose:
            print(message)

    # make sure the target file exists before pyvis writes to it
    if not os.path.exists(filename):
        with open(filename, 'w') as blank:
            blank.write("")

    # create network
    graph = nx.Graph()
    net = Network(directed=True,
                  notebook=True,
                  width="1000px",
                  height="1000px")
    report("Network initialized")

    # nodes
    node_color = "#00FF00"
    report(f"there are {len(kb.entities)} entities in KB")
    for entity in kb.entities:
        graph.add_node(entity)
        net.add_node(entity, label=entity, shape="dot", color=node_color)

    # edges
    report(f"there are {len(kb.relations)} relations in KB")
    edge_labels = {}
    for relation in kb.relations:
        head, tail, kind = relation['head'], relation["tail"], relation["type"]
        graph.add_edge(head, tail)
        edge_labels[(head, tail)] = kind
        net.add_edge(head, tail, title=kind, label=kind)

    size_factor = 10  # node size = 10 * degree
    degrees = dict(graph.degree)

    layout = nx.spring_layout(graph)
    node_sizes = {node: size_factor * degree for node, degree in degrees.items()}

    # attributes of the networkx graph
    nx.set_node_attributes(graph, node_sizes, 'size')
    nx.set_edge_attributes(graph, edge_labels, 'labels')
    report("Trying to make graph")

    # save network
    if physics:
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=200,
            spring_strength=0.05,
            damping=0.09
        )

    net.set_edge_smooth('dynamic')

    report("Trying to show graph")

    net.show(filename)

def save_kb(kb, filename, verbose = False):
    """Pickle `kb` into `filename`; with verbose=True the entity and relation counts are printed first."""
    if verbose:
        for label, items in (("entities", kb.entities), ("relations", kb.relations)):
            print(f"there are {len(items)} {label} in KB")

    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(kb)

def load_kb(filename):
    """Return the object unpickled from `filename` (the counterpart of save_kb)."""
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself
    with open(filename, "rb") as source:
        return pickle.load(source)


# DOING NER

In [7]:
def extract_relations_from_model_output(text):
    """
    Parse the decoded model output into a list of {'head', 'type', 'tail'} dicts.

    The text is read token by token (whitespace separated) after removing
    "<s>", "<pad>" and "</s>":
      <triplet> -> the following tokens form the head
      <subj>    -> the following tokens form the tail
      <obj>     -> the following tokens form the relation type
    A triple is emitted whenever <triplet> or <subj> is met while a relation
    type is pending, and once more at the end if all three parts are non-empty.
    """
    triples = []
    head, kind, tail = '', '', ''
    mode = 'x'  # which part the plain tokens currently belong to

    def emit():
        triples.append({
            'head': head.strip(),
            'type': kind.strip(),
            'tail': tail.strip()
        })

    stripped = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        stripped = stripped.replace(special, "")

    for token in stripped.split():
        if token == "<triplet>":
            mode = 't'
            if kind != '':
                emit()
                kind = ''
            head = ''
        elif token == "<subj>":
            mode = 's'
            if kind != '':
                emit()
            tail = ''
        elif token == "<obj>":
            mode = 'o'
            kind = ''
        elif mode == 't':
            head += ' ' + token
        elif mode == 's':
            tail += ' ' + token
        elif mode == 'o':
            kind += ' ' + token

    if head != '' and kind != '' and tail != '':
        emit()

    return triples

In [8]:
def split_docs(text, 
               max_text_count = 1000,
               verbose = False):
    """
    Split `text` into chunks of whole sentences.

    Sentences (nltk sent_tokenize) are collected into a chunk as long as the
    chunk stays below `max_text_count` whitespace-separated words; the
    sentence that would reach the limit starts the next chunk. A single
    sentence longer than the limit becomes a chunk of its own.

    Returns the list of chunks, each being its sentences joined by one space.
    The last (possibly only) chunk is included, and no empty chunk is produced.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_sentences = []
    current_len = 0

    for sentence in sentences:
        len_sentence = len(sentence.strip().split())
        # close the running chunk before it would reach the limit
        if current_sentences and current_len + len_sentence >= max_text_count:
            chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_len = 0
        current_sentences.append(sentence)
        current_len += len_sentence

    # whatever is still collected after the last sentence is a chunk too
    if current_sentences:
        chunks.append(" ".join(current_sentences))

    if verbose:
        print(len(chunks))
    return chunks

def _from_text_to_kb(text, article_url, kb = None,
                    useGPU=0, 
                    span_length=128, 
                    article_title=None,
                    article_publish_date=None, 
                    verbose=False,
                    useWiki=True,
                    offline_Wiki = None):
    """Extract relations from one piece of text and add them to a KB.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    1000 tokens), cut into overlapping windows of ``span_length`` tokens,
    and every window is passed to the module-level ``model`` with beam
    search (3 returned sequences per window). Each decoded sequence is
    parsed with ``extract_relations_from_model_output`` and every resulting
    relation is stored through ``kb.add_relation``.

    Args:
        text: Text to process (a single string).
        article_url: Used as the key inside each relation's ``"meta"`` dict.
        kb: Knowledge base to extend; a new ``KB()`` is created when falsy.
        useGPU: Not referenced in this function body.
        span_length: Window size, in tokens.
        article_title: Forwarded to ``kb.add_relation``.
        article_publish_date: Forwarded to ``kb.add_relation``.
        verbose: Print progress information and forward to ``add_relation``.
        useWiki: Forwarded to ``kb.add_relation``.
        offline_Wiki: Forwarded to ``kb.add_relation`` as ``offlineWiki``.

    Returns:
        The knowledge base that received the relations.
    """
    # Everything below is inference only, so gradient tracking is disabled.
    with torch.no_grad():
        # tokenize whole text
        inputs = tokenizer([text], 
                        max_length = 1000,
                        #    max_length=512,
                        padding=True,  
                        truncation=True, 
                        return_tensors="pt")

        # compute span boundaries
        num_tokens = len(inputs["input_ids"][0])
        if verbose:
            print(f"Input has {num_tokens} tokens")
        num_spans = math.ceil(num_tokens / span_length)
        
        if verbose:
            print(f"Input has {num_spans} spans")
        # Spread the surplus (num_spans * span_length - num_tokens) over the
        # gaps between consecutive windows. Because of the rounding up, the
        # last window can end slightly before the final token.
        overlap = math.ceil((num_spans * span_length - num_tokens) / 
                            max(num_spans - 1, 1))
        
        spans_boundaries = []
        start = 0
        for i in range(num_spans):
            spans_boundaries.append([start + span_length * i,
                                    start + span_length * (i + 1)])
            # Shift every following window back by one more overlap.
            start -= overlap
        if verbose:
            print(f"Span boundaries are {spans_boundaries}")

        # transform input with spans: one row per window
        tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
        tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                        for boundary in spans_boundaries]
        inputs = {
            "input_ids": torch.stack(tensor_ids),
            "attention_mask": torch.stack(tensor_masks)
        }
        
        # generate relations
        num_return_sequences = 3
        gen_kwargs = {
            "max_length": 256,
            "length_penalty": 0,
            "num_beams": 3,
            "num_return_sequences": num_return_sequences
        }

        generated_tokens = model.generate(
                                            inputs["input_ids"].to(model.device),
                                            attention_mask=inputs["attention_mask"].to(model.device),
                                            **gen_kwargs,
                                            )

        # decode relations; special tokens are kept because the relation
        # parser splits on markers such as <triplet>, <subj> and <obj>
        decoded_preds = tokenizer.batch_decode(generated_tokens,
                                            skip_special_tokens=False)

        # create kb
        if not kb:
            kb = KB()

        i = 0
        # Parse all decoded sequences with joblib; N_JOB_COUNT is a
        # module-level setting defined elsewhere in the notebook.
        _relations = Parallel(n_jobs=N_JOB_COUNT)(delayed(extract_relations_from_model_output)(sentence_pred) for sentence_pred in decoded_preds)

        for sentence_pred in decoded_preds:
            # Maps a generated sequence back to its window; assumes the
            # sequences come back grouped window by window, with
            # num_return_sequences consecutive entries each -- TODO confirm.
            current_span_index = i // num_return_sequences
            # relations = extract_relations_from_model_output(sentence_pred)
            relations = _relations[i]

            if verbose:
                print(f"{i}. extraction of relations done, it has {len(relations)} relations", end="\r")
                
            for relation in relations:
                # Record which article and which token window produced it.
                relation["meta"] = {
                    article_url: {
                        "spans": [spans_boundaries[current_span_index]]
                    }
                }
                kb.add_relation(relation, 
                                article_title,
                                article_publish_date, 
                                useWiki=useWiki,
                                offlineWiki=offline_Wiki,
                                verbose=verbose)
            i += 1

    return kb

def from_text_to_kb(text, article_url, kb = None,
                    useGPU=0, 
                    span_length=128, 
                    article_title=None,
                    article_publish_date=None, 
                    verbose=False,
                    max_token = 1000,
                    max_doc_text = 1000,
                    useWiki = True,
                    offlineWiki = None):
    """Build (or extend) a knowledge base from a text of arbitrary length.

    The input is measured in whitespace-separated words. When it has more
    than ``max_token`` words it is cut into sentence-aligned chunks with
    ``split_docs``; each chunk (or the whole text when it is short enough)
    is then handed to ``_from_text_to_kb``, all feeding the same KB.

    Args:
        text: The text to process (a string).
        article_url: Forwarded to ``_from_text_to_kb``.
        kb: Knowledge base to extend; a new ``KB()`` is created when falsy.
        useGPU: Forwarded to ``_from_text_to_kb``.
        span_length: Forwarded to ``_from_text_to_kb``.
        article_title: Forwarded to ``_from_text_to_kb``.
        article_publish_date: Forwarded to ``_from_text_to_kb``.
        verbose: Print progress information.
        max_token: Word count above which the text is split. Despite the
            name, this is compared against words, not model tokens.
        max_doc_text: Word budget per chunk passed to ``split_docs``.
        useWiki: Forwarded to ``_from_text_to_kb``.
        offlineWiki: Forwarded to ``_from_text_to_kb`` as ``offline_Wiki``.

    Returns:
        The knowledge base after all chunks have been processed.
    """
    num_words = len(text.split())

    if verbose:
        print(f"Input has {num_words} words")

    if not kb:
        kb = KB()

    # Decide explicitly between "one text" and "list of chunks" instead of
    # inspecting the exact type afterwards: a ``type(text) == str`` check
    # would treat a str subclass as an iterable of characters.
    if num_words > max_token:
        if verbose:
            print("input len > token size, splitting doc in smaller chunks")
        texts = split_docs(text, max_text_count=max_doc_text)
    else:
        texts = [text]

    for _text in texts:
        kb = _from_text_to_kb(_text, article_url,
                              useGPU=useGPU,
                              span_length=span_length,
                              article_title=article_title,
                              article_publish_date=article_publish_date,
                              verbose=verbose,
                              kb=kb,
                              useWiki=useWiki,
                              offline_Wiki=offlineWiki)
    return kb
                

# -- BREAK --

# EXP

In [9]:
# Use the first CUDA GPU when torch can see one; otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


### LOADING MODEL

In [10]:
# Load the REBEL relation-extraction checkpoint ("Babelscape/rebel-large").
# `tokenizer` and `model` are module-level names that _from_text_to_kb reads
# directly, so this cell must run before any KB is built.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [11]:
model.to(device)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50272, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50272, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

### LOADING OFFLINE WIKI

In [12]:
# Machine-specific paths to the English Wikipedia multistream dump
# (enwiki-20240220): the plain-text index and the bz2-compressed articles.
WIKI_INDEX_FILE = "D://WikiDump/enwiki-20240220-pages-articles-multistream-index.txt/enwiki-20240220-pages-articles-multistream-index.txt"
WIKI_BZ2_FILE = "D://WikiDump/enwiki-20240220-pages-articles-multistream.xml.bz2"

# Folder whose files offline_Wiki lists as its index when index_folder is given.
INDEX_FOLDER = "./indexes/"

In [13]:
# Because index_folder is supplied, the constructor lists the existing index
# files instead of interactively asking whether to build a new index.
offline_wikipedia = offline_Wiki(wiki_index_file=WIKI_INDEX_FILE,
                                wikiDump_bz2_file=WIKI_BZ2_FILE, verbose=True,
                                index_folder=INDEX_FOLDER)

In [14]:
# Smoke test of the offline lookup: print the match for "tigers" first,
# then for each of the probe words below.
test_words = ["sachin", "trump", "wedding", "white house", "modi", "apple", "windows 11"]
for test_word in ["tigers", *test_words]:
    print(offline_wikipedia.word_match(test_word, summaryOnly=False))

{'title': 'Tigers', 'url': 'https://en.wikipedia.org/wiki/Tiger', 'summary': 'REDIRECT Tiger'}
{'title': 'Sachin', 'url': 'https://en.wikipedia.org/wiki/Sachin', 'summary': 'Sachin may refer to:\nSachin (given name), an Indian given name, including a list of people with the name\n* Sachin (actor) (born 1957), Indian actor and filmmaker\n* Sachin (boxer), Indian boxer\n* Sachin Tendulkar (born 1973), Indian cricketer\n==Films==\nSachein, a 2005 Tamil film directed by John Mahendran, sometimes spelled "Sachin"\n A Billion Dreams, a 2017 Indian biographical film of Sachin Tendulkar\nSachin (film), a 2018 Malayalam film directed by Santhosh Nair\n The Ultimate Winner, a 2023 Indian film based on the cricketer\n==Places==\nSachin, Pas-de-Calais, a town in northern France\nSachin, Gujarat, a suburban area of Surat in India\n*Sachin INA, a town and an industrial notified area\n* Sachin railway station, a small railway station in Surat district, Gujarat\nSachin State, a princely state of India

### TESTING


In [15]:
statement = "The Sun rises in East direction. The Earth is smaller than the planet Jupiter. Tigers are from Cat family."

In [16]:
texts = statement.split('. ')

In [17]:
# Variant 1: build the knowledge base from the whole statement in one call.
kb = KB()
max_lenn = 1000   # word budget per chunk (passed as max_doc_text)
spann = 64        # token window size (passed as span_length)

# The article URL is left empty, so relation metadata is keyed by "".
kb = from_text_to_kb(statement, "", kb = kb,
                    verbose=0,
                    span_length=spann,
                    max_doc_text=max_lenn,
                    useWiki=1,
                    offlineWiki=offline_wikipedia)
kb.print()


Entities:
  ('Tiger', {'url': 'https://en.wikipedia.org/wiki/Tiger', 'summary': "The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera native to Asia. It has a powerful, muscular body with a large head and paws, a long tail, and distinctive black, mostly vertical stripes on orange fur. It was first scientifically described in 1758 and is traditionally classified into eight recent subspecies, though some recognise only two subspecies, namely mainland Asian tigers and island tigers of the Sunda Islands.\nThroughout the tiger's range, it inhabits mainly forests, from coniferous and temperate broadleaf and mixed forests in the Russian Far East and Northeast China to tropical and subtropical moist broadleaf forests on the Indian subcontinent and Southeast Asia. The tiger is an apex predator and preys mainly on ungulates such as deer and wild boar, which it takes by ambush. It lives a mostly solitary life and occupies home ranges, which it defends f

In [18]:
# Variant 2: start from a fresh KB and feed it one sentence at a time, so
# every sentence is tokenized and passed to the model separately.
kb = KB()
max_lenn = 1000   # word budget per chunk (passed as max_doc_text)
spann = 64        # token window size (passed as span_length)

for text in tqdm(texts):
    # The same kb object is threaded through every call and keeps growing.
    kb = from_text_to_kb(text, "", kb = kb,
                         verbose=0,
                         span_length=spann,
                         max_doc_text=max_lenn,
                         useWiki=1,
                         offlineWiki=offline_wikipedia)
kb.print()
    

100%|██████████| 3/3 [00:22<00:00,  7.53s/it]

Entities:
  ('Earth', {'url': 'https://en.wikipedia.org/wiki/Earth', 'summary': "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean, covering 70.8% of Earth's crust. The remaining 29.2% of Earth's crust is land, most of which is located in the form of continental landmasses within one hemisphere, Earth's land hemisphere. Most of Earth's land is somewhat [and covered by vegetation, while large"})
  ('Jupiter', {'url': 'https://en.wikipedia.org/wiki/Jupiter', 'summary': "Jupiter is the fifth planet from the Sun and the largest in the Solar System. A gas giant, Jupiter's mass is more than two and a half times that of all the other planets in the Solar System combined and slightly less than one one-thousandth the mass of the Sun. Jupiter orbits the Sun at a distance of 5.20 AU 




In [None]:
# kb = KB()
# max_lenn = 1000
# spann = 128

# for text in tqdm(texts):
#     kb = from_text_to_kb(text, "", kb = kb,
#                          verbose=0,
#                          span_length=spann,
#                          max_doc_text=max_lenn,
#                          useWiki=1,
#                          offlineWiki=None)
# kb.print()
    

100%|██████████| 3/3 [01:06<00:00, 22.11s/it]

Entities:
  ('Earth', {'url': 'https://en.wikipedia.org/wiki/Earth', 'summary': "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean, covering 70.8% of Earth's crust. The remaining 29.2% of Earth's crust is land, most of which is located in the form of continental landmasses within Earth's land hemisphere. Most of Earth's land is somewhat humid and covered by vegetation, while large sheets of ice at Earth's polar deserts retain more water than Earth's groundwater, lakes, rivers and atmospheric water combined. Earth's crust consists of slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Earth has a liquid outer core that generates a magnetosphere capable of deflecting most of the destructive solar winds and cosmic radiation.\n




# RAG

In [19]:
# Collect one full Wikipedia article per KB entity to serve as the RAG corpus.
entities_in_kb = kb.entities
doc_titles = []
docs = {}   # page title (taken from the entity URL) -> article text
for _entt, _values in entities_in_kb.items():
    _url = _values['url']
    # The page title is whatever follows "wiki/" in the entity URL.
    _url_word = _url.split("wiki/")[-1].strip()
    print(_url_word)
    doc_titles.append(_url_word) 
    
    # docs[_url_word] = offline_wikipedia.word_match(_url_word, verbose=0, summaryOnly=False)['summary']
    # --- NEED TO FIX THIS ---

    # TODO: this uses the online `wikipedia` package (network access needed)
    # instead of the offline dump; blank lines in the article become spaces.
    docs[_url_word] = wikipedia.page(_url_word, auto_suggest=False).content.replace("\n\n", " ") # <--- NEED TO FIX THIS


Earth
Jupiter
Tiger
Cat
Felidae


In [20]:
docs

{'Earth': 'Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth\'s water is contained in its global ocean, covering 70.8% of Earth\'s crust. The remaining 29.2% of Earth\'s crust is land, most of which is located in the form of continental landmasses within Earth\'s land hemisphere. Most of Earth\'s land is somewhat humid and covered by vegetation, while large sheets of ice at Earth\'s polar deserts retain more water than Earth\'s groundwater, lakes, rivers and atmospheric water combined. Earth\'s crust consists of slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Earth has a liquid outer core that generates a magnetosphere capable of deflecting most of the destructive solar winds and cosmic radiation.\nEarth has a dynamic atmosphere, which sustains Earth\'s surfac

In [21]:
# Concatenate all article bodies into a single corpus string; each article
# is followed by a blank line.
data = "".join(f"{_content}\n\n" for _content in docs.values())
print(data)


Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean, covering 70.8% of Earth's crust. The remaining 29.2% of Earth's crust is land, most of which is located in the form of continental landmasses within Earth's land hemisphere. Most of Earth's land is somewhat humid and covered by vegetation, while large sheets of ice at Earth's polar deserts retain more water than Earth's groundwater, lakes, rivers and atmospheric water combined. Earth's crust consists of slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Earth has a liquid outer core that generates a magnetosphere capable of deflecting most of the destructive solar winds and cosmic radiation.
Earth has a dynamic atmosphere, which sustains Earth's surface conditions and prot

### RETRIEVER

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

SPLIT LARGE TEXT

In [23]:
# SPLITTING LARGE DOCS IN SMALLER CHUNKS
# Consecutive chunks share up to 60 units of text so that a fact cut at a
# chunk boundary still appears whole in one of them. Sizes are presumably
# in characters (the splitter's default length function) -- TODO confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=60)
chunks = text_splitter.split_text(data)
chunks


["Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean, covering 70.8% of Earth's crust.",
 "in its global ocean, covering 70.8% of Earth's crust. The remaining 29.2% of Earth's crust is land, most of which is located in the form of continental landmasses within Earth's land hemisphere. Most of Earth's land is somewhat humid and covered by vegetation, while large sheets of ice at Earth's",
 "covered by vegetation, while large sheets of ice at Earth's polar deserts retain more water than Earth's groundwater, lakes, rivers and atmospheric water combined. Earth's crust consists of slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Earth",
 'produce mountain ranges, volcanoes, and earthquakes. Earth has a liquid outer core that g

VECTORIZE AND STORE 

In [24]:
# MAKING TEXT EMBEDDING (VECTORIZING CHUNKS)
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
# model_kwargs = {'device': 'cpu'}
# Run the embedding model on the same device selected for the REBEL model.
model_kwargs = {'device': device}

# Embeddings are stored un-normalized.
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
                                    model_name=modelPath, 
                                    model_kwargs=model_kwargs, 
                                    encode_kwargs=encode_kwargs 
                                    )
# Embed every chunk and index the vectors in an in-memory FAISS store.
vector_store = FAISS.from_texts(chunks, embedding=embeddings)

In [48]:
statement.split('. ')

['The Sun rises in East direction',
 'The Earth is smaller than the planet Jupiter',
 'Tigers are from Cat family.']

In [27]:
user_qns = statement.split(". ")

In [50]:
# Exploratory retrieval: query the store with each of the first three
# sentences. NOTE: user_qn and relevant_doc are overwritten on every
# iteration, so after the loop they hold only the LAST sentence and its
# top-3 chunks.
user_qn = ""
relevant_doc = []
for i in user_qns[:3]:
    user_qn = i 
    relevant_doc = vector_store.similarity_search(user_qn, k = 3)

In [51]:
relevant_doc

[Document(page_content='The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera native to Asia. It has a powerful, muscular body with a large head and paws, a long tail, and distinctive black, mostly vertical stripes on orange fur. It was first scientifically described in 1758 and'),
 Document(page_content='=== Hunting and diet === The tiger is a carnivore and an apex predator feeding mainly on ungulates, with a particular preference for sambar deer, Manchurian wapiti, barasingha and wild boar. Tigers kill large ungulates like gaur and opportunistically, smaller prey like monkeys, peafowl and other'),
 Document(page_content='sequencing of 32 samples support six monophyletic tiger clades corresponding with the six living proposed subspecies and indicate they descended from a common ancestor around 110,000 years ago. Studies in 2021 and 2023 also affirmed the genetic distinctiveness and separation of these tigers.')]

### PREPARE MODEL

In [25]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
from transformers import pipeline, set_seed
from transformers import T5Tokenizer, T5ForConditionalGeneration

from llama3 import llama3




### llama3

In [26]:
llama3_rag_model = llama3()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [39]:
user_qn

'Tigers are from Cat family.'

In [40]:
relevant_doc

[Document(page_content='The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera native to Asia. It has a powerful, muscular body with a large head and paws, a long tail, and distinctive black, mostly vertical stripes on orange fur. It was first scientifically described in 1758 and'),
 Document(page_content='=== Hunting and diet === The tiger is a carnivore and an apex predator feeding mainly on ungulates, with a particular preference for sambar deer, Manchurian wapiti, barasingha and wild boar. Tigers kill large ungulates like gaur and opportunistically, smaller prey like monkeys, peafowl and other'),
 Document(page_content='sequencing of 32 samples support six monophyletic tiger clades corresponding with the six living proposed subspecies and indicate they descended from a common ancestor around 110,000 years ago. Studies in 2021 and 2023 also affirmed the genetic distinctiveness and separation of these tigers.')]

In [41]:
contextt = '\n'.join(['[DOC ' + str(i) + '] : '+docc.page_content for i, docc in enumerate(relevant_doc)])

In [42]:
prompt = f"""<<CONTEXT>>\n{contextt}\n\n<<CHECK>> {user_qn}"""

In [43]:
print(prompt)

<<CONTEXT>>
[DOC 0] : The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera native to Asia. It has a powerful, muscular body with a large head and paws, a long tail, and distinctive black, mostly vertical stripes on orange fur. It was first scientifically described in 1758 and
[DOC 1] : === Hunting and diet === The tiger is a carnivore and an apex predator feeding mainly on ungulates, with a particular preference for sambar deer, Manchurian wapiti, barasingha and wild boar. Tigers kill large ungulates like gaur and opportunistically, smaller prey like monkeys, peafowl and other
[DOC 2] : sequencing of 32 samples support six monophyletic tiger clades corresponding with the six living proposed subspecies and indicate they descended from a common ancestor around 110,000 years ago. Studies in 2021 and 2023 also affirmed the genetic distinctiveness and separation of these tigers.

<<CHECK>> Tigers are from Cat family.


In [44]:
llama3_rag_model.llama3_summary(user_qn, 
                                contextt, 
                                verbose=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



 --- Generating outputs --- 


 --- Got some response --- 
 

 --- Returning after decoding response... --- 



'True'

In [29]:
user_qns

['The Sun rises in East direction',
 'The Earth is smaller than the planet Jupiter',
 'Tigers are from Cat family.']

------ LOOP -------

In [34]:
def clean_shitt(shitt):
    """Map a short free-text verdict from the LLM to a numeric label.

    The response is lower-cased and split on whitespace. Punctuation around
    each word is ignored, so "True.", "**false**" or "(true)" are recognised
    the same way as the bare words.

    Args:
        shitt: Raw response string returned by the model.

    Returns:
        ``None`` when the response has more than 4 whitespace-separated
        words (treated as not being a verdict), ``1`` when it contains the
        word "true", ``0`` when it contains "false" (and not "true"), and
        ``-1`` otherwise.
    """
    import string

    shitts = shitt.lower().split()
    if len(shitts) > 4:
        return None

    # Strip surrounding punctuation; the length check above deliberately
    # uses the raw word count.
    words = {s.strip(string.punctuation) for s in shitts}
    if "true" in words:
        return 1
    if "false" in words:
        return 0
    return -1
    
def calc_truth(truth_dict):
    """Summarise per-sentence verdicts as fractions of the total.

    ``truth_dict`` maps a sentence id to a verdict: 1 (true), 0 (false) or
    -1 (neither). Any other value (for example ``None``) still counts
    towards the total but towards none of the three fractions, so the
    fractions need not add up to 1.

    Returns:
        A dict with the keys "True", "False" and "PantsOnFire" holding the
        share of verdicts equal to 1, 0 and -1 respectively. All three are
        0 when ``truth_dict`` is empty.
    """
    verdicts = list(truth_dict.values())
    n_total = len(truth_dict)

    def _share(label):
        # Avoid dividing by zero for an empty input.
        if n_total <= 0:
            return 0
        return len([v for v in verdicts if v == label]) / n_total

    return {
        "True" : _share(1),
        "False" : _share(0),
        "PantsOnFire" : _share(-1)
    }




In [35]:
# Fact-check every sentence: retrieve its 3 most similar chunks, ask the
# LLM for a verdict and store the cleaned label under the sentence index.
sentence_truth = {}
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can only show
# a total and an ETA when it wraps the sized list itself.
for sent_no, user_qn in enumerate(tqdm(user_qns, desc="Sentences")):

    print("--- Fetching relevant docs...")

    relevant_docs = vector_store.similarity_search(user_qn, k = 3)
    print("--- Relevant docs fetched...")

    contextt = '\n'.join(['[DOC ' + str(i) + '] : '+ docc.page_content 
                                            for i, docc in enumerate(relevant_docs)])

    # Kept for inspection in later cells; it is not passed to the model
    # call below, which receives the question and context separately.
    prompt = f"""<<CONTEXT>>\n{contextt}\n\n<<CHECK>> {user_qn}"""

    llama_shitt = llama3_rag_model.llama3_summary(user_qn, 
                                                contextt, 
                                                verbose=1)

    sentence_truth[sent_no] = clean_shitt(llama_shitt)
    print(sentence_truth)


Sentences: 0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


--- Fetching relevant docs...
--- Relevant docs fetched...
<<CONTEXT>>
[DOC 0] : of Capricorn faces the Sun. In each instance, winter occurs simultaneously in the opposite hemisphere.
[DOC 1] : Earth–Sun distance causes an increase of about 6.8% in solar energy reaching Earth at perihelion relative to aphelion. Because the Southern Hemisphere is tilted toward the Sun at about the same time that Earth reaches the closest approach to the Sun, the Southern Hemisphere receives slightly more
[DOC 2] : to the Sun, the Southern Hemisphere receives slightly more energy from the Sun than does the northern over the course of a year. This effect is much less significant than the total energy change due to the axial tilt, and most of the excess energy is absorbed by the higher proportion of water in

<<CHECK>> The Sun rises in East direction

 --- Generating outputs --- 



Sentences: 1it [00:15, 15.97s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



 --- Got some response --- 
 

 --- Returning after decoding response... --- 

{0: 1}
--- Fetching relevant docs...
--- Relevant docs fetched...
<<CONTEXT>>
[DOC 0] : across its equator is longer than the diameter measured between its poles. On Jupiter, the equatorial diameter is 9,276 km (5,764 mi) longer than the polar diameter.
[DOC 1] : === Size and mass === Jupiter's mass is 318 times that of Earth; 2.5 times that of all the other planets in the Solar System combined. It is so massive that its barycentre with the Sun lies above the Sun's surface at 1.068 solar radii from the Sun's centre.: 6  Jupiter's radius is about one tenth
[DOC 2] : As a result, Jupiter is thought to have about as large a diameter as a planet of its composition and evolutionary history can achieve. The process of further shrinkage with increasing mass would continue until appreciable stellar ignition was achieved. Although Jupiter would need to be about 75

<<CHECK>> The Earth is smaller than the planet Jupi

Sentences: 2it [00:31, 15.82s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



 --- Got some response --- 
 

 --- Returning after decoding response... --- 

{0: 1, 1: 1}
--- Fetching relevant docs...
--- Relevant docs fetched...
<<CONTEXT>>
[DOC 0] : The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera native to Asia. It has a powerful, muscular body with a large head and paws, a long tail, and distinctive black, mostly vertical stripes on orange fur. It was first scientifically described in 1758 and
[DOC 1] : === Hunting and diet === The tiger is a carnivore and an apex predator feeding mainly on ungulates, with a particular preference for sambar deer, Manchurian wapiti, barasingha and wild boar. Tigers kill large ungulates like gaur and opportunistically, smaller prey like monkeys, peafowl and other
[DOC 2] : sequencing of 32 samples support six monophyletic tiger clades corresponding with the six living proposed subspecies and indicate they descended from a common ancestor around 110,000 years ago. Studies in 2021 

Sentences: 3it [00:47, 15.87s/it]


 --- Got some response --- 
 

 --- Returning after decoding response... --- 

{0: 1, 1: 1, 2: 1}





In [36]:
sentence_truth

{0: 1, 1: 1, 2: 1}

In [37]:
calc_truth(sentence_truth)

{'True': 1.0, 'False': 0.0, 'PantsOnFire': 0.0}