# IMPORTS

In [1]:
import pandas as pd 
import os
import json
import numpy as np

# from spacy.matcher import Matcher 
# from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from nltk.tokenize import sent_tokenize

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import math
import torch
import IPython
from pyvis.network import Network
import wikipedia

import pickle
import textwrap

from joblib import Parallel, delayed
import multiprocessing
from collections import Counter, defaultdict

from functools import partial
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz




# OFFLINE WIKI CLASS

In [2]:
from WikiDump_Search.wikiDumpSearch import offline_Wiki


# KB CLASS

In [3]:
class KB():
    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        for r in kb2.relations:
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        # if different article
        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            r1["meta"][article_url] = r2["meta"][article_url]

        # if existing article
        
        else:
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add
            

    def get_wikipedia_data(self, candidate_entity, useWiki = True, offline_wiki = None, verbose = False):
        # print("\n\n--- offline", offline_Wiki)
        entity_data = None
        stop_words = set(stopwords.words('english'))
        if len(candidate_entity.split()) > 4:
            word_tokens = word_tokenize(candidate_entity)
            candidate_entity = " ".join([w for w in word_tokens if not w.lower() in stop_words])

        try:
            if offline_wiki:
                if verbose:
                    print(f"Finding {candidate_entity} in offline Wiki")
                _entity_data = offline_wiki.word_match(candidate_entity, verbose = verbose)
                
                if verbose:
                    print(f"Got {_entity_data} after word_match from offline Wiki")

                if "REDIRECT" in _entity_data["summary"][:10]:
                    entity_data = _entity_data
                else:                    
                    ratioo = fuzz.ratio(candidate_entity, _entity_data['title'])
                    if verbose:
                        print(f"Fuzz ration : {ratioo}")
                    if ratioo > 50 :
                        entity_data = _entity_data
                        if verbose:
                            print(f"Got {entity_data} from offline wiki with similarity ration = {ratioo}.")
            
            if useWiki and not entity_data:
                if verbose:
                    print(f"Finding {candidate_entity} in online Wiki")
                page = wikipedia.page(candidate_entity, auto_suggest=False)
                entity_data = {
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary
                }
            
            return entity_data
        except:
            return None

    def add_entity(self, e):
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date, 
                     useWiki = True, offlineWiki = None, verbose = False):
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        if verbose:
            print(f"Candidate entities : {candidate_entities}")
            
        # entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]
        
        # TRY 2
        entities = []
        if useWiki:
            entities = Parallel(n_jobs=N_JOB_COUNT)(delayed(self.get_wikipedia_data)(ent, useWiki, offlineWiki, verbose=verbose) for ent in candidate_entities)
            # entities = [self.get_wikipedia_data(ent, useWiki, offlineWiki) for ent in candidate_entities]

        else:
            entities = [{"title": ent,
                         "url": "",
                         "summary": ""
                        } for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

# FOR GPU

In [4]:
def get_cpu_count():
    c = multiprocessing.cpu_count()
    return c 

def check_gpu():
    for i in range(torch.cuda.device_count()):
        device_name = f'cuda:{i}'
        print(f'{i} device name:{torch.cuda.get_device_name(torch.device(device_name))}')

def get_gpu():
    return [f'cuda:{i}' for i in range(torch.cuda.device_count())]

print(check_gpu())

print(get_gpu())

print(get_cpu_count())

N_JOB_COUNT = get_cpu_count()//2
N_JOB_COUNT = -1 # PARALLEL CAUSES ERROR, SO KEEPING IT 1
N_JOB_COUNT = 4

0 device name:NVIDIA GeForce RTX 3060 Laptop GPU
None
['cuda:0']
16


# HELPER FUNCTION

In [5]:
def loadJSON(filepathh):
    _dataa = {} 
    if os.path.exists(filepathh):
        with open(filepathh, "r", encoding="utf-8") as _f:
            _dataa = json.load(_f)
    else:
        print(f"{filepathh} does not exists...\n") 
    return _dataa 

def loadTXT(filepathh):
    _dataa = ""
    if os.path.exists(filepathh):
        with open(filepathh, "r", encoding="utf-8") as _f:
            _dataa = _f.read()
    else:
        print(f"{filepathh} does not exists...\n") 
    return _dataa 

def loadFILE(filepathh = ""):
    if os.path.exists(filepathh):
        if filepathh.endswith(".txt"):
            return loadTXT(filepathh)
        elif filepathh.endswith(".json"):
            return loadJSON(filepathh)
        else:
            print("\n- Invalid File format 😐 !!!\n")
            return None
    else:
        print(f"{filepathh} does not exists...\n") 

def remove_garbage(text):
    # Remove garbage Unicode characters
    cleaned_text = text.encode().decode('unicode-escape')
    # Remove any remaining non-printable characters
    cleaned_text = re.sub(r'[^\x20-\x7E]', '', cleaned_text)
    return cleaned_text

def clean_sentence(sentence):
    # Remove extra white spaces
    cleaned_sentence = re.sub(r'\s+', ' ', sentence)
    # Remove unwanted characters except alphabets, numbers, punctuation marks, '@', '-', and '_'
    cleaned_sentence = re.sub(r'[^a-zA-Z0-9@#\-_.,?!\'" ]', '', cleaned_sentence)
    # Remove words containing '#' and 'pic.twitter.com'
    cleaned_sentence = ' '.join(word if '#' not in word and 'pic.twitter.com' not in word else ' ' for word in cleaned_sentence.split() )
    return cleaned_sentence.strip()

def clean_document(document):
    document = remove_garbage(document)
    # Tokenize the document into sentences
    sentences = sent_tokenize(document)
    # Clean each sentence
    cleaned_sentences = [clean_sentence(sentence) for sentence in sentences]
    return cleaned_sentences

In [6]:
def save_network_html(kb, filename="network.html", 
                      verbose = False, 
                      physics = False,
                      show = False):

    if not os.path.exists(filename):
        with open(filename, 'w') as _file:
            _file.write("")

    # create network
    G = nx.Graph()
    net = Network(directed=True, 
                  notebook=True,
                  width="1000px", 
                  height="1000px",
                #   bgcolor="#eeeeee"
                )
    if verbose:
        print("Network initialized")

    # nodes
    color_entity = "#00FF00"
    if verbose:
        print(f"there are {len(kb.entities)} entities in KB")
    for e in kb.entities:
        G.add_node(e)
        net.add_node(e, label=e, shape="dot", color=color_entity)
        # net.add_node(e, label=e, physics = physics, shape="dot", color=color_entity)
    
    # edges
    if verbose:
        print(f"there are {len(kb.relations)} relations in KB")
    
    # for r in kb.relations:
    #     G.add_edge(r['head'], r["tail"], )
    #     # net.add_edge(r["head"], r["tail"], title=r["type"], label=r["type"])
    labels = {}
    for r in kb.relations:
        G.add_edge(r['head'], r["tail"])
        labels[(r["head"], r["tail"])] = r["type"]
        net.add_edge(r["head"], r["tail"], title=r["type"], label=r["type"])

    scale=10 # Scaling the size of the nodes by 10*degree
    d = dict(G.degree)

    pos = nx.spring_layout(G)
    #Updating dict
    d.update((x, scale*y) for x, y in d.items())

    #Setting up size attribute
    nx.set_node_attributes(G,d,'size')
    nx.set_edge_attributes(G,labels, 'labels')
    # nx.draw_networkx_edge_labels(
    #                             G, pos,
    #                             edge_labels=labels,
    #                             # font_color='red'
    #                             )
    if verbose:
        print(f"Trying to make graph")

    # net.from_nx(G)   
    
    # save network
    if physics:
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=200,
            spring_strength=0.05,
            damping=0.09
        )

    net.set_edge_smooth('dynamic')

    if verbose:
        print(f"Trying to show graph")

    net.show(filename)

def save_kb(kb, filename, verbose = False):
    if verbose:
        print(f"there are {len(kb.entities)} entities in KB")
        print(f"there are {len(kb.relations)} relations in KB")

    with open(filename, "wb") as f:
        pickle.dump(kb, f)

def load_kb(filename):
    res = None
    with open(filename, "rb") as f:
        res = pickle.load(f)
    return res


# DOING NER

In [7]:
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations

In [8]:
def split_docs(text, 
               max_text_count = 1000,
               verbose = False):
    
    sentences = sent_tokenize(text)
    chunks = []
    chunk = ""
    len_sentence = 0
    chunk_len = 0
    for sentence in sentences:
      len_sentence = len(sentence.strip().split())
      if chunk_len+len_sentence < max_text_count:
         chunk += sentence+" "
         chunk_len += len_sentence
         continue
      chunk_len = len_sentence
      chunks.append(chunk)
      chunk = sentence
      
      # chunk = textwrap.wrap(sentence, max_text_count)
    if verbose:
      print(len(chunks))
    return chunks

def _from_text_to_kb(text, article_url, kb = None,
                    useGPU=0, 
                    span_length=128, 
                    article_title=None,
                    article_publish_date=None, 
                    verbose=False,
                    useWiki=True,
                    offline_Wiki = None):
    
    # tokenize whole text
    # print(text)
    # input()
    with torch.no_grad():
        inputs = tokenizer([text], 
                        max_length = 1000,
                        #    max_length=512,
                        padding=True,  
                        truncation=True, 
                        return_tensors="pt")

        # compute span boundaries
        # print(inputs.values())
        num_tokens = len(inputs["input_ids"][0])
        if verbose:
            print(f"Input has {num_tokens} tokens")
        num_spans = math.ceil(num_tokens / span_length)
        
        if verbose:
            print(f"Input has {num_spans} spans")
        overlap = math.ceil((num_spans * span_length - num_tokens) / 
                            max(num_spans - 1, 1))
        
        # input()
        spans_boundaries = []
        start = 0
        for i in range(num_spans):
            spans_boundaries.append([start + span_length * i,
                                    start + span_length * (i + 1)])
            start -= overlap
        if verbose:
            print(f"Span boundaries are {spans_boundaries}")

        # transform input with spans
        tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
        tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                        for boundary in spans_boundaries]
        inputs = {
            "input_ids": torch.stack(tensor_ids),
            "attention_mask": torch.stack(tensor_masks)
        }
        
        # generate relations
        num_return_sequences = 3
        gen_kwargs = {
            "max_length": 256,
            "length_penalty": 0,
            "num_beams": 3,
            "num_return_sequences": num_return_sequences
        }

        generated_tokens = model.generate(
                                            inputs["input_ids"].to(model.device),
                                            attention_mask=inputs["attention_mask"].to(model.device),
                                            **gen_kwargs,
                                            )

        # decode relations
        decoded_preds = tokenizer.batch_decode(generated_tokens,
                                            skip_special_tokens=False)

        # create kb
        if not kb:
            kb = KB()

        i = 0
        # for sentence_pred in tqdm(decoded_preds, leave=False):
        _relations = Parallel(n_jobs=N_JOB_COUNT)(delayed(extract_relations_from_model_output)(sentence_pred) for sentence_pred in decoded_preds)

        for sentence_pred in decoded_preds:
            current_span_index = i // num_return_sequences
            # relations = extract_relations_from_model_output(sentence_pred)
            relations = _relations[i]

            if verbose:
                print(f"{i}. extraction of relations done, it has {len(relations)} relations", end="\r")
                
            for relation in relations:
                relation["meta"] = {
                    article_url: {
                        "spans": [spans_boundaries[current_span_index]]
                    }
                }
                kb.add_relation(relation, 
                                article_title,
                                article_publish_date, 
                                useWiki=useWiki,
                                offlineWiki=offline_Wiki,
                                verbose=verbose)
            i += 1

    return kb

def from_text_to_kb(text, article_url, kb = None,
                    useGPU=0, 
                    span_length=128, 
                    article_title=None,
                    article_publish_date=None, 
                    verbose=False,
                    max_token = 1000,
                    max_doc_text = 1000,
                    useWiki = True,
                    offlineWiki = None):
    # with torch.no_grad():
    #     # tokenize whole text
    #     # inputs = tokenizer([text], return_tensors="pt")
    #     # num_tokens = len(inputs["input_ids"][0])

    input_words = text.split()
    num_tokens = len(input_words)

    if verbose:
        # print(f"Input has {num_tokens} tokens")
        print(f"Input has {num_tokens} words")

    if not kb:
        kb = KB()
    
    _kb = kb 

    _offlineWiki = offlineWiki

    # compute span boundaries
    # num_tokens = len(inputs["input_ids"][0])
    if num_tokens > max_token:
        if verbose:
            print("input len > token size, splitting doc in smaller chunks")
        text = split_docs(text, max_text_count=max_doc_text)
    
    if type(text) == str:
        text = [text]
    
    # for _text in tqdm(text, leave=False):
    for _text in text: 
        # print(_text)
        # print(_text[0])
        # input()
        _kb = _from_text_to_kb(_text, article_url, 
                            useGPU=useGPU, 
                            span_length=span_length, 
                            article_title=article_title,
                            article_publish_date=article_publish_date, 
                            verbose=verbose,
                            kb=_kb,
                            useWiki=useWiki,
                            offline_Wiki=_offlineWiki)
    return _kb
                

In [9]:
def save_network_html(kb, filename="network.html", 
                      verbose = False, 
                      physics = False,
                      show = False):

    if not os.path.exists(filename):
        with open(filename, 'w') as _file:
            _file.write("")

    # create network
    G = nx.Graph()
    net = Network(directed=True, 
                  notebook=True,
                  width="1000px", 
                  height="1000px",
                #   bgcolor="#eeeeee"
                )
    if verbose:
        print("Network initialized")

    # nodes
    color_entity = "#00FF00"
    if verbose:
        print(f"there are {len(kb.entities)} entities in KB")
    for e in kb.entities:
        G.add_node(e)
        net.add_node(e, label=e, shape="dot", color=color_entity)
        # net.add_node(e, label=e, physics = physics, shape="dot", color=color_entity)
    
    # edges
    if verbose:
        print(f"there are {len(kb.relations)} relations in KB")
    
    # for r in kb.relations:
    #     G.add_edge(r['head'], r["tail"], )
    #     # net.add_edge(r["head"], r["tail"], title=r["type"], label=r["type"])
    labels = {}
    for r in kb.relations:
        G.add_edge(r['head'], r["tail"])
        labels[(r["head"], r["tail"])] = r["type"]
        net.add_edge(r["head"], r["tail"], title=r["type"], label=r["type"])

    scale=10 # Scaling the size of the nodes by 10*degree
    d = dict(G.degree)

    pos = nx.spring_layout(G)
    #Updating dict
    d.update((x, scale*y) for x, y in d.items())

    #Setting up size attribute
    nx.set_node_attributes(G,d,'size')
    nx.set_edge_attributes(G,labels, 'labels')
    # nx.draw_networkx_edge_labels(
    #                             G, pos,
    #                             edge_labels=labels,
    #                             # font_color='red'
    #                             )
    if verbose:
        print(f"Trying to make graph")

    # net.from_nx(G)   
    
    # save network
    if physics:
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=200,
            spring_strength=0.05,
            damping=0.09
        )

    net.set_edge_smooth('dynamic')

    if verbose:
        print(f"Trying to show graph")

    net.show(filename)

# --- BREAK ---

# EXP

### LOAD MODEL

In [10]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [11]:
model.to(get_gpu()[0])


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50272, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50272, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

### LOADING OFFLINE WIKI

In [12]:
WIKI_INDEX_FILE = "D://WikiDump/enwiki-20240220-pages-articles-multistream-index.txt/enwiki-20240220-pages-articles-multistream-index.txt"
WIKI_BZ2_FILE = "D://WikiDump/enwiki-20240220-pages-articles-multistream.xml.bz2"

INDEX_FOLDER = "./WikiDump_Search/indexes/"

In [13]:
offline_wikipedia = offline_Wiki(wiki_index_file=WIKI_INDEX_FILE,
                                wikiDump_bz2_file=WIKI_BZ2_FILE, verbose=True,
                                index_folder=INDEX_FOLDER)

In [14]:
# testing
print(offline_wikipedia.word_match("tigers", summaryOnly=False))
input()

test_words = ["sachin", "trump", "wedding", "white house", "modi", "apple", "windows 11"]
for test_word in test_words:
    print(offline_wikipedia.word_match(test_word))

{'title': 'Tigers', 'url': 'https://en.wikipedia.org/wiki/Tiger', 'summary': 'REDIRECT Tiger'}
{'title': 'Sachin', 'url': 'https://en.wikipedia.org/wiki/Sachin', 'summary': 'Sachin may refer to:\nSachin (given name), an Indian given name, including a list of people with the name\n* Sachin (actor) (born 1957), Indian actor and filmmaker\n* Sachin (boxer), Indian boxer\n* Sachin Tendulkar (born 1973), Indian cricketer\n'}
{'title': 'Trump', 'url': 'https://en.wikipedia.org/wiki/Trump', 'summary': 'Trump most commonly refers to:\nDonald Trump (born 1946), President of the United States from 2017 to 2021 \nTrump (card games), any playing card given an ad-hoc high rank\nTrump may also refer to:\n'}
{'title': 'Wedding', 'url': 'https://en.wikipedia.org/wiki/Wedding', 'summary': 'A wedding is a ceremony where two people are united in marriage. Wedding traditions and customs vary greatly between cultures, ethnic groups, races, religions, denominations, countries, social classes, and sexual ori

## LIAR

In [15]:
store_folder = "./LIAR_data/"
statements_file = "./statements.json"


In [16]:
data_filepath = loadFILE(store_folder+statements_file) 

##### FOR FAKE DATA (INTERMEDIATE)

In [17]:
countt = 0
for d_text in data_filepath["fake"]:
    print(" ".join(clean_document(d_text)))
    if countt > 5:
        break
    print()
    countt+=1

The things I said during the prank call by a blogger posing as GOP contributor David Koch are the things Ive said publicly all along about the Wisconsin budget debate.

They delayed my swearing-in here in Massachusetts for a couple weeks so they could ram the health care law through.

1 percent of candidates that the National Rifle Association endorsed in 2012 won.

Were the most highly taxed nation in the world.

I have voted every year in Wisconsin.

Obama "remains silent" and "is never asked how he feels about his church honoring Farrakhan."

Barack Hussein Obama will force doctors to assist homosexuals in buying surrogate babies.


In [18]:
texts = []
_data = data_filepath["fake"]
articles_limit = 3542
# articles_limit = int(input(f"enter number of docs to take (max : {len(_data)}): "))
print(f"taking {articles_limit} docs in KG")

countt = 0
for d_text in _data:
    _txt = " ".join(clean_document(d_text))
    # if len(_txt.split()) > 2000:
    #     continue
    texts.append(_txt)
    countt += 1
    if countt > articles_limit:
        break

taking 3542 docs in KG


In [21]:
len(texts)

3541

In [22]:
max_lenn = 1000
spann = 64

In [23]:
kb = KB()
max_lenn = 1000
for text in tqdm(texts):
    # print(len(text.split()))
    # print(text)
    kb = from_text_to_kb(text, "", kb = kb,
                         useGPU = 1, 
                         verbose = 0, 
                         span_length = spann, 
                         max_doc_text = max_lenn,
                         useWiki = 0,                       # KEEPING IT 0, AS FOR NOW WE ONLY NEED THE POSSIBLE ENTITIES
                         offlineWiki = offline_wikipedia)
    # kb.print()
    # input()

100%|██████████| 3541/3541 [17:54<00:00,  3.30it/s]


In [24]:
len(kb.entities), kb.print()

Entities:
  ('David Koch', {'url': '', 'summary': ''})
  ('GOP', {'url': '', 'summary': ''})
  ('Wisconsin budget debate', {'url': '', 'summary': ''})
  ('ram the health care law through', {'url': '', 'summary': ''})
  ('Massachusetts', {'url': '', 'summary': ''})
  ('health care law', {'url': '', 'summary': ''})
  ('2012', {'url': '', 'summary': ''})
  ('National Rifle Association endorsed in 2012', {'url': '', 'summary': ''})
  ('National Rifle Association', {'url': '', 'summary': ''})
  ('tax', {'url': '', 'summary': ''})
  ('nation', {'url': '', 'summary': ''})
  ('most highly taxed nation', {'url': '', 'summary': ''})
  ('world', {'url': '', 'summary': ''})
  ('voted every year', {'url': '', 'summary': ''})
  ('Wisconsin', {'url': '', 'summary': ''})
  ('every year', {'url': '', 'summary': ''})
  ('state', {'url': '', 'summary': ''})
  ('his church honoring Farrakhan', {'url': '', 'summary': ''})
  ('Obama', {'url': '', 'summary': ''})
  ('his church', {'url': '', 'summary': ''})


(6514, None)

In [25]:
filename = f"LIAR_fake_entites_intermediate_RAW_{articles_limit}.html"
save_kb(kb, os.path.basename(filename)[:-5]  + ".p")

##### FOR REAL DATA (INTERMEDIATE)

In [26]:
texts = []
_data = data_filepath["real"]
articles_limit = 4505
# articles_limit = int(input(f"enter number of docs to take (max : {len(_data)}): "))
print(f"taking {articles_limit} docs in KG")

countt = 0
for d_text in _data:
    _txt = " ".join(clean_document(d_text))
    # if len(_txt.split()) > 2000:
    #     continue
    texts.append(_txt)
    countt += 1
    if countt > articles_limit:
        break

taking 4505 docs in KG


In [27]:
len(texts)

4503

In [28]:
kb = KB()
max_lenn = 1000
for text in tqdm(texts):
    # print(len(text.split()))
    # print(text)
    kb = from_text_to_kb(text, "", kb = kb,
                         useGPU = 1, 
                         verbose = 0, 
                         span_length = spann, 
                         max_doc_text = max_lenn,
                         useWiki = 0,                       # KEEPING IT 0, AS FOR NOW WE ONLY NEED THE POSSIBLE ENTITIES
                         offlineWiki = offline_wikipedia)

100%|██████████| 4503/4503 [22:46<00:00,  3.29it/s]


In [29]:
len(kb.entities), kb.print()

Entities:
  ('New Jersey', {'url': '', 'summary': ''})
  ('New York', {'url': '', 'summary': ''})
  ('high-tax states', {'url': '', 'summary': ''})
  ('South', {'url': '', 'summary': ''})
  ('government', {'url': '', 'summary': ''})
  ('recession', {'url': '', 'summary': ''})
  ('its attempt to blunt the recession', {'url': '', 'summary': ''})
  ('personal income tax', {'url': '', 'summary': ''})
  ('U.S.', {'url': '', 'summary': ''})
  ('income tax', {'url': '', 'summary': ''})
  ('tax breaks', {'url': '', 'summary': ''})
  ('John Loughlin', {'url': '', 'summary': ''})
  ('voted to let people accused of domestic violence keep their guns', {'url': '', 'summary': ''})
  ('On torture', {'url': '', 'summary': ''})
  ('torture', {'url': '', 'summary': ''})
  ('Taylor Swift', {'url': '', 'summary': ''})
  ('national debt', {'url': '', 'summary': ''})
  ('interest', {'url': '', 'summary': ''})
  ('fast food worker', {'url': '', 'summary': ''})
  ('public assistance', {'url': '', 'summary': '

(7946, None)

In [30]:
filename = f"LIAR_real_entites_intermediate_RAW_{articles_limit}.html"
save_kb(kb, os.path.basename(filename)[:-5]  + ".p")