In [1]:
import spacy_sentence_bert

In [2]:
import re
import urllib

import json
import pandas as pd
import torch.nn as nn
import texthero as hero
from urllib.parse import urlsplit
import requests
import torch
from bs4 import BeautifulSoup
from requests import get
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import XLNetTokenizer, XLNetForSequenceClassification

In [3]:
from summarizer import Summarizer,TransformerSummarizer

In [4]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides the TLS warning normally raised for the
# verify=False request in get_sentences_from_link, as well as library
# deprecation notices - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [6]:
# Sentence-embedding pipeline shared by filter_data and
# get_sentences_from_link; both call `nlp(...)` and compare the resulting
# documents with `.similarity(...)`.
nlp = spacy_sentence_bert.load_model('en_nli_roberta_base')

In [5]:
# Example claim that the remaining cells fact-check end to end.
text = "Narendra modi is the prime minister of USA"

In [7]:
def filter_data(text, res, top=5):
    """Split search hits into high- and low-confidence groups.

    Each hit's snippet (``hit[1]``) is compared with ``text`` using the
    module-level ``nlp`` pipeline. Hits are ordered by descending similarity.

    Args:
        text: The claim to compare against.
        res: Iterable of hits; index 1 of each hit is the snippet text.
        top: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(high_conf, low_conf)``. ``high_conf`` holds hits scoring
        ``>= 0.7``; ``low_conf`` holds hits scoring in ``[0.3, 0.7)``.
        Hits below 0.3 are dropped.
    """
    claim_doc = nlp(text)
    scored = [(claim_doc.similarity(nlp(hit[1])), hit) for hit in res]
    scored.sort(reverse=True)

    high_conf, low_conf = [], []
    for score, hit in scored:
        if score >= 0.7:
            high_conf.append(hit)
        elif score >= 0.3:
            low_conf.append(hit)
    return high_conf, low_conf

In [8]:
def search(term, num_results=10, lang="en"):
    """Scrape one page of Google search results for ``term``.

    Args:
        term: Query string. Only spaces are rewritten (to ``+``) before the
            term is placed in the URL, so callers should pass an already
            URL-encoded term.
        num_results: Desired number of results; the request asks Google for
            ``num_results + 1``.
        lang: Value sent as the ``hl`` query parameter.

    Returns:
        list: ``(href, snippet_text)`` tuples, one per ``div.g`` result block
        that has a link, an ``h3`` title and at least one ``span``.

    Raises:
        requests.HTTPError: If Google answers with an error status.
        requests.Timeout: If no response arrives within 20 seconds.
    """
    usr_agent = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'}

    def fetch_results(search_term, number_results, language_code):
        """Download the raw HTML of the results page."""
        escaped_search_term = search_term.replace(' ', '+')

        google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
            escaped_search_term, number_results + 1, language_code)
        # Same 20 s timeout as the other request in this notebook, so a
        # stalled connection cannot hang the kernel indefinitely.
        response = get(google_url, headers=usr_agent, timeout=20)
        response.raise_for_status()

        return response.text

    def parse_results(raw_html):
        """Yield ``(href, snippet_text)`` for every usable result block."""
        soup = BeautifulSoup(raw_html, 'html.parser')
        for result in soup.find_all('div', attrs={'class': 'g'}):
            link = result.find('a', href=True)
            title = result.find('h3')
            spans = result.find_all('span')
            # The last <span> is taken as the snippet. Blocks without any
            # span are skipped instead of raising IndexError.
            if link and title and spans:
                yield link['href'], spans[-1].text

    html = fetch_results(term, num_results, lang)
    return list(parse_results(html))

In [9]:
def get_search_results(text, res, top=5):
    """Search the web for ``text`` and keep the most relevant hits.

    Args:
        text: The claim to search for.
        res: Ignored; it is overwritten by the fresh search results. Kept
            only for interface compatibility.
        top: Maximum number of low-confidence hits appended to the result.

    Returns:
        list: All high-confidence hits followed by at most ``top``
        low-confidence hits, as classified by ``filter_data``.
    """
    blacklisted_phras = ('.pdf', '.xlsx', '.csv', '/download', 'facebook.com', 'youtube.com', 'patrika.com',
                         'maharashtratimes.com', 'books.google', '.txt', '.vocab')

    res = search(urllib.parse.quote_plus(text))

    # Drop every hit whose URL contains one of the blacklisted fragments.
    filtered_res = [
        hit for hit in res
        if not any(phrase in hit[0] for phrase in blacklisted_phras)
    ]

    high_conf, low_conf = filter_data(text, filtered_res)
    return high_conf + low_conf[:top]
    
#     if high_conf:
#         return high_conf
#     else:
#         return low_conf[:top]


In [10]:
def get_sentences_from_link(link, text, top=5):
    """Fetch ``link`` and return the paragraphs most similar to ``text``.

    Args:
        link: URL to download.
        text: The claim each paragraph is compared with (via the
            module-level ``nlp`` pipeline).
        top: Maximum number of paragraphs to return.

    Returns:
        tuple: ``(sentences, final_url)``. ``sentences`` is a list of
        ``[tag_name, paragraph_text]`` entries with similarity ``>= 0.5``,
        ordered by descending similarity and truncated to ``top``. It is
        empty for Twitter/Facebook pages and PDF payloads. ``final_url`` is
        the URL reported by the response (after redirects).

    Raises:
        requests.RequestException: Propagated from ``requests.get``.
    """
    # NOTE(review): verify=False disables TLS certificate validation for
    # arbitrary third-party links. It is kept to preserve the existing
    # best-effort behavior - confirm this is intentional.
    request = requests.get(link, verify=False, timeout=20)
    final_url = request.url

    # Bail out before paying for an HTML parse on pages we never use.
    if 'twitter.com' in final_url or 'facebook.com' in final_url:
        return [], final_url
    if '%PDF-' in request.text:
        return [], final_url

    soup = BeautifulSoup(request.text, 'lxml')

    # Only <p> tags are considered. Keep paragraphs longer than 8
    # whitespace-separated tokens, de-duplicated on their stripped text.
    heading_tags = ['p']
    results = []
    seen = set()
    for tag in soup.find_all(heading_tags):
        paragraph = tag.text.strip()
        if len(paragraph.split()) > 8 and paragraph not in seen:
            seen.add(paragraph)
            results.append([tag.name, paragraph])

    if not results:
        # Nothing to score, so skip the embedding model entirely.
        return [], final_url

    claim_doc = nlp(text)
    sim = [claim_doc.similarity(nlp(entry[1])) for entry in results]
    ranked = sorted(zip(sim, results), reverse=True)
    high_conf = [entry for score, entry in ranked if score >= 0.5]

    return high_conf[:top], final_url

In [11]:
def get_evidence_links(text):
    """Collect evidence sentences for the claim ``text``.

    If ``text`` contains ``t.co/`` links, those links are followed;
    otherwise the claim is searched on the web via ``get_search_results``.

    Args:
        text: The claim, optionally containing ``t.co`` short links.

    Returns:
        list: ``[final_url, sentences]`` pairs, one per visited link, as
        returned by ``get_sentences_from_link``.
    """
    if 't.co/' in text:
        targets = []
        for short_url in re.findall(r'https?:\/+\/+t+\.+co+\/+\S*', text):
            # Remove encoded spaces and stray dots picked up from the
            # surrounding sentence.
            cleaned = short_url.replace('.%20', '').replace('%20', '').strip('.').strip()
            if cleaned.endswith('.'):
                cleaned = cleaned[:-1]
            targets.append(cleaned)
    else:
        targets = [hit[0] for hit in get_search_results(text, [])]

    evidence = []
    for target in targets:
        sentences, final_url = get_sentences_from_link(target, text)
        evidence.append([final_url, sentences])
    return evidence

In [12]:
# Gather [final_url, sentences] evidence pairs for the example claim. This
# performs live HTTP requests, so results vary between runs.
links = get_evidence_links(text)

In [13]:
# Inspect the collected evidence.
links

[['https://in.usembassy.gov/readout-of-president-bidens-call-with-prime-minister-modi-of-india-2/',
  []],
 ['https://www.cnbctv18.com/world/narendra-modi-joe-biden-invites-state-visit-us-india-relations-15833021.htm',
  []],
 ['https://indianexpress.com/article/world/joe-biden-believed-to-have-invited-pm-modi-for-state-visit-to-us-8416535/',
  [['p',
    'US President Joe Biden is understood to have invited Prime Minister Narendra Modi for a state visit this summer, PTI has learnt.'],
   ['p',
    'Meanwhile, a senior administration official told reporters that Biden believes the partnership between India and the United States, which are the world’s leading knowledge economies, is essential to address major global challenges.'],
   ['p',
    '“While geopolitics is one dimension of what’s happening here, this is sort of more important, bigger than that. The US view is that our relationship with India is essential not just because of the way the world looks today, but rather that this i

In [14]:
def prepare_summary(claim, links):
    """Build the ``'Claim= ... Text= ...'`` string passed to the summarizer.

    Args:
        claim: The claim text; any ``t.co`` short links are removed from it.
        links: Iterable of ``[url, evidence]`` pairs. ``evidence`` is either
            a list whose items are ``[tag, sentence]`` pairs or plain
            strings, or a single string.

    Returns:
        str: ``'Claim= <claim> Text= <evidence>'`` with newlines and tabs
        removed from the evidence, or ``''`` when no evidence was found.
    """
    summ = []
    for link in links:
        evidence = link[1]
        if isinstance(evidence, list):
            for sentence in evidence:
                # Inspect the item itself. The previous code re-tested
                # link[1] here, so a plain-string item contributed only
                # its second character.
                if isinstance(sentence, (list, tuple)):
                    summ.append(sentence[1])
                else:
                    summ.append(sentence)
        elif isinstance(evidence, str):
            summ.append(evidence)

    # Strip t.co short links from the claim itself.
    for short_url in re.findall(r'https?:\/+\/+t+\.+co+\/+\S*', claim):
        claim = claim.replace(short_url, '')
    claim = claim.strip()

    if not summ:
        return ''
    return 'Claim= {} Text= {}'.format(claim, ' '.join(summ).replace('\n', '').replace('\t', ''))

In [15]:
# Flatten the collected sentences into one 'Claim= ... Text= ...' string.
evidence = prepare_summary(text, links)

In [16]:
# Inspect the assembled evidence string.
evidence

"Claim= Narendra modi is the prime minister of USA Text= US President Joe Biden is understood to have invited Prime Minister Narendra Modi for a state visit this summer, PTI has learnt. Meanwhile, a senior administration official told reporters that Biden believes the partnership between India and the United States, which are the world’s leading knowledge economies, is essential to address major global challenges. “While geopolitics is one dimension of what’s happening here, this is sort of more important, bigger than that. The US view is that our relationship with India is essential not just because of the way the world looks today, but rather that this is the next logical milestone of our relationship,” the official said. “We view that what’s going on here it’s really even larger than 2006 (the year of the India-US civil nuclear deal),” he said. On Tuesday, India’s National Security Advisor Ajit K Doval along with his American counterpart Jake Sullivan launched the India-US initiativ

In [17]:
# Build the extractive summarizer with its default settings; the log below
# shows it loading the bert-large-uncased checkpoint.
bert_model = Summarizer()
# Summarize the evidence string.
# NOTE(review): ''.join(...) is a no-op if the call already returns a str -
# confirm the summarizer's return type.
bert_summary = ''.join(bert_model(evidence, min_length=60))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [18]:
# Inspect the summary; it still carries the 'Claim=' / 'Text=' markers.
bert_summary

'Claim= Narendra modi is the prime minister of USA Text= US President Joe Biden is understood to have invited Prime Minister Narendra Modi for a state visit this summer, PTI has learnt. On Tuesday, India’s National Security Advisor Ajit K Doval along with his American counterpart Jake Sullivan launched the India-US initiative on Critical and Emerging Technologies or iCET, which officials on both sides described as a “Next Big Thing” in the bilateral relationship of the two countries. “ The Prime Minister had embarked on a high-level visit to Washington in September 2021, during which he held his maiden bilateral summit with Biden and attended the first in-person Quad summit. Prime Minister Shri Narendra Modi today received the US Secretary of Defence Ashton Carter.'

In [21]:
def preprocess(text):
    """Normalize ``text`` with texthero.

    The text is wrapped in a one-element ``pandas.Series``, passed through
    ``hero.remove_diacritics`` and then ``hero.remove_whitespace``, and
    unwrapped again.

    Args:
        text: The string to clean.

    Returns:
        The first (only) element of the cleaned series.
    """
    cleaned = hero.remove_whitespace(hero.remove_diacritics(pd.Series(text)))
    return cleaned.tolist()[0]

In [22]:
def _source_label(url):
    """Return the short source label for ``url`` shown in the SOURCES list."""
    if 'twitter.com' in url:
        # Keep scheme, host and the account segment only.
        base_url = '/'.join(url.split('/')[:4])
    else:
        base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
    base_url = base_url.replace('https://', '').replace('http://', '').replace('www.', '')
    # Drop a single trailing slash if present. The previous unconditional
    # [:-1] also chopped the last character of Twitter handles, whose form
    # has no trailing slash.
    return base_url[:-1] if base_url.endswith('/') else base_url


def prepare_csv(text, links, summary):
    """Assemble the classifier input: claim, sources and evidence summary.

    Args:
        text: The claim text.
        links: ``[url, sentences]`` pairs; only the URL of each is used. When
            empty, sources are derived from ``t.co`` links inside ``text``.
        summary: Evidence summary appended after ``[SEP]`` when ``links`` is
            non-empty; otherwise ``NA`` is used.

    Returns:
        The result of ``preprocess`` applied to
        ``'<claim> SOURCES: <labels> [SEP] <summary or NA>'`` after the
        ``Claim=`` / ``Text=`` markers and ``t.co`` links are removed.
    """
    tco_pattern = r'https?:\/+\/+t+\.+co+\/+\S*'

    if not links and 't.co' not in text:
        final_text = text + ' SOURCES:  [SEP] NA'
    else:
        if links:
            source_urls = [link[0] for link in links]
            evidence = summary
        else:
            source_urls = []
            for short_url in re.findall(tco_pattern, text):
                text = text.replace(short_url, '')
                # NOTE(review): get_orgi_url is not defined anywhere in the
                # visible notebook; presumably it resolves a short link to
                # its final URL - confirm it exists before this branch runs.
                source_urls.append(get_orgi_url(short_url))
            evidence = 'NA'

        # Unique labels, in first-seen order.
        used = []
        for url in source_urls:
            label = _source_label(url)
            if label not in used:
                used.append(label)

        text += ' SOURCES: '
        for label in used:
            text += label + ' '
        final_text = text + ' [SEP] ' + evidence

    final_text = final_text.replace('Text=', ' ').replace('Claim=', ' ')
    for short_url in re.findall(tco_pattern, final_text):
        if 't.co' in short_url:
            final_text = final_text.replace(short_url, ' ')

    return preprocess(final_text)

In [23]:
# Combine claim, source labels and summary into the classifier input string.
final = prepare_csv(text, links, bert_summary)

In [24]:
# Inspect the classifier input.
final

'Narendra modi is the prime minister of USA SOURCES: in.usembassy.gov cnbctv18.com indianexpress.com indiatoday.in pmindia.gov.in [SEP] Narendra modi is the prime minister of USA US President Joe Biden is understood to have invited Prime Minister Narendra Modi for a state visit this summer, PTI has learnt. On Tuesday, India\'s National Security Advisor Ajit K Doval along with his American counterpart Jake Sullivan launched the India-US initiative on Critical and Emerging Technologies or iCET, which officials on both sides described as a "Next Big Thing" in the bilateral relationship of the two countries. " The Prime Minister had embarked on a high-level visit to Washington in September 2021, during which he held his maiden bilateral summit with Biden and attended the first in-person Quad summit. Prime Minister Shri Narendra Modi today received the US Secretary of Defence Ashton Carter.'

In [35]:
print('Loading Classification Model ... ')
# NOTE(review): 'fakenews' is resolved against the Hugging Face Hub when no
# local directory of that name exists, which is what produced the 404 shown
# below. Presumably a local fine-tuned checkpoint directory is expected -
# confirm the path.
tokenizer = XLNetTokenizer.from_pretrained('fakenews')
# Two-label sequence classifier loaded from the same checkpoint name;
# hidden states are returned, attentions are not.
model = XLNetForSequenceClassification.from_pretrained("fakenews", num_labels=2,
                                                                   output_attentions=False,
                                                                   output_hidden_states=True)

Loading Classification Model ... 


HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/api/models/fakenews

In [26]:
def classify(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to classify. It is padded or truncated to exactly
            512 tokens.

    Returns:
        tuple: ``(label, confidence)``. ``label`` is ``True`` when class 0 is
        predicted and ``False`` when class 1 is predicted; ``confidence`` is
        the softmax probability of the predicted class.
    """
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=512,
        add_special_tokens=True,
        return_token_type_ids=False,
        # `pad_to_max_length` is deprecated; these two arguments state the
        # same padding and the previously implicit truncation explicitly.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids']
    attention_mask = encoded_review['attention_mask']

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    logits = output[0]
    # Explicit class dimension; nn.Softmax() without `dim` relies on a
    # deprecated implicit choice.
    probabilities = torch.softmax(logits, dim=1)
    prediction = int(torch.argmax(logits, dim=1).item())
    confidence = probabilities[0][prediction].item()
    return {0: True, 1: False}[prediction], confidence

In [27]:
# Classify the assembled input. This needs `tokenizer` and `model` from the
# loading cell, which failed in the recorded run (hence the NameError below).
prediction, confidence = classify(final)

NameError: name 'tokenizer' is not defined

In [32]:
import getpass
import os

from huggingface_hub import HfApi, HfFolder

# Never commit an access token to a notebook. Read it from the HF_TOKEN
# environment variable, or prompt for it interactively when it is unset.
# The token that was previously hardcoded here is exposed and should be
# revoked in the Hugging Face account settings.
token = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face access token: ')

In [34]:
# Hand the access token to huggingface_hub.
api=HfApi()
# NOTE(review): set_access_token is presumably deprecated or removed in newer
# huggingface_hub releases - confirm against the installed version.
api.set_access_token(token)
folder = HfFolder()
# Presumably persists the token to the local Hugging Face token store so
# later sessions can reuse it - verify.
folder.save_token(token)