In [1]:
import re
import urllib
import json
import numpy as np
import pandas as pd
import transformers
import tensorflow as tf
import torch.nn as nn
import texthero as hero
from urllib.parse import urlsplit
import requests
import torch
from bs4 import BeautifulSoup
from requests import get
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from huggingface_hub import from_pretrained_keras
import spacy_sentence_bert

In [72]:
from summarizer import Summarizer,TransformerSummarizer

In [120]:
import warnings
# Suppress every Python warning from this point on so warning messages do not
# clutter the cell outputs. Note this also hides warnings that may matter.
warnings.filterwarnings('ignore')

## Search Google News

In [63]:
import math
from GoogleNews import GoogleNews
from datetime import datetime

In [62]:
# Module-level Google News client shared by get_links_from_google_news();
# created with lang='en' and switched to UTF-8 encoding.
googlenews = GoogleNews(lang='en')
googlenews.set_encode('utf-8')

In [69]:
def get_links_from_google_news(claim):
    """Search Google News for ``claim`` and return the result links, newest first.

    Uses the module-level ``googlenews`` client and calls its ``clear()``
    right after the results have been read. Only results whose ``'datetime'``
    field is an actual ``datetime`` instance are kept.
    """
    googlenews.search(claim)
    fetched = googlenews.results()
    googlenews.clear()

    # Drop entries that carry no usable timestamp.
    dated = [item for item in fetched if isinstance(item['datetime'], datetime)]

    # Order by the timestamp's string form, most recent first.
    newest_first = sorted(
        dated,
        key=lambda item: str(item['datetime']),
        reverse=True,
    )

    return [item['link'] for item in newest_first]

## Scraping Evidences

In [91]:
# Sentence-embedding pipeline used by get_sentences_from_link() to score how
# similar each scraped paragraph is to the claim.
nlp = spacy_sentence_bert.load_model('en_nli_roberta_base')

In [124]:
def get_sentences_from_link(link, text, top=10, min_similarity=0.5):
    """Download ``link`` and return the paragraphs most similar to ``text``.

    Args:
        link: URL of the page to download.
        text: Claim that every scraped paragraph is compared against.
        top: Maximum number of paragraphs to return.
        min_similarity: Paragraphs whose similarity to ``text`` is below this
            value are discarded. Defaults to 0.5, the previously hard-coded
            threshold.

    Returns:
        A ``(paragraphs, final_url)`` tuple. ``paragraphs`` is a list of
        ``[tag_name, paragraph_text]`` pairs ordered from most to least
        similar; ``final_url`` is the ``url`` attribute of the response.

    Raises:
        Whatever ``requests.get`` raises when the page cannot be fetched
        (for example on timeout); nothing is caught here.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept unchanged to preserve existing behaviour; consider enabling it.
    request = requests.get(link, verify=False, timeout=20)

    # Parse the downloaded HTML. The previous code iterated over an undefined
    # ``Soup`` name and therefore failed with NameError on a fresh kernel.
    soup = BeautifulSoup(request.content, 'html.parser')
    wanted_tags = ['p']

    results = []
    seen = set()
    for tag in soup.find_all(wanted_tags):
        paragraph = tag.text.strip()
        # Keep only paragraphs longer than 8 whitespace-separated tokens and
        # skip exact duplicates.
        if len(paragraph.split()) > 8 and paragraph not in seen:
            seen.add(paragraph)
            results.append([tag.name, paragraph])

    # Score every paragraph against the claim using the shared ``nlp`` model.
    claim_doc = nlp(text)
    scores = [claim_doc.similarity(nlp(entry[1])) for entry in results]

    # Highest similarity first, then apply the confidence threshold.
    ranked = sorted(zip(scores, results), reverse=True)
    high_conf = [entry for score, entry in ranked if score >= min_similarity]

    return high_conf[:top], request.url

In [122]:
def scrap_evidences(text, links):
    """Collect evidence paragraphs for ``text`` from every URL in ``links``.

    Returns a list with one ``[resolved_url, paragraphs]`` entry per input
    link, in input order, built from the ``(paragraphs, resolved_url)`` tuple
    that ``get_sentences_from_link`` returns.
    """
    # Lazily fetch each link in order; entries are swapped into [url, paragraphs].
    fetched = (get_sentences_from_link(url, text) for url in links)
    return [[resolved_url, paragraphs] for paragraphs, resolved_url in fetched]

## Concatenating Evidences

In [123]:
def concatenate_evidences(claim, links):
    summ = []
    for link in links:
        if type(link[1]) == list:
            for text in link[1]:
                if type(link[1]) == list:
                    summ.append(text[1])
                else:
                    summ.append(text)
        elif type(link[1]) == str:
            summ.append(link[1])

    urls = re.findall(r'https?:\/+\/+t+\.+co+\/+\S*', claim)
    
    for li in urls:
        claim = claim.replace(li, '')
    claim = claim.strip()

    if summ:
        summary = (claim, ' '.join(summ).replace('\n', '').replace('\t', ''))
    else:
        summary = ('', '')

    return summary

## Extractive Summary

In [74]:
# Shared extractive summarizer instance, called by generate_extractive_summary().
summarizer = Summarizer()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:
def generate_extractive_summary(evidence):
    """Return an extractive summary of ``evidence`` as a single string.

    Delegates to the module-level ``summarizer`` with ``min_length=60`` and
    joins whatever it returns with the empty string.
    """
    summary_parts = summarizer(evidence, min_length=60)
    return ''.join(summary_parts)

## Semantic Based Similarity

In [76]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentence_pairs: Array of ``[sentence1, sentence2]`` string pairs. It is
            indexed with an integer index array and ``.tolist()`` is called on
            the slice, so presumably a NumPy array -- TODO confirm for other
            callers.
        labels: Targets aligned with ``sentence_pairs``; only read when
            ``include_targets`` is ``True`` (may be ``None`` otherwise).
        batch_size: Number of pairs per batch.
        shuffle: Whether to shuffle the sample order on construction and at the
            end of every epoch.
        include_targets: ``True`` for training/validation (batches are
            ``(inputs, labels)``), ``False`` for inference (inputs only).
    """
    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=32,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use the bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch (a trailing partial batch
        # is dropped by the floor division).
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        # ``padding``/``truncation`` replace the deprecated ``pad_to_max_length``
        # flag and make the truncation to ``max_length`` explicit.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=128,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle the sample order when requested. Without this override the
        # call falls through to the base class and ``shuffle`` has no effect.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [92]:
# Keras model loaded from the Hugging Face Hub repo named below; used by predict().
model = from_pretrained_keras("keras-io/bert-semantic-similarity")
# Class names. predict() pairs labels[i] with the i-th model output, so this
# order presumably matches the model's output order -- TODO confirm.
labels = ["contradiction", "entailment", "neutral"]

config.json not found in HuggingFace Hub




In [78]:
def predict(sentence1, sentence2):
    """Score the relation between two sentences with the module-level ``model``.

    Both arguments are converted with ``str()`` and encoded as a single
    batch of one pair. Returns a dict mapping each entry of the module-level
    ``labels`` to the model output at the same position, as a Python float.
    """
    pair = np.array([[str(sentence1), str(sentence2)]])
    generator = BertSemanticDataGenerator(
        pair,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    first_batch = generator[0]
    probs = model.predict(first_batch)[0]

    scores = {}
    for position, name in enumerate(labels):
        scores[name] = float(probs[position])
    return scores

## Fake News Detection

In [118]:
def fake_news_detection(claim):
    """Run the whole pipeline for ``claim`` and print the label scores.

    Steps: search Google News for links, scrape evidence from them,
    concatenate the evidence, summarise it extractively, then compare the
    claim with the summary via ``predict``. Nothing is returned.
    """
    article_links = get_links_from_google_news(claim)
    scraped = scrap_evidences(claim, article_links)
    # Only the evidence text (second element) is needed from here on.
    _, evidence_text = concatenate_evidences(claim, scraped)
    summary = generate_extractive_summary(evidence_text)
    scores = predict(claim, summary)
    print(scores)

In [119]:
# Example run of the full pipeline on a sample claim; the scores are printed
# by fake_news_detection().
claim = "Narendra modi is a good person"
fake_news_detection(claim)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'contradiction': 0.03117767907679081, 'entailment': 8.52795856189914e-05, 'neutral': 0.9687370657920837}
