# **Extract relevant topics for a query using BERT**

In [2]:
# Clone the assignment repository into ./Assignment; later cells change into that
# directory and read data/misinformation_papers.csv relative to it.
!git clone https://github.com/HardikMochi/Assignment.git

Cloning into 'Assignment'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 3), reused 12 (delta 1), pack-reused 0[K
Unpacking objects: 100% (14/14), done.


In [1]:
cd Assignment

/content/Assignment


In [2]:
!pip install faiss-cpu
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (1.1.0)


### Let's begin!

In [3]:

# Used to import data from local.
import pandas as pd

# Used to create the dense document vectors.
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity

# Used to create and store the Faiss index.
import faiss
import numpy as np
import pickle
from pathlib import Path
import time
import itertools

# Used to do vector searches and display the results.
from utils import vector_search, id2details

In [4]:
# Configuration for keyword-candidate extraction (read by get_keyword's CountVectorizer).
# Stop-word list to filter out of the candidates.
stop_words = "english"
# (min_n, max_n) n-gram sizes; (1, 1) means single words only.
n_gram_range = (1, 1)

Load the stored, pre-processed data


In [5]:
# Read the papers table from data/misinformation_papers.csv into a DataFrame.
# The path is relative to the current working directory.
df = pd.read_csv('data/misinformation_papers.csv')

## Max Sum Similarity
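The maximum-sum-distance criterion selects the set of data points whose pairwise distances are as large as possible. In our case, we first keep the candidates that are most similar to the document, and then pick the combination of those candidates that are least similar to one another. This maximizes the candidates' similarity to the document whilst minimizing the similarity between the selected candidates.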

In [6]:

def max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n, nr_candidates):
    """Select keywords that are relevant to the document but dissimilar to each other.

    The ``nr_candidates`` candidates with the highest cosine similarity to the
    document are shortlisted; among them, the combination of ``top_n`` candidates
    with the smallest summed pairwise cosine similarity is returned.

    Args:
        doc_embedding: 2-D array holding the document embedding (one row).
        candidate_embeddings: 2-D array with one embedding row per candidate.
        candidates: Sequence of candidate words/phrases, aligned with the rows of
            ``candidate_embeddings``.
        top_n: Number of keywords to return. If fewer candidates are available,
            all shortlisted candidates are returned instead of failing.
        nr_candidates: Size of the shortlist taken from the most document-similar
            candidates.

    Returns:
        List of the selected candidates (empty if ``candidates`` is empty).
    """
    # Nothing to choose from: return early instead of calling cosine_similarity on
    # an empty matrix.
    if len(candidates) == 0:
        return []

    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings,
                                            candidate_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so the best ones are at the end).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # A short text can yield fewer candidates than requested. Without clamping,
    # combinations() would yield nothing and `candidate` would stay None.
    top_n = min(top_n, len(words_idx))

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j]
                  for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [7]:
## Find the keywords that help identify the relevant topics.

def get_keyword(text, top_n=7, nr_candidates=10):
    """Extract the keywords of ``text`` that best characterise it.

    Candidate words are taken from ``text`` with a CountVectorizer configured by the
    module-level ``n_gram_range`` and ``stop_words``. The text and the candidates
    are embedded with the module-level ``model``, and the final keywords are chosen
    by ``max_sum_sim``.

    Args:
        text: The query/document string to extract keywords from.
        top_n: Number of keywords to return (default 7, the previously hard-coded value).
        nr_candidates: Size of the shortlist passed to ``max_sum_sim`` (default 10,
            the previously hard-coded value).

    Returns:
        List of keyword strings as returned by ``max_sum_sim``.
    """
    # Extract candidate words/phrases
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when
    # available and fall back to the old name on older releases.
    if hasattr(count, "get_feature_names_out"):
        candidates = count.get_feature_names_out().tolist()
    else:
        candidates = count.get_feature_names()

    # Convert both the text and the candidates to numerical vectors
    doc_embedding = model.encode([text])
    candidate_embeddings = model.encode(candidates)
    return max_sum_sim(doc_embedding, candidate_embeddings, candidates,
                       top_n=top_n, nr_candidates=nr_candidates)

In [8]:
# Preview the first three rows of the loaded table
df.head(3)

Unnamed: 0,original_title,abstract,year,citations,id,is_EN
0,When Corrections Fail: The Persistence of Poli...,An extensive literature addresses citizen igno...,2010,901,2132553681,1
1,A postmodern Pandora's box: anti-vaccination m...,The Internet plays a large role in disseminati...,2010,440,2117485795,1
2,Spread of (Mis)Information in Social Networks,We provide a model to investigate the tension ...,2010,278,2120015072,1


In [9]:
# Report how many distinct paper ids the raw table contains.
print(f"Misinformation, disinformation and fake news papers: {len(df.id.unique())}")

Misinformation, disinformation and fake news papers: 8430


In [10]:
# (rows, columns) of the raw table, before any de-duplication
df.shape

(8430, 6)

## Data Preprocessing

In [11]:
# Many entries in the data are duplicates, so they have to be removed first;
# otherwise the same paper would be displayed twice in the search results.
# Example: positional rows 5415 and 5416, first two columns only.
query = df.iloc[5415:5417,0:2]
query.head()

Unnamed: 0,original_title,abstract
5415,The COVID-19 social media infodemic.,We address the diffusion of information about ...
5416,The COVID-19 Social Media Infodemic,We address the diffusion of information about ...


In [12]:
# Further examples of duplicated papers: positional rows 5445-5448, first two columns.
query = df.iloc[5445:5449,0:2]
query.head()


Unnamed: 0,original_title,abstract
5445,A Dataset of Fact-Checked Images Shared on Wha...,"Recently, messaging applications, such as What..."
5446,A Dataset of Fact-Checked Images Shared on Wha...,"Recently, messaging applications, such as What..."
5447,Prta: A System to Support the Analysis of Prop...,"Recent events, such as the 2016 US Presidentia..."
5448,Prta: A System to Support the Analysis of Prop...,"Recent events, such as the 2016 US Presidentia..."


In [13]:
# Remove every row whose (original_title, abstract) pair occurs more than once.
# keep=False discards all copies of a duplicated pair, not only the extra ones.
# NOTE(review): this removes duplicated papers from the corpus entirely; if one copy
# should survive, keep="first" would do that -- confirm which is intended.
data = df.drop_duplicates(subset=["original_title", "abstract"], keep=False)
data.shape


(8285, 6)

## Model
In this tutorial, we will use the `distilbert-base-nli-stsb-mean-tokens` model, which has the best performance on Semantic Textual Similarity tasks among the DistilBERT versions. Moreover, although it is slightly worse than BERT, it is considerably faster thanks to its smaller size.

In [14]:
# Instantiate the sentence-level DistilBERT.
# `model` is the notebook-wide encoder: it is used to embed the abstracts, inside
# get_keyword(), and is passed to vector_search() by search().
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# Check if GPU is available and use it
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))
# Show which device the model ended up on
print(model.device)

HBox(children=(FloatProgress(value=0.0, max=244715968.0), HTML(value='')))


cuda:0


In [15]:
 # Convert the abstracts of the de-duplicated papers to vectors, one per row of `data`
 # (show_progress_bar=True displays the encoding progress).
embeddings = model.encode(data.abstract.to_list(), show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=259.0, style=ProgressStyle(description_widt…




In [16]:
# Shape of a single abstract vector, i.e. the embedding dimensionality
print(f'Shape of the vectorised abstract: {embeddings[0].shape}')

Shape of the vectorised abstract: (768,)


## Vector similarity search with Faiss


In [17]:
# Step 1: Change data type.
# Faiss works on C-contiguous float32 matrices; convert once, without looping over
# the rows in Python.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Vectors and ids are paired by position, so there must be exactly one vector per
# row of `data`.
assert embeddings.shape[0] == len(data), "embeddings and data are out of sync"

# Step 2: Instantiate the index (exact L2 search over vectors of this dimensionality)
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap so that searches return our own paper ids
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs (Faiss expects 64-bit integer ids)
index.add_with_ids(embeddings, data.id.to_numpy(dtype="int64"))

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 8285


### Searching the index
The index we built will perform a k-nearest-neighbour search. We have to provide the number of neighbours to be returned.

Let's query the index with an abstract from our dataset and retrieve the 5 most relevant documents. The first one must be our query!



In [26]:
# Paper abstract used as the example query: positional row 5420 of the
# de-duplicated table, column 1.
query = data.iloc[5420, 1]

In [27]:
 # Retrieve the 5 nearest neighbours of the vector at position 5420.
 # D receives the L2 distances and I the matching paper ids (both printed below).
D, I = index.search(np.array([embeddings[5420]]), k=5)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

L2 distance: [0.0, 81.33309173583984, 105.64372253417969, 105.74414825439453, 106.14827728271484]

MAG paper IDs: [3033049462, 3047503830, 3024620668, 3039651429, 3032066355]


In [28]:

# Fetch the paper titles for the ids returned by the search above
id2details(data, I, 'original_title')

[['Disparities in COVID-19 related knowledge, attitudes, beliefs and behaviours by health literacy'],
 ['COVID-19: Beliefs in misinformation in the Australian community'],
 ['Quantifying COVID-19 Content in the Online Health Opinion War Using Machine Learning'],
 ['Misinformation During a Pandemic'],
 ['La libertà di informazione al tempo della pandemia. Rilievi critici in margine all’istituzione dell’«unità di monitoraggio per il contrasto della diffusione di fake news relative al CoViD-19 sul web e sui social network»']]

In [29]:
# Fetch the paper abstracts for the ids returned by the search above
id2details(data, I, 'abstract')

[['Objectives: To explore the variation in understanding, attitudes and uptake of COVID-19 health advice during the 2020 pandemic lockdown by health literacy.\n\nStudy design: National cross sectional community survey.\n\nSetting: Australian general public.\n\nParticipants: Adults aged over 18 years (n=4362).\n\nMain outcome measures: Knowledge, attitudes and behaviours related to COVID-19; health literacy and socio-demographic factors.\n\nResults: People with inadequate health literacy had poorer understanding of COVID-19 symptoms (49% vs 68%; p<0.001), were less able to identify behaviours to prevent infection (59% vs 72%; p<0.001), and experienced more difficulty finding information and understanding government messaging about COVID-19 than people with adequate health literacy. They were less likely to rate social distancing as important (6.1 vs 6.5, p<0.001) and reported more difficulty remembering/accessing medication since lockdown (3.6 vs 2.7, p<0.001). Importantly there was hig


## Putting all together
Let’s try to find relevant academic articles for a new, unseen search query. In this example, I will query our index with the first paragraph of the article *"Can WhatsApp benefit from debunked fact-checked stories to reduce misinformation?"*, which was published in the HKS Misinformation Review.

In [30]:
def search(query, k=5):
    """Return the titles of the ``k`` papers closest to a free-text query.

    The query is passed to ``vector_search`` together with the module-level
    ``model`` and Faiss ``index``; the elapsed time of that call is printed.

    Args:
        query: Free-text search string.
        k: Number of results to retrieve (default 5, the previously hard-coded value).

    Returns:
        Whatever ``id2details`` returns for the ``'original_title'`` column of the
        matched ids.
    """
    start = time.time()
    # Only the returned ids are needed; the distances are discarded.
    _, ids = vector_search([query], model, index, num_results=k)
    print('totaltime: {}'.format(time.time() - start))
    # Look the titles up in `data`, the de-duplicated table the index was built
    # from, as the other lookups in this notebook do.
    return id2details(data, ids, 'original_title')

In [32]:
# Read a free-text query from the user (input() already returns a str).
query = input()
results = search(query)

# Keywords extracted from the query itself
print("-------------------------------------------------------------------------------------------------------------------------------------")
print('keywords that find the most relevant articles :')
keyword = get_keyword(query)
print(keyword)

# Titles of the best-matching papers, one per line
print("------------------------------------------------------------------------------------------------------------------------------------------")
print('Top 5 ranked articles  :')
for result in results:
    print('\t', result)

WhatsApp was alleged to have been widely used to spread misinformation and propaganda  during the 2018 elections in Brazil and the 2019 elections in India.
totaltime: 0.019596576690673828
-------------------------------------------------------------------------------------------------------------------------------------
keywords that find the most relevant articles :
['2019', '2018', 'whatsapp', 'propaganda', 'brazil', 'india', 'misinformation']
------------------------------------------------------------------------------------------------------------------------------------------
Top 5 ranked articles  :
	 ['Can WhatsApp Benefit from Debunked Fact-Checked Stories to Reduce Misinformation?']
	 ['Understanding Viral Communism: A Thematic Analysis of Twitter During Brazil’s 2018 Elections']
	 ['Politics of Fake News: How WhatsApp Became a Potent Propaganda Tool in India']
	 ['A System for Monitoring Public Political Groups in WhatsApp']
	 ['Can WhatsApp Counter Misinformation by Limitin

In [None]:
# Serialise index and store it as a pickle
index_path = Path("models") / "faiss_index.pickle"
# Create the target folder if needed so the write does not fail on a fresh checkout.
index_path.parent.mkdir(parents=True, exist_ok=True)
with index_path.open("wb") as h:
    pickle.dump(faiss.serialize_index(index), h)