<a href="https://colab.research.google.com/github/NagaKartheekReddy/NLP-Tasks/blob/main/retreivingRelevantDocuments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieving the top 100 documents relevant to a user query using a TF-IDF model and cosine similarity

In [None]:
import nltk
#nltk.download('punkt')
import string
import time


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
#from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [None]:
# google.colab is only importable inside Google Colab, so this cell is Colab-specific.
from google.colab import files
# Prompts for a file upload; the saved output below shows ArticleDataset.json was chosen.
# The next cell reads the first value of `uploaded` as the JSON payload.
uploaded = files.upload()

Saving ArticleDataset.json to ArticleDataset.json


In [None]:
import json
# loading data from json file
# Only the first uploaded file is used. Fail with a clear message instead of a
# bare StopIteration when the upload cell produced nothing.
if not uploaded:
  raise ValueError('No file was uploaded; re-run the upload cell and select the dataset JSON file.')
# Keep the raw payload under its own name so `data` only ever refers to the parsed dict.
raw_json = next(iter(uploaded.values()))
data = json.loads(raw_json)
# extracting text documents
# `data['text']` maps a document key to its text; the retrieval cell also reads
# `data['title']` and `data['url']` by document key.
corpus = list(data['text'].values())

In [None]:
# Translation table for str.translate(): every punctuation character and every
# digit is mapped to None, which deletes it from the text.
trans_dict = str.maketrans('', '', string.punctuation + string.digits)
# English Snowball stemmer, created once and shared by every tokenize() call.
stemmer = SnowballStemmer(language='english')

In [None]:
def tokenize(text):
  """Convert raw text into a list of stemmed word tokens.

  The text is first stripped of the characters listed in ``trans_dict``
  (punctuation and digits), then split with ``nltk.word_tokenize``. Tokens of
  length one are dropped and the remaining tokens are stemmed with the
  module-level ``stemmer``.

  NOTE(review): ``nltk.word_tokenize`` needs NLTK's tokenizer data to be
  installed; the ``nltk.download('punkt')`` call in the imports cell is
  commented out, so confirm the data is present on a fresh kernel.

  Args:
    text: the document (or query) string to tokenize.

  Returns:
    list of stemmed tokens, in their original order.
  """
  cleaned_text = text.translate(trans_dict)
  stems = []
  for token in nltk.word_tokenize(cleaned_text):
    # Single-character tokens are discarded before stemming.
    if len(token) <= 1:
      continue
    stems.append(stemmer.stem(token))
  return stems

In [None]:
def tfidf(corpus):
  """Fit a TF-IDF vectorizer on ``corpus`` and vectorize the corpus with it.

  Prints the size of the learned vocabulary as a side effect.

  NOTE(review): ``stop_words='english'`` is applied to the tokens returned by
  ``tokenize``, which are stemmed, so stop words whose stem differs from the
  list entry may not be filtered out. Confirm this is intended.

  Args:
    corpus: iterable of raw text documents.

  Returns:
    Tuple ``(fitted_vectorizer, tfidf_vectorizer_vectors)``: the fitted
    ``TfidfVectorizer`` and the document-term matrix of ``corpus``, one row
    per document in input order.
  """
  # Float min_df / max_df are document-frequency proportions: terms appearing in
  # fewer than 0.1% or more than 50% of the documents are left out of the vocabulary.
  tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.001,max_df=0.5)
  # fit_transform() is equivalent to fit() followed by transform() on the same
  # corpus, but runs the (expensive, stemming) tokenizer over it only once.
  tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corpus)
  print('Vocabulary size', len(tfidf_vectorizer.vocabulary_))

  return tfidf_vectorizer, tfidf_vectorizer_vectors


In [None]:
# Fit the TF-IDF model on the whole corpus once; the fitted vectorizer and the
# document-term matrix are then passed to documents_retreival() for each query.
fitted_vectorizer, tfidf_vectorizer_vectors = tfidf(corpus)

Vocabulary size 10182


In [None]:
def documents_retreival(query, fitted_vectorizer, tfidf_vectorizer_vectors, relevant_documents=100):
  """Print the documents most similar to ``query``, best match first.

  The query is vectorized with the already-fitted vectorizer and compared with
  every row of the document-term matrix via ``linear_kernel`` (a plain dot
  product). Titles and URLs are read from the module-level ``data`` dict. The
  elapsed time is printed at the end.

  Args:
    query: a single query, either as a string or as a one-element iterable of
      strings.
    fitted_vectorizer: vectorizer returned by ``tfidf``.
    tfidf_vectorizer_vectors: document-term matrix returned by ``tfidf``.
    relevant_documents: maximum number of results to print.

  Raises:
    ValueError: if ``query`` does not contain exactly one query string.
  """
  # perf_counter() is monotonic, so the measured interval cannot be skewed by
  # system clock adjustments.
  start = time.perf_counter()
  # transform() expects an iterable of documents; wrap a bare string so it is
  # treated as one query instead of being rejected.
  if isinstance(query, str):
    query = [query]
  # generating term matrix for query
  tfidf_vectorizer_queryVector = fitted_vectorizer.transform(query)
  if tfidf_vectorizer_queryVector.shape[0] != 1:
    raise ValueError('documents_retreival expects exactly one query, got {}'.format(
        tfidf_vectorizer_queryVector.shape[0]))
  # computing similarity scores: result has one row per document, one column per query
  cosine_sim = linear_kernel(tfidf_vectorizer_vectors, tfidf_vectorizer_queryVector)
  # Flatten the single query column to plain floats so scores sort as scalars
  # and print as 0.89 rather than as a one-element array ([0.89]).
  scores = cosine_sim[:, 0].tolist()
  # sorting based on similarity scores (stable: ties keep corpus order)
  ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
  # Map matrix row positions back to document keys instead of guessing the key
  # as str(position). Assumes the matrix rows follow the order of data['text'],
  # as they do when the matrix was built from `corpus`.
  doc_ids = list(data['text'])
  # printing top documents title, URL, TFIDF score
  for position, score in ranked[:relevant_documents]:
    idx = doc_ids[position]
    print('Title:{}   URL:{}  Similarity score:{}\n'.format(data['title'][idx], data['url'][idx], score))
  print ('Time taken to run corpus', (time.perf_counter() - start))

In [None]:
# The query is wrapped in a list because it is handed straight to
# fitted_vectorizer.transform() inside documents_retreival().
query = ['quantum computing software from IBM']
# Prints up to 100 matches (the default for relevant_documents), best match first.
documents_retreival(query, fitted_vectorizer, tfidf_vectorizer_vectors)

Title:IBM unveils new commercial Q System One quantum computer   URL:https://www.themanufacturer.com/articles/ibm-unveil-new-commercial-q-system-one-quantum-computer/  Similarity score:[0.89130025]

Title:Intel Introduces cryogenic control chip 'Horse Ridge' to enable control of multiple quantum bits   URL:https://techxplore.com/news/2019-12-intel-cryogenic-chip-horse-ridge.html  Similarity score:[0.46463668]

Title:IBM introduces new Watson solutions and services for nine industries and professions   URL:https://www.manufacturingglobal.com/technology/ibm-introduces-new-watson-solutions-and-services-nine-industries-and-professions  Similarity score:[0.35947724]

Title:IBM Investing Billions in 'Internet of Things'   URL:https://www.industryweek.com/technology-and-iiot/article/22005994/ibm-investing-billions-in-internet-of-things  Similarity score:[0.33340104]

Title:An artificial intelligence algorithm can learn the laws of quantum mechanics   URL:https://techxplore.com/news/2019-11-ar