In [29]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
# Load the raw article dump. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, and the article
# text contains non-ASCII punctuation (curly quotes, en dashes).
with open('ArticleDataset.json', encoding='utf-8') as f:
    article = json.load(f)


We create a DataFrame from the JSON file; a snapshot of it is shown below. We are primarily interested in the 'text' field. **As part of this assignment, we return the 'text' and 'url' fields of the 100 articles most relevant to a given query.**

In [30]:
# Build the article table, keeping the columns in the order the JSON object lists them.
df = pd.DataFrame(article, columns=list(article))
df


Unnamed: 0,date,title,text,url,unknown
0,2020-01-22,TE Connectivity introduces M12 industrial Ethe...,TE Connectivity (TE) now offers M12 data cable...,https://www.automationmag.com/te-connectivity-...,empty
1,2020-01-22,Kirigami designs hold thousands of times their...,"The Japanese art of origami (from ori, folding...",https://techxplore.com/news/2020-01-kirigami-t...,empty
2,2020-01-22,GM's Cruise heads down new road with new robot...,General Motors' self-driving car company will ...,https://techxplore.com/news/2020-01-gm-cruise-...,empty
3,2020-01-22,First fully integrated flexible electronics ma...,Human skin is a fascinating multifunctional or...,https://techxplore.com/news/2020-01-fully-flex...,empty
4,2020-01-22,Study says that we trust our workplace robots,The only constant is change. Presumptions hard...,https://techxplore.com/news/2020-01-workplace-...,empty
...,...,...,...,...,...
8431,2008-11-04,Hometown Rescue,About The Author\nFrank Tobe\nFrank Tobe is th...,https://www.therobotreport.com/hometown-rescue/,delete
8432,2008-11-01,Popular Products – Rovio by Wowee,Frank Tobe\nFrank Tobe is the founder of The R...,https://www.therobotreport.com/popular-product...,delete
8433,2008-11-01,Popular Products – Verro by iRobot,Frank Tobe\nFrank Tobe is the founder of The R...,https://www.therobotreport.com/popular-product...,delete
8434,2008-10-31,Watch this multi-purpose snake crawl up a leg,Frank Tobe\nFrank Tobe is the founder of The R...,https://www.therobotreport.com/watch-this-mult...,delete


In [31]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8436 entries, 0 to 8435
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     8436 non-null   object
 1   title    8436 non-null   object
 2   text     8436 non-null   object
 3   url      8436 non-null   object
 4   unknown  8436 non-null   object
dtypes: object(5)
memory usage: 395.4+ KB


In [4]:
# Fit a TF-IDF vocabulary on the article bodies and vectorize every article
# in one pass; the result has one row per article.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df['text'])


In [5]:
# Show the matrix repr (shape, dtype, number of stored elements, storage format).
tfidf_vectorizer_vectors

<8436x73007 sparse matrix of type '<class 'numpy.float64'>'
	with 2486773 stored elements in Compressed Sparse Row format>

The given query is vectorized to the same length as the articles. Cosine similarity is then computed between the query and every article. As asked, we are only interested in the top 100 articles for a given query, so we sort the cosine similarities and keep the 100 most relevant articles.

In [8]:
def calculate_similarity(tfidf_vectorizer_vectors, vectorizor, query, top_k=100):
    """Rank documents by cosine similarity to a single query.

    Parameters
    ----------
    tfidf_vectorizer_vectors : matrix, one row per document
        Document vectors; passed as the first argument to ``cosine_similarity``.
    vectorizor : object with a ``transform`` method
        Used to vectorize the query. Presumably the same fitted vectorizer
        that produced ``tfidf_vectorizer_vectors`` -- verify at the call site.
    query : str or iterable containing exactly one str
        The search query. A bare string is wrapped in a list before being
        passed to ``vectorizor.transform``.
    top_k : int, default 100
        Maximum number of document positions to return.

    Returns
    -------
    tuple
        ``(similar_article_indices, cosine_similarities)`` where
        ``similar_article_indices`` holds the positions of the ``top_k``
        highest-scoring documents, best first, and ``cosine_similarities``
        is the 1-D array of scores for every document.

    Raises
    ------
    ValueError
        If ``top_k`` is negative or ``query`` vectorizes to more than one row.
    """
    if top_k < 0:
        raise ValueError('top_k must be non-negative, got {}'.format(top_k))

    # transform() expects an iterable of documents, not a single raw string.
    if isinstance(query, str):
        query = [query]

    query_vector = vectorizor.transform(query)

    # With several query rows, flattening the (n_docs, n_queries) result would
    # interleave scores and yield positions outside the document range.
    if query_vector.shape[0] != 1:
        raise ValueError(
            'calculate_similarity expects exactly one query, got {}'.format(query_vector.shape[0])
        )

    cosine_similarities = cosine_similarity(tfidf_vectorizer_vectors, query_vector).flatten()
    # argsort is ascending: reverse for best-first order, then keep the first top_k.
    similar_article_indices = np.argsort(cosine_similarities)[::-1][:top_k]
    return (similar_article_indices, cosine_similarities)

In [24]:
def print_relevant_artciles(df, cosine_similarities, similar_doc_indices):
    """Print the rank, similarity score, URL and text of each selected article.

    Parameters
    ----------
    df : pandas.DataFrame
        Article table; must contain ``'url'`` and ``'text'`` columns.
    cosine_similarities : sequence of float
        Similarity score per article, indexed by row position.
    similar_doc_indices : iterable of int
        Row positions of the articles to print, in the order to print them.

    Returns
    -------
    None
        Output is written to stdout.
    """
    urls = df['url']
    texts = df['text']
    for counter, index in enumerate(similar_doc_indices, start=1):
        print('Relevant-{}, Similarity = {}'.format(counter, cosine_similarities[index]))
        # The indices are row positions, so look rows up with .iloc; a plain
        # [] lookup goes by index label and only matches when the frame's
        # index happens to be 0..n-1.
        print('url-{}\nText: {}, '.format(urls.iloc[index], texts.iloc[index]))
        print()

In [25]:
# The query is supplied as a one-element list of raw text.
query = ['quantum computing software from IBM']

# Score every article against the query and keep the best-ranked positions.
similar_article_indices, cosine_similarities = calculate_similarity(
    tfidf_vectorizer_vectors=tfidf_vectorizer_vectors,
    vectorizor=tfidf_vectorizer,
    query=query,
)

# Print the ranked articles with their similarity, URL and text.
print_relevant_artciles(
    df=df,
    cosine_similarities=cosine_similarities,
    similar_doc_indices=similar_article_indices,
)

Relevant-1, Similarity = 0.8169635308021591
url-https://www.themanufacturer.com/articles/ibm-unveil-new-commercial-q-system-one-quantum-computer/
Text: IBM has released the world's first-ever commercial quantum computer, the Q System One. However, more progress is needed before it will take over from today's super computers.
Update 4 March, 2019:
IBM has announced that it has achieved the best performance from a quantum computer to date – a scientific milestone that includes the lowest error rates it has ever measured in the highly unstable realm of quantum computing.
According to IBM Q’s research team: Performance was measured via Quantum Volume; a full-system metric that accounts for gate and measurement errors as well as device cross talk and connectivity, and circuit software compiler efficiency.
IBM’s recently unveiled IBM Q System One quantum computer, with a fourth-generation 20-qubit processor, has produced a Quantum Volume of 16, roughly double that of the current IBM Q 20-qub