In [0]:
!pip install pymongo
!pip install plotly



In [0]:

import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [0]:
from pymongo import MongoClient
# Step 1: open a client against MongoDB.
# Note: adjust the connection settings below to match your environment.
client = MongoClient(port=27017)
# Item access is the dictionary-style spelling of attribute access on a client/database.
db = client["newsdb"]
collection = db["new"]

In [0]:
# importing requests package 
import requests	 
import json
def News(categories=None, api_key=None):
    """Download US top headlines per category from NewsAPI and store them in MongoDB.

    Each returned article dict gets a ``category`` key and the articles of one
    category are written with a single ``collection.insert_many`` call.

    Parameters
    ----------
    categories : iterable of str, optional
        Category names to request. Defaults to the five categories the
        notebook has always used.
    api_key : str, optional
        NewsAPI key. When omitted, the ``NEWSAPI_KEY`` environment variable is
        used, falling back to the key that was embedded in this notebook.

    Returns
    -------
    int
        Total number of articles inserted.
    """
    import os  # local import keeps this cell self-contained

    if categories is None:
        categories = ['sports', 'business', 'politics', 'technology', 'health']
    if api_key is None:
        # NOTE(review): this fallback key is committed to source control. Rotate it,
        # set NEWSAPI_KEY instead, and delete the literal once callers are migrated.
        api_key = os.environ.get("NEWSAPI_KEY", "0c9ce5c54abc446bbbe5d20e5e6f2fe9")

    # https instead of http so the key is not sent in clear text.
    endpoint = "https://newsapi.org/v2/top-headlines"
    inserted = 0
    for category_name in categories:
        response = requests.get(
            endpoint,
            params={"country": "us", "category": category_name, "apiKey": api_key},
            timeout=30,  # never hang the notebook on a stalled connection
        )
        payload = response.json()
        # Error payloads carry no "articles" key and a valid query can return an
        # empty list; insert_many() rejects an empty list, so skip both cases.
        articles = payload.get("articles") or []
        if not articles:
            print("No articles stored for category", repr(category_name),
                  "-", payload.get("message", "empty result"))
            continue
        for article in articles:
            article.update({'category': category_name})
        collection.insert_many(articles)
        inserted += len(articles)
    return inserted


# Entry point: trigger the download when this cell/script is executed directly.
if __name__ == "__main__":
    News()



In [0]:
# Read every document of the Mongo collection into a pandas DataFrame
import pandas as pd
news_articles = pd.DataFrame.from_records(data=collection.find())

In [0]:
# Preview the first five stored articles
news_articles.head(n=5)

In [0]:
# Removing articles with short headlines (five words or fewer).
# The vectorised .str accessor yields NaN for a missing title; NaN > 5 is False,
# so such rows are dropped instead of raising AttributeError inside a lambda.
headline_word_counts = news_articles['title'].str.split().str.len()
news_articles = news_articles[headline_word_counts > 5]
print("Total number of articles after removal of headlines with short title:", news_articles.shape[0])

In [0]:
# Removing duplicate articles
# Sort into a new frame rather than in place: the input is a filtered slice, and
# in-place mutation of it is what triggers pandas' SettingWithCopy warnings.
news_articles = news_articles.sort_values('title', ascending=False)
# NOTE(review): keep=False flags *every* copy of a repeated title, so a headline
# that occurs twice is removed entirely rather than kept once -- confirm intended.
duplicated_articles_series = news_articles.duplicated('title', keep=False)
# Later cells address rows with 0..n-1 integers; after filtering and sorting the
# labels no longer match positions, so renumber the rows here.
news_articles = news_articles[~duplicated_articles_series].reset_index(drop=True)
print("Total number of articles after removing duplicates:", news_articles.shape[0])

In [0]:
# Count the missing values per column
news_articles.isnull().sum()

In [0]:
# Headline figures of the cleaned data set: rows, distinct authors, distinct categories
for label, value in (
    ("Total number of articles : ", news_articles.shape[0]),
    ("Total number of authors : ", news_articles["author"].nunique()),
    ("Total number of unqiue categories : ", news_articles["category"].nunique()),
):
    print(label, value)

In [0]:
# Bar chart of how many articles fall into each category
category_counts = news_articles["category"].value_counts()
fig = go.Figure(data=[go.Bar(x=category_counts.index, y=category_counts.values)])
fig.update_layout(
    title={"text": 'Distribution of articles category-wise', 'y': 0.9, 'x': 0.5,
           'xanchor': 'center', 'yanchor': 'top'},
    xaxis_title="Category name",
    yaxis_title="Number of articles",
)
fig.update_layout(width=800, height=700)
fig

In [0]:
# Downloading the nltk resources needed for preprocessing
import nltk
nltk.download('stopwords')
# word_tokenize is first called by the stop-word cell further down, before the
# cell that originally fetched the tokenizer data, so fetch it here to make
# "Restart & Run All" work on a fresh machine.
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data name used by newer NLTK releases; harmless otherwise
stop_words = set(stopwords.words('english'))

In [0]:
# Work on an independent (deep) copy so the text preprocessing leaves news_articles untouched
news_articles_temp = news_articles.copy(deep=True)

In [0]:
#Removing stopwords from headlines
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
cnt = 0  # headlines that could not be processed because they are not text
titles_without_stopwords = []
# Iterate over the values themselves: a 0..n-1 counter used as a label lookup
# (series[i]) hits the wrong row or raises KeyError once the index has gaps.
for title in news_articles_temp["title"]:
    if not isinstance(title, str):
        cnt += 1
        titles_without_stopwords.append("")
        continue
    kept_words = []
    for token in word_tokenize(title):
        if token in stop_words:
            continue
        # Strip punctuation, lower-case, then re-check against the stop list
        # (catches capitalised stop words such as "The").
        word = "".join(ch for ch in token if ch.isalnum()).lower()
        if word and word not in stop_words:
            kept_words.append(word)
    titles_without_stopwords.append(" ".join(kept_words))
# NOTE(review): the write-back into news_articles_temp was already commented out,
# so the frame is deliberately left unchanged here; the cleaned headlines are kept
# in `titles_without_stopwords` for anyone who wants to opt in.
print(cnt)

In [0]:
# Lemmatizer plus the corpora it and the tokenizer rely on
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data name used by newer NLTK releases
nltk.download('wordnet')
nltk.download('omw-1.4')    # some NLTK releases need this to load WordNet -- TODO confirm for the pinned version

In [0]:
#Lemmatizing the headlines
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
cnt = 0  # headlines that could not be processed because they are not text
lemmatized_titles = []
# Walk the column by position. The previous counter-based version looked rows up
# by label (series[i]) and wrote with .at[i, ...]; for a label that does not
# exist, .at appends a brand-new row, and after a KeyError the tokens of the
# previous headline were reused.
for title in news_articles_temp["title"]:
    if not isinstance(title, str):
        cnt += 1
        lemmatized_titles.append("")
        continue
    kept_tokens = [token for token in word_tokenize(title) if token not in stop_words]
    # pos="v": lemmatize every token as a verb.
    lemmatized_titles.append(
        " ".join(lemmatizer.lemmatize(token, pos="v") for token in kept_tokens)
    )
# One positional assignment keeps the row count and order of the frame intact.
news_articles_temp["title"] = lemmatized_titles
print(cnt)

In [0]:
# Bag-of-words representation: one row per headline, one column per vocabulary term
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(raw_documents=news_articles_temp['title'])

In [0]:
# (number of headlines, vocabulary size)
headline_features.shape

In [0]:
# Display very long headlines completely. None is the supported "no limit" value;
# the former -1 sentinel is deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)

In [0]:
#To display results similar to the queried article's headline based on CountVectorizer
def bag_of_words_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to one article in bag-of-words space.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; rows of
        ``headline_features`` and ``news_articles`` must be in the same order.
    num_similar_items : int
        Size of the neighbourhood including the query, so
        ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Publish date, headline and distance of each recommendation, nearest
        first, indexed from 1.
    """
    couple_dist = pairwise_distances(headline_features, headline_features[row_index]).ravel()
    query_pos = range(len(couple_dist))[row_index]  # normalises negative positions
    # Remove the query explicitly: with tied distances the nearest row is not
    # guaranteed to be the query itself. Stable sort keeps ties deterministic.
    ranked = np.argsort(couple_dist, kind="stable")
    ranked = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # argsort yields positions, so use .iloc; label lookup picks wrong rows or
    # raises KeyError whenever the index is not 0..n-1.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[ranked].values,
               'headline': news_articles['title'].iloc[ranked].values,
                'Euclidean similarity with the queried article': couple_dist[ranked]},
               index=range(1, len(ranked) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df

# Example query: article at position 15, neighbourhood of 11 (query included)
bag_of_words_based_model(row_index=15, num_similar_items=11)

In [0]:
#To display results similar to the queried article's headline based on TfidfVectorizer
# min_df=1 keeps every term, which is what the former integer 0 amounted to;
# current scikit-learn releases reject an integer min_df below 1.
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp['title'])
def tfidf_based_model(row_index, num_similar_items):
    """List the queried article followed by its nearest TF-IDF neighbours.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; rows of
        ``tfidf_headline_features`` and ``news_articles`` must be in the same order.
    num_similar_items : int
        Number of rows to list, the queried article included.

    Returns
    -------
    pandas.DataFrame
        Publish date, headline and distance; the first row is the queried
        article itself. Rows with missing values are dropped.
    """
    couple_dist = pairwise_distances(tfidf_headline_features, tfidf_headline_features[row_index]).ravel()
    query_pos = range(len(couple_dist))[row_index]  # normalises negative positions
    ranked = np.argsort(couple_dist, kind="stable")
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # Put the query first explicitly instead of trusting it to win every tie at distance 0.
    shown = np.concatenate(([query_pos], neighbours)).astype(int)[:max(num_similar_items, 0)]
    # Positions from argsort must be resolved with .iloc, not by label.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[shown].values,
               'headline': news_articles['title'].iloc[shown].values,
                'Euclidean similarity with the queried article': couple_dist[shown]})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    df=df.dropna()
    print(len(df))
    return df
# Example query: article at position 15, listing 11 rows
tfidf_based_model(row_index=15, num_similar_items=11)

In [0]:
pip install -U gensim

In [0]:
#Gensim is NLP package having Google's pretrained Word2vec model
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim import models
import pickle
import os

In [0]:
import gensim
# Load the binary word2vec vectors from the working directory
loaded_model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [0]:
# gensim 4 replaced KeyedVectors.vocab with key_to_index (accessing .vocab raises
# there); fall back to .vocab so older installs keep working.
vocabulary = getattr(loaded_model, "key_to_index", None)
if vocabulary is None:
    vocabulary = loaded_model.vocab
vocabulary = vocabulary.keys()

# One vector per headline: sum of the known word vectors divided by the number
# of words in the headline.
w2v_headline = []
for title in news_articles_temp['title']:
    words = title.split()
    # Size taken from the model instead of a hard-coded 300.
    w2Vec_word = np.zeros(loaded_model.vector_size, dtype="float32")
    for word in words:
        if word in vocabulary:
            w2Vec_word = np.add(w2Vec_word, loaded_model[word])
    # An empty headline would divide by zero and put NaNs into the matrix, which
    # the distance computation rejects; leave it as the zero vector.
    if words:
        w2Vec_word = np.divide(w2Vec_word, len(words))
    w2v_headline.append(w2Vec_word)
w2v_headline = np.array(w2v_headline)

In [0]:
#Recommendation based on similar headlines
def avg_w2v_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to one article in averaged word2vec space.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; rows of
        ``w2v_headline`` and ``news_articles`` must be in the same order.
    num_similar_items : int
        Size of the neighbourhood including the query, so
        ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Publish date, headline and distance of each recommendation, nearest
        first, indexed from 1.
    """
    couple_dist = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    query_pos = range(len(couple_dist))[row_index]  # normalises negative positions
    # Remove the query explicitly: with tied distances the nearest row is not
    # guaranteed to be the query itself. Stable sort keeps ties deterministic.
    ranked = np.argsort(couple_dist, kind="stable")
    ranked = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # Positions from argsort must be resolved with .iloc, not by label.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[ranked].values,
               'headline': news_articles['title'].iloc[ranked].values,
                'Euclidean similarity with the queried article': couple_dist[ranked]},
               index=range(1, len(ranked) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df

# Example query: article at position 90, neighbourhood of 11 (query included)
avg_w2v_based_model(row_index=90, num_similar_items=11)

In [0]:
from sklearn.preprocessing import OneHotEncoder 

In [0]:
#Using OneHotEncoding on the category feature
# Do not drop rows here: w2v_headline was built from every row of
# news_articles_temp, and removing rows afterwards leaves the one-hot matrix with
# a different number of rows than the embeddings it is combined with.
# Missing categories get their own "unknown" bucket instead.
category_onehot_encoded = OneHotEncoder().fit_transform(
    news_articles_temp["category"].fillna("unknown").astype(str).to_numpy().reshape(-1,1))

In [0]:
#Recommendation based on similar headlines and category
def avg_w2v_with_category(row_index, num_similar_items, w1,w2):
    """Recommend articles by a weighted mix of headline and category distance.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; ``w2v_headline``,
        ``category_onehot_encoded`` and ``news_articles`` must share row order.
    num_similar_items : int
        Size of the neighbourhood including the query, so
        ``num_similar_items - 1`` recommendations are returned.
    w1, w2 : float
        Weights of the word2vec and the category distance.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, best first, indexed from 1, with the
        combined distance and each component.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 shifts every category distance by the same constant.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    weighted_couple_dist = (w1 * w2v_dist + w2 * category_dist) / float(w1 + w2)

    query_pos = range(len(weighted_couple_dist))[row_index]  # normalises negative positions
    # Remove the query explicitly rather than assuming it ranks first on ties.
    ranked = np.argsort(weighted_couple_dist.ravel(), kind="stable")
    ranked = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # Positions from argsort must be resolved with .iloc, not by label.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[ranked].values,
               'headline': news_articles['title'].iloc[ranked].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[ranked].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[ranked].ravel(),
                 'Category based Euclidean similarity': category_dist[ranked].ravel(),
                 'Categoty': news_articles['category'].iloc[ranked].values},
               index=range(1, len(ranked) + 1))

    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df

# Example query: position 50, neighbourhood of 10, headline weight 0.1, category weight 0.8
avg_w2v_with_category(row_index=50, num_similar_items=10, w1=0.1, w2=0.8)

In [0]:
#Performing OneHotEncoding on the authors
# Articles without an author share an explicit "unknown" bucket, so the encoder
# never sees None/NaN and no rows have to be dropped to make this cell run.
authors_onehot_encoded = OneHotEncoder().fit_transform(
    news_articles_temp["author"].fillna("unknown").astype(str).to_numpy().reshape(-1,1))

In [0]:
#Recommendation based on similar headlines,category and authors
def avg_w2v_with_category_and_authors(row_index, num_similar_items, w1,w2,w3):
    """Recommend articles by a weighted mix of headline, category and author distance.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; ``w2v_headline``,
        the one-hot matrices and ``news_articles`` must share row order.
    num_similar_items : int
        Size of the neighbourhood including the query, so
        ``num_similar_items - 1`` recommendations are returned.
    w1, w2, w3 : float
        Weights of the word2vec, category and author distance.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, best first, indexed from 1, with the
        combined distance and each component.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 shifts every one-hot distance by the same constant.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist)/float(w1 + w2 + w3)

    query_pos = range(len(weighted_couple_dist))[row_index]  # normalises negative positions
    # Remove the query explicitly rather than assuming it ranks first on ties.
    ranked = np.argsort(weighted_couple_dist.ravel(), kind="stable")
    ranked = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # Positions from argsort must be resolved with .iloc, not by label.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[ranked].values,
                'headline': news_articles['title'].iloc[ranked].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[ranked].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[ranked].ravel(),
                'Category based Euclidean similarity': category_dist[ranked].ravel(),
                'Authors based Euclidean similarity': authors_dist[ranked].ravel(),
                'Categoty': news_articles['category'].iloc[ranked].values,
                'Authors': news_articles['author'].iloc[ranked].values},
               index=range(1, len(ranked) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['author'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df
    
# Example query: position 50, neighbourhood of 10, author distance weighted highest
avg_w2v_with_category_and_authors(row_index=50, num_similar_items=10, w1=0.1, w2=0.1, w3=1)


In [0]:
#Performing OneHotEncoding on the publishing day
# NOTE(review): assumes publishedAt holds full timestamps -- confirm against the
# stored documents. One-hot encoding a timestamp makes almost every article its
# own category, so the resulting distance says nothing about the publishing day.
# Encode the calendar day instead.
published_raw = news_articles_temp["publishedAt"]
published_parsed = pd.to_datetime(published_raw, errors="coerce", utc=True)
publishing_day = published_parsed.dt.strftime("%Y-%m-%d")
# Values that cannot be parsed keep their raw text (missing ones become "unknown"),
# which is the category they had before this change.
publishing_day = publishing_day.where(published_parsed.notna(),
                                      published_raw.fillna("unknown").astype(str))
publishingday_onehot_encoded = OneHotEncoder().fit_transform(
    publishing_day.to_numpy().reshape(-1,1))

In [0]:
#Recommendation based on similar headlines,category,author and publishing time
def avg_w2v_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4):
    """Recommend articles by headline, category, author and publishing-day distance.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article; ``w2v_headline``,
        the one-hot matrices and ``news_articles`` must share row order.
    num_similar_items : int
        Size of the neighbourhood including the query, so
        ``num_similar_items - 1`` recommendations are returned.
    w1, w2, w3, w4 : float
        Weights of the word2vec, category, author and publishing-day distance.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, best first, indexed from 1, with the
        combined distance and each component.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 shifts every one-hot distance by the same constant.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/float(w1 + w2 + w3 + w4)

    query_pos = range(len(weighted_couple_dist))[row_index]  # normalises negative positions
    # Remove the query explicitly rather than assuming it ranks first on ties.
    ranked = np.argsort(weighted_couple_dist.ravel(), kind="stable")
    ranked = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # Positions from argsort must be resolved with .iloc, not by label.
    df = pd.DataFrame({'publish_date': news_articles['publishedAt'].iloc[ranked].values,
                'headline_text': news_articles['title'].iloc[ranked].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[ranked].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[ranked].ravel(),
                'Category based Euclidean similarity': category_dist[ranked].ravel(),
                'Authors based Euclidean similarity': authors_dist[ranked].ravel(),
                'Publishing day based Euclidean similarity': publishingday_dist[ranked].ravel(),
                'Categoty': news_articles['category'].iloc[ranked].values,
                'Authors': news_articles['author'].iloc[ranked].values,
                'Day and month': news_articles['publishedAt'].iloc[ranked].values},
               index=range(1, len(ranked) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['title'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['author'].iloc[query_pos])
    print('Day and month : ', news_articles['publishedAt'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df


# Example query: position 50, neighbourhood of 10, publishing-day distance weighted highest
avg_w2v_with_category_authors_and_publshing_day(
    row_index=50, num_similar_items=10, w1=0.1, w2=0.1, w3=0.1, w4=1)