In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [2]:
# Paths of the pre-computed feature matrices (pickled by a separate training step).
filename_w2v = '../recommends_model_w2v.sav'
filename_authors = '../recommends_model_authors.sav'
filename_category = '../recommends_model_category.sav'
filename_pubday = '../recommends_model_pubday.sav'


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    SECURITY: pickle.load can execute arbitrary code embedded in the file, so
    only point this at artefacts produced by a trusted training run.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the file name says "w2v" while the variable says "tfidf" --
# confirm which text representation this pickle actually contains.
tfidf_text_features = _load_pickle(filename_w2v)
authors_onehot_encoded = _load_pickle(filename_authors)
category_onehot_encoded = _load_pickle(filename_category)
publishingday_onehot_encoded = _load_pickle(filename_pubday)

In [3]:
# Read the raw sources: one line-delimited JSON file plus three CSV shards.
news_articles_1 = pd.read_json( "../News_Category_Dataset_v2.json", lines = True )

# Each CSV shard keeps its own name; the shards are then stacked into one
# frame with a fresh 0..n-1 index.
news_articles_2a, news_articles_2b, news_articles_2c = (
    pd.read_csv( csv_path )
    for csv_path in ( "../articles1.csv", "../articles2.csv", "../articles3.csv" )
)
news_articles_2 = pd.concat(
    [news_articles_2a, news_articles_2b, news_articles_2c],
    ignore_index=True,
)

In [4]:
# Clean both raw sources so they share column names, then stack them into the
# single `news_articles` frame used by the recommender.
# (The previous empty-DataFrame initialisation was dropped: it was overwritten
# by the concat below before ever being read.)
# TODO: all this source data needs to be moved to BigQuery and sourced from there

# --- source 1: JSON dataset ---------------------------------------------------
news_articles_1 = news_articles_1.rename( columns={'short_description': 'description'} )
# Guarded so that re-running this cell does not fail with
# "cannot insert id, already exists".
if 'id' not in news_articles_1.columns:
    news_articles_1.insert( 0, 'id', range( len( news_articles_1 ) ) )
news_articles_1["publication"] = "unknown"

# --- source 2: CSV shards -----------------------------------------------------
# One rename call instead of four; keys that are absent (e.g. on a re-run,
# when the columns are already renamed) are ignored by pandas.
news_articles_2 = news_articles_2.rename( columns={
    'title': 'headline',
    'url': 'link',
    'content': 'description',
    'author': 'authors' } )
# errors='ignore' keeps the cell re-runnable once year/month are already gone.
news_articles_2 = news_articles_2.drop( columns=['year', 'month'], errors='ignore' )
news_articles_2["category"] = "unknown"
news_articles_2['date'] = pd.to_datetime( news_articles_2['date'], format='%Y-%m-%d' )

# --- combine sources ----------------------------------------------------------
# ignore_index gives a fresh 0..n-1 index over the stacked rows; source 1 rows
# come first, followed by source 2 rows.
news_articles = pd.concat( [news_articles_1, news_articles_2], ignore_index=True )

# reduce author name(s), publisher, and category fields to single string
# (regex=False: the space is a literal character, not a pattern)
for column in ( 'authors', 'publication', 'category' ):
    news_articles[column] = news_articles[column].str.replace( " ", "", regex=False )

# and create combined headline and description to lemma
# NOTE(review): if either field is missing (NaN) for a row, the concatenation
# is NaN for that row as well -- confirm whether that is intended.
news_articles['text'] = news_articles['headline'] + " " + news_articles['description']


In [6]:
def tfidf_based_model(row_index, num_similar_items=6):
    """Print the queried article's text and return its nearest neighbours.

    Distances are computed with sklearn's `pairwise_distances` (Euclidean by
    default) between the feature row at `row_index` and every row of the
    module-level `tfidf_text_features`; article metadata is looked up by
    position in the module-level `news_articles` frame.

    Parameters
    ----------
    row_index : int
        Position of the queried article (negative values count from the end).
    num_similar_items : int, default 6
        Size of the neighbourhood *including* the queried article itself, so
        at most ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'publish_date', 'text' and
        'Euclidean similarity with the queried article', ordered from closest
        to farthest and indexed from 1. Despite its label, the last column
        holds a *distance*: smaller values mean more similar articles.
    """
    # [[row_index]] keeps the query two-dimensional (1 x n_features), which
    # pairwise_distances requires, for dense arrays as well as sparse matrices.
    query_features = tfidf_text_features[[row_index]]
    distances = pairwise_distances(tfidf_text_features, query_features).ravel()

    # Normalise a negative row_index to its non-negative position
    # (raises IndexError if row_index is out of range).
    query_pos = range(distances.shape[0])[row_index]

    # Remove the queried row explicitly instead of assuming it sorts first:
    # a duplicate article sits at the same distance and could otherwise take
    # its place, leaving the query among its own recommendations. The stable
    # sort makes the order of tied rows deterministic.
    ranked = np.argsort(distances, kind='stable')
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]

    print('text : ', news_articles['text'].iloc[query_pos])

    # .iloc: `neighbours` are row positions in the feature matrix, not labels.
    return pd.DataFrame(
        {'publish_date': news_articles['date'].iloc[neighbours].values,
         'text': news_articles['text'].iloc[neighbours].values,
         'Euclidean similarity with the queried article': distances[neighbours]},
        index=pd.RangeIndex(1, len(neighbours) + 1))

In [7]:
# Spot check: print the text of the article at row 2 and list its nearest neighbours.
tfidf_based_model(2)

text :  Hugh Grant Marries For The First Time At Age 57 The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.


Unnamed: 0,publish_date,text,Euclidean similarity with the queried article
1,2015-09-20,Richard Dawkins Accuses Ahmed Mohamed Of Commi...,0.475123
2,2016-12-08,Trump the Progressive ‘Progressive” is a funny...,0.698914
3,2016-12-08,The Pro-Choice Movement’s Overblown Fear of a ...,0.717112
4,2017-01-11,"These 9 Foods Are Loaded With B-12, And Here's...",0.720067
5,2017-04-26,Bill Cosby's Daughter Writes Letter In His Def...,0.723536


In [8]:
# Spot check: print the text of the article at row 5 and list its nearest neighbours.
tfidf_based_model(5)

text :  Morgan Freeman 'Devastated' That Sexual Harassment Claims Could Undermine Legacy "It is not right to equate horrific incidents of sexual assault with misplaced compliments or humor," he said in a statement.


Unnamed: 0,publish_date,text,Euclidean similarity with the queried article
1,2016-04-06,Anita Hill On Why Her Testimony Still Matters ...,0.767207
2,2017-02-04,"On Field and in Hometown, Neymar Tries to Chan...",0.845699
3,2016-09-19,"After 22 years, the Rams are back in LA. Was i...",0.854314
4,2016-02-28,The Terror Group That Could Ruin Syria's Cease...,0.869222
5,2016-01-13,Stinging Report On Pandemics Makes Louis Paste...,0.874183


In [9]:
# Spot check: print the text of the article at row 2000 and list its nearest neighbours.
tfidf_based_model(2000)

text :  The No. 2 Question In 'Westworld' Finally Has An Answer One mystery remained in the bowels of the park until we talked to Steven Ogg.


Unnamed: 0,publish_date,text,Euclidean similarity with the queried article
1,2016-08-28,How A Tiny Crippled Street Dog 'Spoke' And Fou...,0.961404
2,2014-06-12,"Lend These Guys $17,000 And Get Free Burritos ...",0.963081
3,2017-02-01,The Filibuster: A Senator’s Best Friend A Faus...,0.988326
4,2016-12-22,"Drink: forget claret, champagne and port – the...",1.0
5,2017-01-25,"‘Choose Facebook, revenge porn, zero hours’: w...",1.0
