In [14]:
import numpy as np
import pandas as pd
import os
import math
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/adon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/adon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/adon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
news_articles_1 = pd.read_json( "../News_Category_Dataset_v2.json", lines = True )
#news_articles_1 = pd.DataFrame()

news_articles_2a = pd.read_csv( "../articles1.csv" )
news_articles_2b = pd.read_csv( "../articles2.csv" )
news_articles_2c = pd.read_csv( "../articles3.csv" )
news_articles_2 = pd.concat( [news_articles_2a, news_articles_2b, news_articles_2c], ignore_index=True )
#news_articles_2 = pd.concat( [news_articles_2a], ignore_index=True )

In [17]:
len( news_articles_1 ) + len( news_articles_2 )

343423

In [18]:
# empty dataframe for our training data
news_articles = pd.DataFrame( columns=[
    'id', 'category', 'headline', 'description', 
    'authors', 'publication', 'date', 'link' ] )

In [19]:
# clean our source data
news_articles_1.rename( columns={'short_description': 'description'}, inplace=True )
news_articles_1.insert( 0, 'id', range( 0, len( news_articles_1 ) ))
news_articles_1["publication"] = "unknown"

news_articles_2.rename( columns={'title': 'headline'}, inplace=True )
news_articles_2.rename( columns={'url': 'link'}, inplace=True )
news_articles_2.rename( columns={'content': 'description'}, inplace=True )
news_articles_2.rename( columns={'author': 'authors'}, inplace=True )
news_articles_2.drop( columns=['year', 'month'], inplace=True )
news_articles_2["category"] = "unknown"
news_articles_2['date'] = pd.to_datetime(news_articles_2['date'], format='%Y-%m-%d' )

In [20]:
# combine sources
news_articles = pd.concat( [news_articles_1, news_articles_2], ignore_index=True )
# reduce author name(s), publisher, and category fields to single string
news_articles['authors'] = news_articles['authors'].str.replace(" ","")
news_articles['publication'] = news_articles['publication'].str.replace(" ","")
news_articles['category'] = news_articles['category'].str.replace(" ","")
# and create combined headline and description to lemma
news_articles['text'] = news_articles['headline'] + " " + news_articles['description']


In [21]:
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343423 entries, 0 to 343422
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   id           343423 non-null  int64         
 1   category     343423 non-null  object        
 2   headline     343421 non-null  object        
 3   authors      327547 non-null  object        
 4   link         286412 non-null  object        
 5   description  343423 non-null  object        
 6   date         340782 non-null  datetime64[ns]
 7   publication  343423 non-null  object        
 8   Unnamed: 0   142570 non-null  float64       
 9   text         343421 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 26.2+ MB


In [22]:
news_articles.head()

Unnamed: 0.1,id,category,headline,authors,link,description,date,publication,Unnamed: 0,text
0,0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,MelissaJeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,unknown,,There Were 2 Mass Shootings In Texas Last Week...
1,1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,AndyMcDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,unknown,,Will Smith Joins Diplo And Nicky Jam For The 2...
2,2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,RonDicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,unknown,,Hugh Grant Marries For The First Time At Age 5...
3,3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,RonDicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,unknown,,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,RonDicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,unknown,,Julianna Margulies Uses Donald Trump Poop Bags...


In [23]:
#news_articles = news_articles[news_articles['date'] >= pd.Timestamp(2018,1,1)]

In [24]:
news_articles.shape

(343423, 10)

In [25]:
#news_articles = news_articles[news_articles['text'].apply(lambda x: len(x.split())>5)]
#print("Total number of articles after removal of those with short title:", news_articles.shape[0])

In [26]:
news_articles.sort_values('text',inplace=True, ascending=False)
duplicated_articles_series = news_articles.duplicated('text', keep = False)
news_articles = news_articles[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", news_articles.shape[0])

Total number of articles after removing duplicates: 342290


In [27]:
news_articles.index = range(news_articles.shape[0])

In [28]:
# Adding a new column containing both day of the week and month, it will be required later while recommending based on day of the week and month
news_articles["day and month"] = news_articles["date"].dt.strftime("%a") + "_" + news_articles["date"].dt.strftime("%b")

In [29]:
news_articles_temp = news_articles.copy()

In [30]:
stop_words = set(stopwords.words('english'))

In [31]:
for i in range(len(news_articles_temp["text"])):
    string = ""
    for word in news_articles_temp["text"][i].split():
        #print(word)
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
          string += word + " "  
    if(i%10000==0):
      print(i)           # To track number of records processed
    news_articles_temp.at[i,"text"] = string.strip()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000


In [32]:
lemmatizer = WordNetLemmatizer()

In [33]:
#nltk.download('omw-1.4')

In [34]:
for i in range(len(news_articles_temp["text"])):
    string = ""
    for w in word_tokenize(news_articles_temp["text"][i]):
        string += lemmatizer.lemmatize(w,pos = "v") + " "
    news_articles_temp.at[i, "text"] = string.strip()
    if(i%10000==0):
        print(i)           # To track number of records processed

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000


# TF-IDF method

In [35]:
tfidf_text_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_text_features = tfidf_text_vectorizer.fit_transform(news_articles_temp['text'])

In [36]:
def tfidf_based_model(row_index, num_similar_items=6):
    couple_dist = pairwise_distances(tfidf_text_features,tfidf_text_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'text':news_articles['text'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    #print("="*30,"Queried article details","="*30)
    print('text : ',news_articles['text'][indices[0]])
    #print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]
tfidf_based_model(2)

text :  “We’re going out in a blaze of glory” — why NASA is crashing a 20-year-old spacecraft into Saturn  In the 20 years it’s been in space, the Cassini spacecraft has seen storms on Saturn’s surface, sent a probe down to the planet’s moon Titan, and shown us the eerie beauty of the gas giant’s hexagonal north pole. As it is rapidly running out of fuel, it’s been given one last mission before it goes offline in September: to solve the mystery of the planet’s rings.    “We’re uncertain by quite a large margin about how much stuff is really there [in the rings],” Preston Dyches, a NASA spokesperson for the Cassini mission, said in December. “That has major implication for how they formed and how old they are. That’s still one of the biggest mysteries of Saturn  —   how did it get these rings  —   which tells us things about how planets form and how planets form around other stars. ”  Studying Saturn’s rings helps scientists understand how planets and solar systems form. It’s likely the

Unnamed: 0,publish_date,text,Euclidean similarity with the queried article
1,2017-04-26,The Cassini spacecraft’s dive in between Satur...,0.475123
2,2017-04-05,Cassini spacecraft to dive inside Saturn’s rin...,0.698914
3,2017-04-26,Cassini spacecraft prepares dive into Saturn’s...,0.717112
4,2017-04-27,U.S. spacecraft shares first view from inside ...,0.720067
5,2017-04-13,Watch NASA’s latest discovery about “ocean wor...,0.723536


# Weighted similarity based on category, publish day and author

In [37]:
from sklearn.preprocessing import OneHotEncoder 

In [38]:
category_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["category"]).reshape(-1,1))

In [39]:
publishingday_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["day and month"]).reshape(-1,1))

In [40]:
authors_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["authors"]).reshape(-1,1))

In [41]:
def avg_TFIDF_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4): #headline_preference = True, category_preference = False):
    w2v_dist  = pairwise_distances(tfidf_text_features,tfidf_text_features[row_index].reshape(1,-1))
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/float(w1 + w2 + w3 + w4)
    indices = np.argsort(weighted_couple_dist.flatten())[0:num_similar_items].tolist()
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
                'headline_text':news_articles['headline'][indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),   
                'Publishing day based Euclidean similarity': publishingday_dist[indices].ravel(), 
                'Categoty': news_articles['category'][indices].values,
                'Authors': news_articles['authors'][indices].values,
                'Day and month': news_articles['day and month'][indices].values})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'][indices[0]])
    print('Categoty : ', news_articles['category'][indices[0]])
    print('Authors : ', news_articles['authors'][indices[0]])
    print('Day and month : ', news_articles['day and month'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,[1,7,8,9]]
    return df.iloc[1:, ]


In [58]:
news_articles.shape

(342290, 11)

In [42]:
#interesting results...
avg_TFIDF_with_category_authors_and_publshing_day(52,10,0.5,0.2,0.2,0.1)

headline :  ’Zen And The Art of Motorcycle Maintenance’ Author Robert M. Pirsig Dies At 88
Categoty :  unknown
Authors :  LaurelWamsley
Day and month :  Mon_Apr



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2017-04-24,Robert Pirsig: Zen and the Art of Motorcycle M...,1.128348,0.69101,1.0,2.414214,1.0,unknown,,Mon_Apr
2,2017-04-25,Author Robert Pirsig dies at 88,1.317602,0.786677,1.0,2.414214,2.414214,unknown,AssociatedPress,Tue_Apr
3,2017-02-21,Who’s Who In The Race For DNC Chair — And Thei...,1.329411,1.37598,1.0,1.0,2.414214,unknown,LaurelWamsley,Tue_Feb
4,2017-06-12,"One Year After Pulse Shooting, Orlando Honors ...",1.331638,1.380434,1.0,1.0,2.414214,unknown,LaurelWamsley,Mon_Jun
5,2017-05-15,"Like Most White House Kids, Barron Trump Will ...",1.333259,1.383675,1.0,1.0,2.414214,unknown,LaurelWamsley,Mon_May
6,2017-05-27,White Supremacist Charged With Killing 2 In Po...,1.333388,1.383934,1.0,1.0,2.414214,unknown,LaurelWamsley,Sat_May
7,2017-03-31,Serial Killer Dubbed ’Angel Of Death’ Dies Aft...,1.333406,1.383969,1.0,1.0,2.414214,unknown,LaurelWamsley,Fri_Mar
8,2016-12-29,The Top Stories On NPR.org This Year,1.334241,1.385639,1.0,1.0,2.414214,unknown,LaurelWamsley,Thu_Dec
9,2017-04-26,"In Surprise TED Talk, Pope Francis Asks The Po...",1.334797,1.386751,1.0,1.0,2.414214,unknown,LaurelWamsley,Wed_Apr


In [48]:
avg_TFIDF_with_category_authors_and_publshing_day(52,5,0.6,0.2,0.1,0.1)

headline :  ’Zen And The Art of Motorcycle Maintenance’ Author Robert M. Pirsig Dies At 88
Categoty :  unknown
Authors :  LaurelWamsley
Day and month :  Mon_Apr



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2017-04-24,Robert Pirsig: Zen and the Art of Motorcycle M...,0.956027,0.69101,1.0,2.414214,1.0,unknown,,Mon_Apr
2,2017-04-25,Author Robert Pirsig dies at 88,1.154849,0.786677,1.0,2.414214,2.414214,unknown,AssociatedPress,Tue_Apr
3,NaT,And There We Have it,1.282843,1.0,1.0,2.414214,2.414214,unknown,JoshMarshall,
4,2017-04-03,Documentarian Says ’Anarchist Cookbook’ Author...,1.350916,1.349158,1.0,2.414214,1.0,unknown,KellyMcEvers,Mon_Apr


In [50]:
# this one performs particularly well
avg_TFIDF_with_category_authors_and_publshing_day(56001,25,0.6,0.2,0.1,0.1)

headline :  Thousands of Flint residents could lose their homes over unpaid water bills
Categoty :  unknown
Authors :  MarkAbadi
Day and month :  Thu_May



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2017-05-18,Flint Residents Win Battle Over Losing Their ...,0.988426,0.745007,1.0,2.414214,1.0,unknown,BriannaSacks,Thu_May
2,2016-02-04,Unpaid Water Bills In Flint Could Hinder Repairs,1.104222,0.702298,1.0,2.414214,2.414214,unknown,SteveCarmody,Thu_Feb
3,2016-05-05,"Obama sips Flint water, urges children be test...",1.108531,0.945182,1.0,2.414214,1.0,unknown,TimothyGardner,Thu_May
4,2017-02-28,Flint residents must start paying for water th...,1.134226,0.752306,1.0,2.414214,2.414214,unknown,BradyDennis,Tue_Feb
5,2017-03-01,Michigan Ends Water Subsidies To Flint Despite...,1.14251,0.766112,1.0,2.414214,2.414214,unknown,MerritKennedy,Wed_Mar
6,2017-01-24,"Flint water falls below federal lead limits, b...",1.147299,0.774093,1.0,2.414214,2.414214,unknown,MarkBerman,Tue_Jan
7,2016-04-20,Lead-Laced Water In Flint: A Step-By-Step Look...,1.170705,0.813104,1.0,2.414214,2.414214,unknown,MerritKennedy,Wed_Apr
8,2016-12-20,The Flint water crisis is not over,1.18397,0.835213,1.0,2.414214,2.414214,unknown,ConnorCoyne,Tue_Dec
9,2016-02-15,"The Flint water crisis, explained",1.185117,0.837124,1.0,2.414214,2.414214,unknown,LibbyNelson,Mon_Feb
10,2016-05-04,"Watch: President Obama speaks at Flint, Michig...",1.187538,0.841159,1.0,2.414214,2.414214,unknown,GermanLopez,Wed_May


In [52]:
# but we dilute it quickly with the subject i think
avg_TFIDF_with_category_authors_and_publshing_day(56001,5,0.3,0.3,0.2,0.1)

headline :  Thousands of Flint residents could lose their homes over unpaid water bills
Categoty :  unknown
Authors :  MarkAbadi
Day and month :  Thu_May



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2017-05-25,Government officials in the UK are fuming over...,1.132206,1.396619,1.0,1.0,1.0,unknown,MarkAbadi,Thu_May
2,2017-05-18,Flint Residents Win Battle Over Losing Their ...,1.229272,0.745007,1.0,2.414214,1.0,unknown,BriannaSacks,Thu_May
3,2017-01-06,3 more states are proposing ’bathroom bills’ t...,1.256705,1.298711,1.0,1.0,2.414214,unknown,MarkAbadi,Fri_Jan
4,2016-05-14,Here are the 10 most and least popular governo...,1.261974,1.314519,1.0,1.0,2.414214,unknown,MarkAbadi,Sat_May


In [56]:
# same with the authors ... it quickly poisons
avg_TFIDF_with_category_authors_and_publshing_day(56001,10,0.59,0.005,0.3,0.005)

headline :  Thousands of Flint residents could lose their homes over unpaid water bills
Categoty :  unknown
Authors :  MarkAbadi
Day and month :  Thu_May



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2017-01-06,3 more states are proposing ’bathroom bills’ t...,1.203678,1.298711,1.0,1.0,2.414214,unknown,MarkAbadi,Fri_Jan
2,2016-05-14,Here are the 10 most and least popular governo...,1.214041,1.314519,1.0,1.0,2.414214,unknown,MarkAbadi,Sat_May
3,2017-03-09,Texas’ controversial ’bathroom bill’ cleared i...,1.23043,1.339518,1.0,1.0,2.414214,unknown,MarkAbadi,Thu_Mar
4,2017-03-31,’This is a bait and switch’: Liberal groups ar...,1.240191,1.354409,1.0,1.0,2.414214,unknown,MarkAbadi,Fri_Mar
5,2016-12-22,North Carolina Republicans issued a bizarre st...,1.247233,1.365151,1.0,1.0,2.414214,unknown,MarkAbadi,Thu_Dec
6,2016-06-17,The US dropped 67 nuclear bombs on this tiny i...,1.247895,1.36616,1.0,1.0,2.414214,unknown,MarkAbadi,Fri_Jun
7,2017-03-31,North Carolina just repealed its notorious ’ba...,1.24983,1.369111,1.0,1.0,2.414214,unknown,MarkAbadi,Fri_Mar
8,2016-12-21,North Carolina’s embattled departing governor ...,1.25162,1.371843,1.0,1.0,2.414214,unknown,MarkAbadi,Wed_Dec
9,2016-12-08,People in North Carolina are freaking out over...,1.253478,1.374677,1.0,1.0,2.414214,unknown,MarkAbadi,Thu_Dec


In [57]:
# but around .2 the authors work nicely here
avg_TFIDF_with_category_authors_and_publshing_day(56001,10,0.69,0.005,0.2,0.005)

headline :  Thousands of Flint residents could lose their homes over unpaid water bills
Categoty :  unknown
Authors :  MarkAbadi
Day and month :  Thu_May



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2016-02-04,Unpaid Water Bills In Flint Could Hinder Repairs,1.093888,0.702298,1.0,2.414214,2.414214,unknown,SteveCarmody,Thu_Feb
2,2017-05-18,Flint Residents Win Battle Over Losing Their ...,1.118775,0.745007,1.0,2.414214,1.0,unknown,BriannaSacks,Thu_May
3,2017-02-28,Flint residents must start paying for water th...,1.132227,0.752306,1.0,2.414214,2.414214,unknown,BradyDennis,Tue_Feb
4,2017-03-01,Michigan Ends Water Subsidies To Flint Despite...,1.142813,0.766112,1.0,2.414214,2.414214,unknown,MerritKennedy,Wed_Mar
5,2017-01-24,"Flint water falls below federal lead limits, b...",1.148931,0.774093,1.0,2.414214,2.414214,unknown,MarkBerman,Tue_Jan
6,2016-04-20,Lead-Laced Water In Flint: A Step-By-Step Look...,1.178839,0.813104,1.0,2.414214,2.414214,unknown,MerritKennedy,Wed_Apr
7,2017-05-03,Some Flint Residents Could Face Foreclosure Ov...,1.194253,0.82296,2.414214,2.414214,2.414214,POLITICS,ArthurDelaney,Wed_May
8,2016-12-20,The Flint water crisis is not over,1.195789,0.835213,1.0,2.414214,2.414214,unknown,ConnorCoyne,Tue_Dec
9,2016-02-15,"The Flint water crisis, explained",1.197255,0.837124,1.0,2.414214,2.414214,unknown,LibbyNelson,Mon_Feb


In [59]:
# looking really good
avg_TFIDF_with_category_authors_and_publshing_day(256001,10,0.69,0.005,0.2,0.005)

headline :  Divorce, Illness and Compassion
Categoty :  DIVORCE
Authors :  RobinAmosKahn,Contributor
Writer,Speaker,LeadCoachatOwntheRoom
Day and month :  Thu_Jun



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2012-10-01,The Joy of Divorce,1.25262,1.319257,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Mon_Oct
2,2014-02-20,"Divorce: ""You Will Survive""",1.270997,1.343226,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Thu_Feb
3,2013-08-26,"Thank You, G-D, for This Hellish Divorce and A...",1.276467,1.350361,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Mon_Aug
4,2013-11-07,How Pema Chodron Saved My Life,1.280184,1.344962,2.414214,1.0,2.414214,WELLNESS,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Thu_Nov
5,2013-06-11,Out of the Nest and Into the Fire,1.290292,1.368394,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Tue_Jun
6,2013-08-14,Write for Your Life,1.290964,1.36927,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Wed_Aug
7,2014-04-09,The Freedom of Letting Go,1.291731,1.37027,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Wed_Apr
8,2013-10-04,"Men, Divorce, and Love",1.299527,1.380439,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Fri_Oct
9,2013-07-12,Welcome to Crazy Time,1.302892,1.384829,1.0,1.0,2.414214,DIVORCE,"RobinAmosKahn,Contributor\nWriter,Speaker,Lead...",Fri_Jul


In [60]:
# this one doesn't perform well
avg_TFIDF_with_category_authors_and_publshing_day(156007,10,0.69,0.005,0.2,0.005)

headline :  Minister Set Fire To Own Home, Then Lied To FBI About Staged Hate Crime: Prosecutors
Categoty :  CRIME
Authors :  RichmondTimes-Dispatch,RichmondTimes-Dispatch
Day and month :  Sat_Nov



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2015-05-24,"No, No, No!",1.329983,1.0,2.414214,2.414214,2.414214,COMEDY,"MarciaLiss,Contributor(Almost)FamousCartoonist",Sun_May
2,NaT,And There We Have it,1.329983,1.0,2.414214,2.414214,2.414214,unknown,JoshMarshall,
3,2014-07-02,Over,1.329983,1.0,2.414214,2.414214,2.414214,FIFTY,"LisaK.Brown,ContributorFreelancewriterandbemus...",Wed_Jul
4,2014-06-05,Because I Can,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"JudithGreenberg,Ph.D.,ContributorGallatinSchoo...",Thu_Jun
5,2015-04-05,B Is for...,1.329983,1.0,2.414214,2.414214,2.414214,COMEDY,"MarciaLiss,Contributor(Almost)FamousCartoonist",Sun_Apr
6,2015-10-11,Once.,1.329983,1.0,2.414214,2.414214,2.414214,COMEDY,"MarciaLiss,Contributor(Almost)FamousCartoonist",Sun_Oct
7,2016-06-05,L O V E,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"SOEMOELWIN,ContributorArtistandExplorer,always...",Sun_Jun
8,2014-11-25,As We Are,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"FrancescaMilliken,ContributorWritesaboutbeingh...",Tue_Nov
9,2015-04-29,WHO Are You Now ?,1.329983,1.0,2.414214,2.414214,2.414214,ENTERTAINMENT,"Rev.PeterE.Bauer,ContributorUnitedChurchofChri...",Wed_Apr


In [62]:
# not bad ... some repeat recommendations, but that is to be expected
avg_TFIDF_with_category_authors_and_publshing_day(330001,10,0.69,0.005,0.2,0.005)

headline :  11 Times Latinos Stole The Show In 2015
Categoty :  LATINOVOICES
Authors :  CarolinaMorenoandTanishaLoveRamirez
Day and month :  Fri_Dec



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2016-12-28,13 Times Latinos Filled Us With Hope And Pride...,1.107596,1.130094,1.0,1.0,2.414214,LATINOVOICES,CarolinaMorenoandTanishaLoveRamirez,Wed_Dec
2,2015-09-10,19 Non-Latino Celebrities Show Off Their Spani...,1.31481,1.400374,1.0,1.0,2.414214,LATINOVOICES,CarolinaMorenoandTanishaLoveRamirez,Thu_Sep
3,2015-05-24,"No, No, No!",1.329983,1.0,2.414214,2.414214,2.414214,COMEDY,"MarciaLiss,Contributor(Almost)FamousCartoonist",Sun_May
4,NaT,And There We Have it,1.329983,1.0,2.414214,2.414214,2.414214,unknown,JoshMarshall,
5,2014-11-25,As We Are,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"FrancescaMilliken,ContributorWritesaboutbeingh...",Tue_Nov
6,2016-06-05,L O V E,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"SOEMOELWIN,ContributorArtistandExplorer,always...",Sun_Jun
7,2014-07-02,Over,1.329983,1.0,2.414214,2.414214,2.414214,FIFTY,"LisaK.Brown,ContributorFreelancewriterandbemus...",Wed_Jul
8,2015-04-05,B Is for...,1.329983,1.0,2.414214,2.414214,2.414214,COMEDY,"MarciaLiss,Contributor(Almost)FamousCartoonist",Sun_Apr
9,2014-06-05,Because I Can,1.329983,1.0,2.414214,2.414214,2.414214,HEALTHYLIVING,"JudithGreenberg,Ph.D.,ContributorGallatinSchoo...",Thu_Jun
