### Importing necessary Libraries

In [89]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [90]:

import warnings
warnings.filterwarnings("ignore")

### Loading Data

In [91]:
# The file stores one JSON record per line, hence lines=True.
news_articles = pd.read_json(path_or_buf="News_Category_Dataset_v2.json", lines=True)

In [92]:
# Show column dtypes, non-null counts and memory usage of the loaded frame.
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


The dataset contains about two hundred thousand records (200,853), each with six features.

In [93]:
# Preview the first rows of the raw data.
news_articles.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,"There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89,She left her husband. He killed their children. Just another day in America.,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song,Andy McDonald,https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-grant-marries_us_5b09212ce4b0568a880b9a8c,The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carrey-adam-schiff-democrats_us_5b0950e8e4b0fdb2aa53e675,The actor gives Dems an ass-kicking for not fighting hard enough against Donald Trump.,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-margulies-trump-poop-bag_us_5b093ec2e4b0fdb2aa53df70,"The ""Dietland"" actress said using the bags is a ""really cathartic, therapeutic moment.""",2018-05-26


### Data Preprocessing

#### 1 Featuring Only the Articles from 2018

- The full dataset is large, so processing (and re-processing) all of it would take a long time. We therefore keep only the most recent articles, i.e. those published in 2018.

In [94]:
# Keep only the articles dated 1 January 2018 or later.
news_articles = news_articles.loc[news_articles['date'].ge(pd.Timestamp(year=2018, month=1, day=1))]

In [95]:
# (rows, columns) remaining after the date filter.
news_articles.shape

(8583, 6)

The total number of news articles is now down to 8583.

#### 2 Removing the short headline Articles

- After stop-word removal, articles with a very short headline may end up with a blank headline, so let's keep only the articles whose headline has more than five words.

In [96]:
# Retain only headlines made of more than five whitespace-separated words.
news_articles = news_articles.loc[news_articles['headline'].str.split().str.len() > 5]

In [97]:
# Report how many rows survived the short-headline filter.
print("Total Number of articles after Removal of headlines with short title:", len(news_articles))

Total Number of articles after Removal of headlines with short title: 8530


#### 3 Checking and removing all the duplicates

- Some articles share exactly the same headline, so let's remove all such articles.

In [98]:
# Order by headline (descending), then drop every article whose headline occurs
# more than once. keep=False marks all copies, so no copy of a repeated headline survives.
news_articles = news_articles.sort_values(by='headline', ascending=False)
duplicated_articles_series = news_articles.duplicated(subset='headline', keep=False)
news_articles = news_articles.loc[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", len(news_articles))

Total number of articles after removing duplicates: 8485


#### 4. Checking for missing values

In [99]:
# Count missing values per column.
news_articles.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

### Basic data Exploration

#### 1. Basic statistics- Number of articles, authors, categories

In [100]:
# Headline figures for the cleaned corpus: rows, distinct authors, distinct categories.
for label, value in (
    ("Total Number of ARTICLES:", len(news_articles)),
    ("Total Number of AUTHORS:", news_articles['authors'].nunique()),
    ("Total Number of Unique CATEGORIES:", news_articles['category'].nunique()),
):
    print(label, value)

Total Number of ARTICLES: 8485
Total Number of AUTHORS: 892
Total Number of Unique CATEGORIES: 26


#### 2. Distribution of Articles - CategoryWise

In [101]:
# Bar chart of article counts per category (value_counts sorts by descending count).
category_counts = news_articles['category'].value_counts()

fig = go.Figure(data=[go.Bar(x=category_counts.index, y=category_counts.values)])
fig.update_layout(
    title={'text': 'Distribution of articles category-wise',
           'y': 0.9,
           'x': 0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    xaxis_title="Category name",
    yaxis_title="Number of Articles",
    width=800,
    height=700,
)

fig

From the bar chart we can observe that the Politics category has the highest number of articles, followed by Entertainment, and so on.

#### 3. Number of Articles per Month

- Let's Group the data in monthly basis using resample() function

In [102]:
# Count headlines per calendar month (month-end bins keyed on the publication date).
news_articles_per_month = (
    news_articles
    .set_index('date')['headline']
    .resample('m')
    .count()
)

news_articles_per_month

date
2018-01-31    2065
2018-02-28    1694
2018-03-31    1778
2018-04-30    1580
2018-05-31    1368
Freq: M, Name: headline, dtype: int64

In [103]:
# Bar chart of article counts per month, labelled with abbreviated month names.
month_labels = news_articles_per_month.index.strftime("%b")

fig = go.Figure(data=[go.Bar(x=month_labels, y=news_articles_per_month)])
fig.update_layout(
    title={"text": 'Distribution of articles month-wise',
           'y': 0.9,
           'x': 0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    xaxis_title="Month",
    yaxis_title="Number of articles",
    width=500,
    height=500,
)

fig

- From the bar chart we can observe that January has the highest number of articles, followed by March.

#### 4. PDF for the Length of headlines

In [104]:
# Density curve of headline length, measured in characters (str.len()).
headline_lengths = news_articles['headline'].str.len()

fig = ff.create_distplot([headline_lengths], ["ht"], show_hist=False, show_rug=False)
fig.update_layout(
    title={'text': 'PDF',
           'y': 0.9,
           'x': 0.5,
           "xanchor": "center",
           "yanchor": "top"},
    xaxis_title="Length of a headline",
    yaxis_title="Probability",
    showlegend=False,
    width=500,
    height=500,
)

fig

- The probability density of headline length is close to a Gaussian (normal) distribution, with most headlines between 50 and 80 characters long.

### Feature Engineering steps

- After data preprocessing we are left with a subset of the original dataset, so its index labels are no longer contiguous.

- Let's Make the indices uniform ranging from 0 to total number of articles

In [105]:
# Replace the leftover labels with a fresh 0..n-1 index so labels equal row positions.
news_articles = news_articles.reset_index(drop=True)

- Adding a new column containing both day of week and month, it will be required later while recommending based on day of the week and month

In [106]:
# Abbreviated weekday and month joined by an underscore, e.g. "Sat_May".
news_articles["day and month"] = news_articles["date"].dt.strftime("%a_%b")


- Text preprocessing modifies the original headlines, and it does not make sense to display modified headlines in recommendations. So let's copy the dataset and perform text preprocessing on the copy.

In [107]:
# Independent working copy; text preprocessing below rewrites its headlines only.
news_articles_temp = news_articles.copy(deep=True)

## Text Preprocessing

#### 1. StopWord Removal

- Stop words are not much help in the analysis, and including them adds processing time, so let's remove them.

In [108]:
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [109]:
def _strip_stop_words(headline):
    """Lower-case a headline, strip non-alphanumeric characters and drop stop words.

    Args:
        headline: raw headline text.

    Returns:
        The surviving words joined by single spaces ("" when nothing survives).
    """
    kept_words = []
    for token in headline.split():
        cleaned = "".join(ch for ch in token if ch.isalnum()).lower()
        # A punctuation-only token (e.g. "-") cleans to "", which is not a stop word
        # and would otherwise be kept as a blank word.
        if cleaned and cleaned not in stop_words:
            kept_words.append(cleaned)
    return " ".join(kept_words)


# Apply per headline instead of looping over positions with chained label lookups
# (news_articles_temp["headline"][i]), which only works while labels equal positions.
news_articles_temp["headline"] = news_articles_temp["headline"].apply(_strip_stop_words)

0
1000
2000
3000
4000
5000
6000
7000
8000


### Lemmatization

In [110]:
# Shared lemmatizer instance used by the lemmatization cell below.
lemmatizer = WordNetLemmatizer()

In [111]:
def _lemmatize_headline(headline):
    """Lemmatize every token of a headline as a verb and re-join with single spaces.

    Args:
        headline: stop-word-free headline text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(headline))


# Apply per headline rather than indexing rows by position through label lookups,
# which silently depends on the index being 0..n-1.
news_articles_temp["headline"] = news_articles_temp["headline"].apply(_lemmatize_headline)


0
1000
2000
3000
4000
5000
6000
7000
8000


## Headline Based Similarity on New Articles

- Generally, We assess Similarity based on Distance. If the distance is minimum then higher similarity and if it is maximum then low similarity.

- To Calculate the distance, we need to represent the headlines as a d-dimensional vector, Then we can find out the similarity based on the distance between vectors.

- There are multiple methods to represent a text as a d-dimensional vector, such as Bag of Words, TF-IDF, Word2Vec embeddings, etc.

- Each Method has it's own limits of advantages and disadvantages.

### Let's Apply all the Methods and Explore

#### 1. Using Bag Of Words Method 

- The Bag of Words method represents the occurrence of words within a document. Here each headline can be considered a document, and the set of all headlines forms the corpus.

- Using the Bag of Words approach, each document is represented by a d-dimensional vector, where d is the total number of unique words in the corpus. The set of such unique words forms the vocabulary.

In [112]:
# Learn the vocabulary and build the document-term count matrix in one pass.
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(
    raw_documents=news_articles_temp['headline']
)


In [113]:
# (number of headlines, vocabulary size) of the count matrix.
headline_features.get_shape()

(8485, 11122)

- The output of the Bag of Words vectorizer is a sparse matrix.

In [114]:
# Display very long headlines completely. None means "no truncation"; the old
# sentinel -1 was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [115]:
def bag_of_words_based_model(row_index, num_similar_items):
    """Recommend articles whose Bag-of-Words headline vector is closest to the query's.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.

    Returns:
        DataFrame of recommendations (publish date, headline, distance), nearest
        first. The distance column is labelled "similarity" but holds Euclidean
        distances: smaller means more similar.
    """
    couple_dist = pairwise_distances(headline_features, headline_features[row_index])
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(couple_dist.shape[0])[row_index]
    ranked = np.argsort(couple_dist.ravel())
    # Do not assume the query sorts first: another article at distance 0 could take
    # that slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
                      'headline':news_articles['headline'].iloc[indices].values,
                      'Euclidean similarity with the queried aricle': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline:', news_articles['headline'].iloc[query_pos])
    print('\n', '='*25,"Recommended articles:","="*25)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:,]
bag_of_words_based_model(133,11)
# We can change the row index for any other queried articles

headline: Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried aricle
1,2018-04-02,The Trump Administration Is Suing California Again,2.828427
2,2018-05-01,Texas Sues Trump Administration To End DACA,3.162278
3,2018-03-07,Stormy Daniels Suing Trump Over Nondisclosure Agreement,3.162278
4,2018-04-28,Trump: Mueller Should Never Have Been Appointed,3.162278
5,2018-04-24,Spanish Woman Looks More Like Trump Than The Donald Himself,3.162278
6,2018-02-12,What You Should Know About Trump's Nihilist Budget,3.162278
7,2018-05-09,The Caliphate Of Trump And A Planet In Ruins,3.162278
8,2018-03-26,Trump Ally Sues Qatar For Hacking His Email,3.162278
9,2018-02-21,All They Will Call You Will Be Deportees,3.162278
10,2018-04-11,Pursuing Desegregation In The Trump Era,3.162278


- The above function recommends 10 articles similar to the queried (read) article based on the headline. It accepts two arguments: the index of the already-read article and the number of rows to rank, which includes the queried article itself (hence 11 for 10 recommendations).

- Based on the Euclidean distance it finds out 10 nearest neighbors and recommends.

##### Disadvantages

1. It gives very low importance to less frequently observed words in the corpus. Few words from the queried article like "employer", "flip", "fire" appear less frequently in the entire corpus so BoW method does not recommend any article whose headline contains these words. Since trump is commonly observed word in the corpus so it is recommending the articles with headline containing "trump".
2. BoW method doesn't preserve the order of words.



- To overcome the first disadvantage we use TF-IDF method for feature representation.



#### 2. Using TF-IDF Method

- TF-IDF method is a weighted measure which gives more importance to less frequent words in a corpus. It assigns a weight to each term(word) in a document based on Term frequency(TF) and inverse document frequency(IDF).

- TF(i,j) = (# times word i appears in document j) / (# words in document j)

- IDF(i,D) = log_e( (# documents in the corpus D) / (# documents containing word i) )

- weight(i,j) = TF(i,j) x IDF(i,D)

- So if a word occurs more number of times in a document but less number of times in all other documents then its TF-IDF value will be high.



In [116]:
# min_df=1 keeps every term that occurs in at least one headline, i.e. no pruning.
# This is what the former integer 0 amounted to, but 0 is rejected by the parameter
# validation in recent scikit-learn releases.
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 1)

tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp['headline'])


In [117]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend articles whose TF-IDF headline vector is closest to the query's.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.

    Returns:
        DataFrame of recommendations (publish date, headline, distance), nearest
        first. The distance column is labelled "similarity" but holds Euclidean
        distances: smaller means more similar.
    """
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(couple_dist.shape[0])[row_index]
    ranked = np.argsort(couple_dist.ravel())
    # Do not assume the query sorts first: another article at distance 0 could take
    # that slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)

    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-05-21,The Supreme Court Just Made It A Lot Harder For You To Sue Your Employer,1.164067
2,2018-04-02,The Trump Administration Is Suing California Again,1.253867
3,2018-04-10,"Lou Dobbs Flips Out On Live TV, Urges Trump To 'Fire The SOB' Robert Mueller",1.25881
4,2018-04-26,Cardi B's Former Manager Sues Her For $10 Million,1.268704
5,2018-04-03,A Third Woman Is Suing To Break A Trump-Related Nondisclosure Agreement,1.274264
6,2018-02-24,Former RNC Chair Fires Back At Claim He Was Only Hired Because He Was Black,1.274847
7,2018-01-16,State Employer Side Payroll Taxes And Loser Liberalism,1.276696
8,2018-02-21,Democrats Flip Kentucky State House Seat Where Trump Won Overwhelmingly,1.282008
9,2018-01-09,Big Tax Game Hunting: Employer Side Payroll Taxes,1.285147
10,2018-02-28,Democrats Flip 2 More GOP-Held State House Seats,1.287403


- Compared to BoW method, here TF-IDF method recommends the articles with headline containing words like "employer", "fire", "flip" in top 5 recommendations and these words occur less frequently in the corpus.

#### - Disadvantages :-

- Bow and TF-IDF method do not capture semantic and syntactic similarity of a given word with other words but this can be captured using Word embeddings.



- For example: there is a good association between words like "trump" and "white house", "office and employee", "tiger" and "leopard", "USA" and "Washington D.C" etc. Such kind of semantic similarity can be captured using word embedding techniques. Word embedding techniques like Word2Vec, GloVe and fastText leverage semantic similarity between words.



#### 3. Using Word2Vec Embedding

- Word2Vec is a technique for capturing semantic similarity, introduced by Google in 2013. For a given corpus, it learns patterns during training and represents each word by a d-dimensional vector. To get good results we need a fairly large corpus.

- Since our corpus is small, let's use Google's model pretrained on Google News articles. This standard model contains vector representations for about 3 million words and phrases, obtained by training on roughly 100 billion words of news text. Each word is represented by a 300-dimensional dense vector.


In [119]:
from gensim.models import Word2Vec
from gensim.models import keyedvectors
import pickle

This pre-trained Word2Vec model is about 1.5 GB in compressed form, so loading it after unzipping requires a machine with plenty of RAM.

Here, we are loading this pre-build model from a pickle file which contains this model in advance.

In [120]:
# Directory that holds the pre-trained Word2Vec files. Set the GOOGLE_NEWS_DIR
# environment variable to point elsewhere; the fallback is the original local path.
os.chdir(os.environ.get("GOOGLE_NEWS_DIR", r'/Users/nadeemm/Desktop/Google_news'))

In [123]:
# List the current working directory to confirm the model files are present.
!ls

In [None]:
# Load the pre-built Word2Vec model from a pickle file.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('googlew2v/word2vec_model', 'rb') as file:
    loaded_model = pickle.load(file)

Since the model gives vector representation for each word but we calculate the distance between headlines so we need to obtain vector representation for each headline. One way to obtain this is by first adding vector representation of all the words available in the headline and then calculating the average. It is also known as average Word2Vec model.

Below code cell performs the same.

In [None]:
# Build one 300-dimensional vector per headline by averaging its word vectors.
# assumes loaded_model is a dict-like mapping word -> 300-d vector — TODO confirm
vocabulary = loaded_model.keys()
w2v_headline = []
for headline in news_articles_temp['headline']:
    tokens = headline.split()
    headline_vector = np.zeros(300, dtype="float32")
    for token in tokens:
        if token in vocabulary:
            headline_vector = np.add(headline_vector, loaded_model[token])
    # A headline left empty by preprocessing would divide by zero and produce a NaN
    # vector, which pairwise_distances rejects; keep the zero vector instead.
    # NOTE(review): the divisor counts every token, including out-of-vocabulary ones.
    if tokens:
        headline_vector = np.divide(headline_vector, len(tokens))
    w2v_headline.append(headline_vector)
w2v_headline = np.array(w2v_headline)

In [None]:
def avg_w2v_based_model(row_index, num_similar_items):
    """Recommend articles whose averaged Word2Vec headline vector is closest to the query's.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.

    Returns:
        DataFrame of recommendations (publish date, headline, distance), nearest
        first. The distance column is labelled "similarity" but holds Euclidean
        distances: smaller means more similar.
    """
    couple_dist = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(couple_dist.shape[0])[row_index]
    ranked = np.argsort(couple_dist.ravel())
    # Do not assume the query sorts first: another article at distance 0 could take
    # that slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:,]

avg_w2v_based_model(133, 11)


Here, the Word2Vec-based representation recommends headlines containing the words "white house", which are associated with the word "trump" in the queried article. Similarly, it recommends headlines with words like "official" and "insist", which are semantically similar to the words "employer" and "sue" in the queried headline.

So far we were recommending using only one feature i.e. headline but in order to make a robust recommender system we need to consider multiple features at a time. Based on the business interest and rules, we can decide weight for each feature.

Let's see different models with combinations of different features for article similarity.

#### 4. Weighted similarity based on headline and category

Let's first see articles similarity based on headline and category. We are using onehot encoding to get feature representation for category.

Sometimes, as per the business requirements, we may need to give more preference to articles from the same category. In such cases we can assign more weight to category while recommending. The higher the weight, the greater the significance of that feature; similarly, a lower weight gives a feature less significance.

In [125]:
from sklearn.preprocessing import OneHotEncoder 


In [126]:
# One-hot encode the category; the encoder needs a 2-D (n_samples, 1) input.
category_column = news_articles_temp["category"].to_numpy().reshape(-1, 1)
category_onehot_encoded = OneHotEncoder().fit_transform(category_column)

In [None]:
def avg_w2v_with_category(row_index, num_similar_items, w1,w2):
    """Recommend articles by a weighted mix of headline (Word2Vec) and category distance.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.
        w1: weight of the headline (averaged Word2Vec) distance.
        w2: weight of the category (one-hot) distance.

    Returns:
        DataFrame of recommendations with the weighted and per-feature distances,
        nearest first. Columns labelled "similarity" hold Euclidean distances:
        smaller means more similar.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 shifts every row by the same constant, so it does not change the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist)/float(w1 + w2)
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(weighted_couple_dist.shape[0])[row_index]
    ranked = np.argsort(weighted_couple_dist.flatten())
    # Do not assume the query sorts first: an article tied with it could take that
    # slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                 'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Categoty': news_articles['category'].iloc[indices].values})

    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:, ]

avg_w2v_with_category(528,10,0.1,0.8)


Here, Word2Vec based representation recommends the headlines containing the word white house which is associated with the word trump in the queried article. Similarly, it recommends the headlines with words like "offical", "insist" which have semantic similarity to the words "employer", "sue" in the queried headline.

So far we were recommending using only one feature i.e. headline but in order to make a robust recommender system we need to consider multiple features at a time. Based on the business interest and rules, we can decide weight for each feature.

Let's see different models with combinations of different features for article similarity.

#### 5. Weighted similarity based on headline and category

Let's first see articles similarity based on headline and category. We are using onehot encoding to get feature representation for category.

Sometimes as per the business requirements, we may need to give more preference to the articles from the same category. In such cases we can assign more weight to category while recommending. Higher the weight, more the significance. Similarly less weight leads to less signficance to a particular feature.

In [128]:
from sklearn.preprocessing import OneHotEncoder 

In [129]:
# One-hot encode the category; the encoder needs a 2-D (n_samples, 1) input.
category_column = news_articles_temp["category"].to_numpy().reshape(-1, 1)
category_onehot_encoded = OneHotEncoder().fit_transform(category_column)


In [None]:
def avg_w2v_with_category(row_index, num_similar_items, w1,w2):
    """Recommend articles by a weighted mix of headline (Word2Vec) and category distance.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.
        w1: weight of the headline (averaged Word2Vec) distance.
        w2: weight of the category (one-hot) distance.

    Returns:
        DataFrame of recommendations with the weighted and per-feature distances,
        nearest first. Columns labelled "similarity" hold Euclidean distances:
        smaller means more similar.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 shifts every row by the same constant, so it does not change the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist)/float(w1 + w2)
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(weighted_couple_dist.shape[0])[row_index]
    ranked = np.argsort(weighted_couple_dist.flatten())
    # Do not assume the query sorts first: an article tied with it could take that
    # slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                 'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Categoty': news_articles['category'].iloc[indices].values})

    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:, ]

avg_w2v_with_category(528,10,0.1,0.8)


Above function takes two extra arguments w1 and w2 for weights corresponding to headline and category. It is always a good practice to pass the weights in a range scaled from 0 to 1, where a value close to 1 indicates high weight whereas close to 0 indicates less weight.

Here, we can observe that the recommended articles are from the same category as the queried article category. This is due to passing of high value to w2.

#### 6 .  Weighted similarity based on headline, category and author
Now let's calculate article similarity based on author along with headline and category. Again, we encode the author through one-hot encoding.

In [130]:
# One-hot encode the authors field; the encoder needs a 2-D (n_samples, 1) input.
authors_column = news_articles_temp["authors"].to_numpy().reshape(-1, 1)
authors_onehot_encoded = OneHotEncoder().fit_transform(authors_column)


In [None]:
def avg_w2v_with_category_and_authors(row_index, num_similar_items, w1,w2,w3):
    """Recommend articles by a weighted mix of headline, category and author distance.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.
        w1: weight of the headline (averaged Word2Vec) distance.
        w2: weight of the category (one-hot) distance.
        w3: weight of the authors (one-hot) distance.

    Returns:
        DataFrame of recommendations with the weighted and per-feature distances,
        nearest first. Columns labelled "similarity" hold Euclidean distances:
        smaller means more similar.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 terms shift every row by the same constant, so they do not change the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist)/float(w1 + w2 + w3)
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(weighted_couple_dist.shape[0])[row_index]
    ranked = np.argsort(weighted_couple_dist.flatten())
    # Do not assume the query sorts first: an article tied with it could take that
    # slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
                'headline':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),
                'Categoty': news_articles['category'].iloc[indices].values,
                'Authors': news_articles['authors'].iloc[indices].values})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['authors'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:, ]


avg_w2v_with_category_and_authors(528,10,0.1,0.1,1)

Above function takes one extra weight argument w3 for author.

In the output, we can observe that the recommended articles are from the same author as the queried article, due to the high weight given to w3.

#### 7. Weighted similarity based on headline, category, author and publishing day
Now let's calculate article similarity based on the publishing day (day of the week and month) along with headline, category and author. Again, we encode this new feature through one-hot encoding.

In [132]:
# One-hot encode the "day and month" label; the encoder needs a 2-D (n_samples, 1) input.
publishingday_column = news_articles_temp["day and month"].to_numpy().reshape(-1, 1)
publishingday_onehot_encoded = OneHotEncoder().fit_transform(publishingday_column)

In [None]:
def avg_w2v_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4):
    """Recommend articles by a weighted mix of headline, category, author and publishing-day distance.

    Args:
        row_index: position of the queried (already read) article.
        num_similar_items: number of rows to rank *including* the query itself, so
            at most ``num_similar_items - 1`` recommendations are returned.
        w1: weight of the headline (averaged Word2Vec) distance.
        w2: weight of the category (one-hot) distance.
        w3: weight of the authors (one-hot) distance.
        w4: weight of the "day and month" (one-hot) distance.

    Returns:
        DataFrame of recommendations with the weighted and per-feature distances,
        nearest first. Columns labelled "similarity" hold Euclidean distances:
        smaller means more similar.
    """
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    # The +1 terms shift every row by the same constant, so they do not change the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/float(w1 + w2 + w3 + w4)
    # Normalise negative positions; raises IndexError when out of range.
    query_pos = range(weighted_couple_dist.shape[0])[row_index]
    ranked = np.argsort(weighted_couple_dist.flatten())
    # Do not assume the query sorts first: an article tied with it could take that
    # slot. Remove the query explicitly and put it at the front.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    indices = np.concatenate(([query_pos], neighbours)).astype(int)
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
                'headline_text':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),
                'Publishing day based Euclidean similarity': publishingday_dist[indices].ravel(),
                'Categoty': news_articles['category'].iloc[indices].values,
                'Authors': news_articles['authors'].iloc[indices].values,
                'Day and month': news_articles['day and month'].iloc[indices].values})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['authors'].iloc[query_pos])
    print('Day and month : ', news_articles['day and month'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the query itself; everything after it is a recommendation.
    return df.iloc[1:, ]


avg_w2v_with_category_authors_and_publshing_day(528,10,0.1,0.1,0.1,1)

Above function takes one extra weight argument w4 for day of the week and month.

In the output, we can observe that the recommended articles are from the same day of the week and month as the queried article, due to the high weight given to w4.