In [1]:
import numpy as np
import pandas as pd
import os
import math
import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word list, plus
# (presumably) the 'punkt' models for word_tokenize and the WordNet data for
# WordNetLemmatizer.
# NOTE(review): the recorded output of this cell shows all three downloads
# failing with an SSL certificate error (the cell value is False), so that run
# presumably used copies already present locally — confirm before running on a
# fresh machine.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1129)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1129)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1129)>


False

In [3]:
# Load the dataset from a JSON-lines file (lines=True: one JSON record per line).
# The path is relative to the notebook's working directory.
news_articles = pd.read_json("News_Category_Dataset_v2.json", lines = True)

In [4]:
# Use the shorter column name 'description' in place of 'short_description'.
news_articles = news_articles.rename(columns={'short_description': 'description'})

In [5]:
# Summarise the columns: names, non-null counts and dtypes.
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   category     200853 non-null  object        
 1   headline     200853 non-null  object        
 2   authors      200853 non-null  object        
 3   link         200853 non-null  object        
 4   description  200853 non-null  object        
 5   date         200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [6]:
# Preview the first rows of the raw table.
news_articles.head()

Unnamed: 0,category,headline,authors,link,description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [7]:
# Keep only the articles dated 1 January 2018 or later.
news_articles = news_articles.loc[
    news_articles['date'].ge(pd.Timestamp(year=2018, month=1, day=1))
]

In [8]:
# (rows, columns) remaining after the date filter.
news_articles.shape

(8583, 6)

In [9]:
# Keep only the articles whose description has more than five
# whitespace-separated words.
news_articles = news_articles[news_articles['description'].str.split().str.len() > 5]
print("Total number of articles after removal of description with short title:", len(news_articles))

Total number of articles after removal of description with short title: 7854


In [10]:
# Order the articles by description (descending), then drop every article whose
# description also appears on another row. keep=False flags *all* copies of a
# repeated description, so no representative of a duplicated group survives.
news_articles = news_articles.sort_values('description', ascending=False)
duplicated_articles_series = news_articles.duplicated('description', keep=False)
news_articles = news_articles.loc[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", len(news_articles))

Total number of articles after removing duplicates: 7828


In [11]:
# Renumber the rows 0..n-1 so that row labels and row positions coincide
# after the filtering and sorting above.
news_articles = news_articles.reset_index(drop=True)

In [12]:
# Combine the abbreviated weekday and month of the publication date into one
# label (e.g. "Thu_Mar"); it is needed later when recommending by publishing
# day and month.
news_articles["day and month"] = news_articles["date"].dt.strftime("%a_%b")

In [13]:
# Work on a copy: the text preprocessing below rewrites the 'description'
# column of news_articles_temp, while news_articles keeps the original text
# for display.
news_articles_temp = news_articles.copy()

In [14]:
# English stop words, held in a set for the membership tests in the cleaning loop.
stop_words = set(stopwords.words('english'))

In [15]:
# Normalise every description: strip the non-alphanumeric characters from each
# whitespace-separated word, lower-case it, and drop English stop words.
for i in range(len(news_articles_temp["description"])):
    kept_words = []
    # .at reads one cell by (row label, column), avoiding chained indexing.
    for word in news_articles_temp.at[i, "description"].split():
        word = "".join(ch for ch in word if ch.isalnum()).lower()
        # A punctuation-only word collapses to "" here; skip it so it does not
        # leave a stray double space inside the cleaned text.
        if word and word not in stop_words:
            kept_words.append(word)
    if i % 1000 == 0:
        print(i)           # To track number of records processed
    news_articles_temp.at[i, "description"] = " ".join(kept_words)

0
1000
2000
3000
4000
5000
6000
7000


In [16]:
# Single lemmatizer instance reused for every description in the loop below.
lemmatizer = WordNetLemmatizer()

In [17]:
# Replace each cleaned description by its tokens lemmatized as verbs (pos="v"),
# joined back together with single spaces.
for i in range(len(news_articles_temp["description"])):
    tokens = word_tokenize(news_articles_temp["description"][i])
    lemmas = [lemmatizer.lemmatize(token, pos="v") for token in tokens]
    news_articles_temp.at[i, "description"] = " ".join(lemmas).strip()
    if i % 1000 == 0:
        print(i)           # To track number of records processed

0
1000
2000
3000
4000
5000
6000
7000


# TF-IDF method

In [18]:
# Build TF-IDF features from the preprocessed descriptions (one row per article).
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the previous integer min_df=0 produced; current scikit-learn
# releases reject an integer min_df below 1 during parameter validation.
tfidf_description_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_description_features = tfidf_description_vectorizer.fit_transform(news_articles_temp['description'])

In [19]:
def tfidf_based_model(row_index, num_similar_items=6):
    """Recommend the articles whose TF-IDF description vectors are closest to a query.

    Reads the notebook-level ``tfidf_description_features`` (one row per
    article) and ``news_articles``; the two are assumed to be row-aligned.
    The original description of the queried article is printed.

    Parameters
    ----------
    row_index : int
        Row position of the query article. Negative positions count from the
        end; an out-of-range position raises ``IndexError``.
    num_similar_items : int, default 6
        Size of the neighbourhood *including* the query article, so at most
        ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        One row per recommended article, nearest first, indexed from 1, with
        columns ``publish_date``, ``description`` and
        ``Euclidean similarity with the queried article``. Despite its name the
        last column holds the value returned by ``pairwise_distances`` — a
        distance, so smaller means more similar.
    """
    n_articles = tfidf_description_features.shape[0]
    # Normalises negative positions and rejects out-of-range ones.
    query_pos = int(np.arange(n_articles)[row_index])

    couple_dist = pairwise_distances(
        tfidf_description_features, tfidf_description_features[query_pos]
    ).ravel()

    # Remove the query explicitly rather than assuming it sorts first: another
    # article at distance 0 (identical preprocessed text) could otherwise take
    # its place, leaving the query itself among the recommendations.
    ranked = np.argsort(couple_dist)
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]

    # Positional access keeps the lookup aligned with the feature-matrix rows.
    recommended = news_articles.iloc[indices]
    df = pd.DataFrame(
        {'publish_date': recommended['date'].values,
         'description': recommended['description'].values,
         'Euclidean similarity with the queried article': couple_dist[indices]},
        index=range(1, len(indices) + 1),
    )
    print('description : ', news_articles['description'].iloc[query_pos])
    return df
# Demo: recommendations for the article at row 2, using the default neighbourhood size.
tfidf_based_model(2)

description :  ″My photo seems to have traveled far. I say it is because of the color of my hair,” she said.


Unnamed: 0,publish_date,description,Euclidean similarity with the queried article
1,2018-01-03,"""Remember me, though I have to travel far, rem...",1.201945
2,2018-02-28,The war is far from over.,1.207912
3,2018-01-05,"Why Trump eats so much McDonald's, and what hi...",1.237358
4,2018-02-17,"“Growing up, I didn’t see any news anchors wit...",1.242886
5,2018-04-23,It's the travel series America needs.,1.248519


# Weighted similarity based on category, publish day and author

In [20]:
from sklearn.preprocessing import OneHotEncoder 

In [21]:
# One-hot encode the article category. The double brackets select a one-column
# frame, i.e. the 2-D input the encoder expects.
category_onehot_encoded = OneHotEncoder().fit_transform(news_articles_temp[["category"]])

In [22]:
# One-hot encode the combined weekday/month label. The double brackets select a
# one-column frame, i.e. the 2-D input the encoder expects.
publishingday_onehot_encoded = OneHotEncoder().fit_transform(news_articles_temp[["day and month"]])

In [23]:
# One-hot encode the authors string. The double brackets select a one-column
# frame, i.e. the 2-D input the encoder expects.
authors_onehot_encoded = OneHotEncoder().fit_transform(news_articles_temp[["authors"]])

In [24]:
def avg_TFIDF_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4):
    """Recommend articles by a weighted mix of text, category, author and publishing-day distances.

    Reads the notebook-level ``tfidf_description_features``,
    ``category_onehot_encoded``, ``authors_onehot_encoded``,
    ``publishingday_onehot_encoded`` and ``news_articles``; all are assumed to
    be row-aligned. The details of the queried article are printed.

    Parameters
    ----------
    row_index : int
        Row position of the query article. Negative positions count from the
        end; an out-of-range position raises ``IndexError``.
    num_similar_items : int
        Size of the neighbourhood *including* the query article, so at most
        ``num_similar_items - 1`` recommendations are returned.
    w1, w2, w3, w4 : float
        Weights of the TF-IDF description distance, the category distance, the
        authors distance and the publishing-day distance, respectively. The
        weighted sum is divided by ``w1 + w2 + w3 + w4``.

    Returns
    -------
    pandas.DataFrame
        One row per recommended article, lowest weighted value first, indexed
        from 1. The "... similarity" columns hold the values returned by
        ``pairwise_distances`` (the three one-hot ones shifted by +1), so
        smaller means more similar. The column labelled "Word2Vec" contains the
        TF-IDF description distance; the label is kept so that existing column
        selections keep working.

    Raises
    ------
    ValueError
        If the weights sum to zero, which would make the weighted average undefined.
    """
    total_weight = float(w1 + w2 + w3 + w4)
    if total_weight == 0:
        raise ValueError("w1 + w2 + w3 + w4 must not be zero")

    n_articles = tfidf_description_features.shape[0]
    # Normalises negative positions and rejects out-of-range ones.
    query_pos = int(np.arange(n_articles)[row_index])

    def _dist_to_query(features):
        # Distance from every row of `features` to the query row, as an (n, 1) array.
        return pairwise_distances(features, features[query_pos])

    tfidf_dist = _dist_to_query(tfidf_description_features)
    category_dist = _dist_to_query(category_onehot_encoded) + 1
    authors_dist = _dist_to_query(authors_onehot_encoded) + 1
    publishingday_dist = _dist_to_query(publishingday_onehot_encoded) + 1
    weighted_couple_dist = (
        w1 * tfidf_dist + w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist
    ) / total_weight

    # Remove the query explicitly rather than assuming it sorts first: a tie on
    # the minimum value could otherwise put another article in its place.
    ranked = np.argsort(weighted_couple_dist.ravel())
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]

    # Positional access keeps the lookup aligned with the feature-matrix rows.
    recommended = news_articles.iloc[indices]
    df = pd.DataFrame(
        {'publish_date': recommended['date'].values,
         'headline_text': recommended['headline'].values,
         'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
         'Word2Vec based Euclidean similarity': tfidf_dist[indices].ravel(),
         'Category based Euclidean similarity': category_dist[indices].ravel(),
         'Authors based Euclidean similarity': authors_dist[indices].ravel(),
         'Publishing day based Euclidean similarity': publishingday_dist[indices].ravel(),
         'Categoty': recommended['category'].values,
         'Authors': recommended['authors'].values,
         'Day and month': recommended['day and month'].values},
        index=range(1, len(indices) + 1),
    )

    queried = news_articles.iloc[query_pos]
    print("="*30,"Queried article details","="*30)
    print('headline : ', queried['headline'])
    print('Categoty : ', queried['category'])
    print('Authors : ', queried['authors'])
    print('Day and month : ', queried['day and month'])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df


# Demo: query the article at row 528 with a neighbourhood of 10 and weights
# 0.5 (TF-IDF description), 0.2 (category), 0.2 (authors), 0.1 (publishing day).
avg_TFIDF_with_category_authors_and_publshing_day(528,10,0.5,0.2,0.2,0.1)


headline :  The Facebook/Cambridge Analytica Scandal, According To My Mom
Categoty :  POLITICS
Authors :  Ashley Feinberg
Day and month :  Thu_Mar



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2018-03-10,"GOP: Actually, There Are Lots Of Women In The ...",1.278436,1.27403,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Sat_Mar
2,2018-04-25,Turning Point USA Keeps Accidentally Hiring Ra...,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Wed_Apr
3,2018-03-05,Bernie Sanders' Son Is Extremely Mad Online,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Mon_Mar
4,2018-01-10,"Don Jr., It’s a Beautiful Day For A Walk Aroun...",1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Wed_Jan
5,2018-01-17,"Chuck Grassley’s Yearlong, One-Sided Twitter C...",1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Wed_Jan
6,2018-02-07,Right-Wing Conspiracists Are Pretending The FB...,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Wed_Feb
7,2018-03-26,Kellyanne Conway's Husband Is Going Rogue,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Mon_Mar
8,2018-05-10,Here Are All 3 FCC Complaints About The '60 Mi...,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Thu_May
9,2018-04-05,Here’s Don Jr. Talking About How It Sucks To B...,1.348528,1.414214,1.0,1.0,2.414214,POLITICS,Ashley Feinberg,Thu_Apr


In [25]:
# (rows, columns) of the article table after all filtering and the added column.
news_articles.shape

(7828, 7)