In [1]:
import numpy as np
import pandas as pd
import os
import math
import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/adon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/adon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/adon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
news_articles = pd.read_json("News_Category_Dataset_v2.json", lines = True)

ValueError: Unexpected character found when decoding 'NaN'

In [4]:
news_articles.rename(columns={'short_description': 'description'}, inplace=True)

NameError: name 'news_articles' is not defined

In [None]:
news_articles.info()

In [None]:
news_articles.head()

In [None]:
news_articles = news_articles[news_articles['date'] >= pd.Timestamp(2018,1,1)]

In [None]:
news_articles.shape

In [None]:
news_articles = news_articles[news_articles['description'].apply(lambda x: len(x.split())>5)]
print("Total number of articles after removal of description with short title:", news_articles.shape[0])

In [None]:
news_articles.sort_values('description',inplace=True, ascending=False)
duplicated_articles_series = news_articles.duplicated('description', keep = False)
news_articles = news_articles[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", news_articles.shape[0])

In [None]:
news_articles.index = range(news_articles.shape[0])

In [None]:
# Adding a new column containing both day of the week and month, it will be required later while recommending based on day of the week and month
news_articles["day and month"] = news_articles["date"].dt.strftime("%a") + "_" + news_articles["date"].dt.strftime("%b")

In [None]:
news_articles_temp = news_articles.copy()

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
for i in range(len(news_articles_temp["description"])):
    string = ""
    for word in news_articles_temp["description"][i].split():
        #print(word)
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
          string += word + " "  
    if(i%1000==0):
      print(i)           # To track number of records processed
    news_articles_temp.at[i,"description"] = string.strip()

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
for i in range(len(news_articles_temp["description"])):
    string = ""
    for w in word_tokenize(news_articles_temp["description"][i]):
        string += lemmatizer.lemmatize(w,pos = "v") + " "
    news_articles_temp.at[i, "description"] = string.strip()
    if(i%1000==0):
        print(i)           # To track number of records processed

# TF-IDF method

In [None]:
tfidf_description_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_description_features = tfidf_description_vectorizer.fit_transform(news_articles_temp['description'])

In [None]:
def tfidf_based_model(row_index, num_similar_items=6):
    couple_dist = pairwise_distances(tfidf_description_features,tfidf_description_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'description':news_articles['description'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    #print("="*30,"Queried article details","="*30)
    print('description : ',news_articles['description'][indices[0]])
    #print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]
tfidf_based_model(2)

# Weighted similarity based on category, publish day and author

In [None]:
from sklearn.preprocessing import OneHotEncoder 

In [None]:
category_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["category"]).reshape(-1,1))

In [5]:
publishingday_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["day and month"]).reshape(-1,1))

NameError: name 'OneHotEncoder' is not defined

In [None]:
authors_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["authors"]).reshape(-1,1))

In [None]:
def avg_TFIDF_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4): #headline_preference = True, category_preference = False):
    w2v_dist  = pairwise_distances(tfidf_description_features,tfidf_description_features[row_index].reshape(1,-1))
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]) + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/float(w1 + w2 + w3 + w4)
    indices = np.argsort(weighted_couple_dist.flatten())[0:num_similar_items].tolist()
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
                'headline_text':news_articles['headline'][indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),   
                'Publishing day based Euclidean similarity': publishingday_dist[indices].ravel(), 
                'Categoty': news_articles['category'][indices].values,
                'Authors': news_articles['authors'][indices].values,
                'Day and month': news_articles['day and month'][indices].values})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'][indices[0]])
    print('Categoty : ', news_articles['category'][indices[0]])
    print('Authors : ', news_articles['authors'][indices[0]])
    print('Day and month : ', news_articles['day and month'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,[1,7,8,9]]
    return df.iloc[1:, ]


avg_TFIDF_with_category_authors_and_publshing_day(528,10,0.5,0.2,0.2,0.1)


In [None]:
news_articles.shape