In [1]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# The imports below are used for text processing with NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# The imports below are used for feature representation with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# The imports below are used to compute pairwise similarities/distances with scikit-learn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [3]:
# Load the raw headlines from the CSV-formatted text file, then save a copy
# under a .csv name. `index = None` is falsy, so the DataFrame's row index is
# not written to the exported file.
source_path = "news_article.txt"
export_path = 'news_article.csv'

news_articles = pd.read_csv(source_path)
news_articles.to_csv(export_path, index = None)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
news_articles.head(n=5)

Unnamed: 0,headline
0,An Education While Incarcerated
1,Is There a Smoking Gun in the January 6th Inve...
2,The Year in Climate
3,Kirsten Dunst’s Feminine Urges
4,Rebelling Against the Word Processor


In [5]:
# Work on an independent (deep) copy so the text clean-up below never alters
# the original headlines kept in `news_articles`.
news_articles_temp = news_articles.copy(deep=True)

In [6]:
stop_words = set(stopwords.words('english'))


def _clean_headline(headline, stop_words):
    """Normalise one headline for vectorisation.

    Each whitespace-separated word is stripped of every non-alphanumeric
    character and lower-cased; words that end up empty (pure punctuation such
    as "-") or that are in ``stop_words`` are dropped.

    Parameters
    ----------
    headline : str
        Raw headline text.
    stop_words : set of str
        Lower-case words to discard.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    kept = []
    for raw_word in headline.split():
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # Skip empty tokens so punctuation-only words do not leave stray
        # double spaces in the cleaned headline.
        if word and word not in stop_words:
            kept.append(word)
    return " ".join(kept)


# Iterate positionally and assign the whole column once, so the clean-up does
# not depend on the DataFrame having a default 0..n-1 index.
cleaned_headlines = []
for i, headline in enumerate(news_articles_temp["headline"]):
    cleaned_headlines.append(_clean_headline(headline, stop_words))
    if i % 1000 == 0:
        print(i)           # To track number of records processed
news_articles_temp["headline"] = cleaned_headlines

0


In [7]:
lemmatizer = WordNetLemmatizer()

# Lemmatise every token of every headline as a verb (pos="v") and rebuild the
# headline with single spaces. Rows are visited positionally and the column is
# assigned once, so the step does not depend on a default 0..n-1 index.
lemmatized_headlines = []
for i, headline in enumerate(news_articles_temp["headline"]):
    lemmas = [lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(headline)]
    lemmatized_headlines.append(" ".join(lemmas))
    if i % 1000 == 0:
        print(i)           # To track number of records processed
news_articles_temp["headline"] = lemmatized_headlines
        

0


In [8]:
# Fit TF-IDF on the cleaned headlines; the result has one row per headline.
# min_df = 1 keeps every term that occurs in at least one headline, which is
# the same vocabulary the previous integer value 0 produced. An integer min_df
# must be >= 1 to pass parameter validation in recent scikit-learn releases.
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp['headline'])

In [9]:
def recommend(new, top_n=3):
    """Return the headlines closest to ``new`` in TF-IDF space.

    Parameters
    ----------
    new : str
        A headline that must match an entry of ``news_articles['headline']``
        exactly; the first match is used.
    top_n : int, default 3
        Number of nearest headlines to return.

    Returns
    -------
    list
        A one-element list holding a pandas Series of the ``top_n`` nearest
        headlines, ordered by increasing distance. The queried headline is at
        distance 0 from itself, so it is normally the first entry.

    Raises
    ------
    IndexError
        If ``new`` is not present in ``news_articles['headline']``.
    """
    # Locate the row by position: the feature matrix is indexed by position,
    # not by DataFrame label.
    matches = np.flatnonzero((news_articles['headline'] == new).to_numpy())
    if matches.size == 0:
        raise IndexError(f"headline not found in news_articles: {new!r}")
    row_pos = int(matches[0])

    # Distance from every headline to the queried one (pairwise_distances'
    # default metric), flattened to one value per row.
    couple_dist = pairwise_distances(tfidf_headline_features, tfidf_headline_features[row_pos])
    indices = np.argsort(couple_dist.ravel())[:top_n]

    # argsort yields positions, so select with .iloc rather than by label.
    recommend_news = [news_articles['headline'].iloc[indices]]
    return recommend_news

In [10]:
# Smoke-test the recommender with a headline known to be in the dataset.
query_headline = "The Year in Climate"
recommend_news = recommend(query_headline)
print(recommend_news)

[2                                  The Year in Climate
0                      An Education While Incarcerated
1    Is There a Smoking Gun in the January 6th Inve...
Name: headline, dtype: object]


In [11]:
import pickle

# Persist the original (uncleaned) articles DataFrame. The `with` block makes
# sure the file is flushed and closed once the dump finishes.
with open('news_articles_list.pkl', 'wb') as articles_file:
    pickle.dump(news_articles, articles_file)

In [12]:
# Persist the TF-IDF feature matrix. Despite the file name, this is the
# feature matrix itself, not a precomputed similarity matrix. The `with` block
# makes sure the file is flushed and closed once the dump finishes.
with open('similarity.pkl', 'wb') as features_file:
    pickle.dump(tfidf_headline_features, features_file)