In [1]:
# Recommendation engine using spaCy and cosine_difference
##
## initially trained on https://www.kaggle.com/datasets/snapcrack/all-the-news
## with fields: ,id,title,publication,author,date,year,month,url,content
## 
### using title only as a baseline model

In [2]:
import spacy
import pandas as pd
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [3]:
articles = pd.read_csv("../articles1.csv" )

In [4]:
# start small...
#articles = articles[:500]

articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   50000 non-null  int64  
 1   id           50000 non-null  int64  
 2   title        50000 non-null  object 
 3   publication  50000 non-null  object 
 4   author       43694 non-null  object 
 5   date         50000 non-null  object 
 6   year         50000 non-null  float64
 7   month        50000 non-null  float64
 8   url          0 non-null      float64
 9   content      50000 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 3.8+ MB


In [5]:
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
# remove duplicates
articles.sort_values( 'title', inplace=True, ascending=False )
duplicated_articles = articles.duplicated( 'title', keep=False )
news_articles = articles[~duplicated_articles]
news_articles.shape[0]

49868

In [7]:
news_articles.isna().sum()

Unnamed: 0         0
id                 0
title              0
publication        0
author          6232
date               0
year               0
month              0
url            49868
content            0
dtype: int64

In [8]:
# remove stopwords

all_stopwords = nlp.Defaults.stop_words

def remove_stopwords(title):
    text_tokens = nlp( title )
    tokens_without_sw = [word.text for word in text_tokens if not word in all_stopwords]
    return( ' '.join( tokens_without_sw ))

fn = lambda row: remove_stopwords( row.title )
col = news_articles.apply( fn, axis=1 )
news_articles = news_articles.assign( title_clean=col.values )

In [9]:
#add an index
news_articles.insert( 0, 'index', range( 0, len( news_articles ) ))

In [10]:
# let's try CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity


In [12]:
cv = CountVectorizer()

In [13]:
features = cv.fit_transform( news_articles.title_clean )
cosine_sim = cosine_similarity( features )
features
#cosine_sim

<49868x27333 sparse matrix of type '<class 'numpy.int64'>'
	with 544955 stored elements in Compressed Sparse Row format>

In [14]:
# inspired from https://medium.com/@sumanadhikari/building-a-movie-recommendation-engine-using-scikit-learn-8dbb11c5aa4b
def get_title_from_index(index):
    return news_articles[news_articles.index == index]["title_clean"].values[0]

def get_index_from_title(title):
    return news_articles[news_articles.title_clean == title]["index"].values[0]


In [15]:
offset = 50
counter = 5

for i in range(counter):
    print( i+offset )
    print( get_title_from_index(i+offset) )
    index = get_index_from_title( news_articles.iloc[i+offset].title_clean )
    similar = list( enumerate( cosine_sim[index] ))
    sorted_similar = sorted( similar, key=lambda x:x[1], reverse=True )
    print( "Similar to:" )
    print( sorted_similar[1] )
    print( get_title_from_index( sorted_similar[1][0] ))
    print( sorted_similar[2] )
    print( get_title_from_index( sorted_similar[2][0] ))
    print( " " )
    

50
President Xi ’s Great Chinese Soccer Dream - The New York Times
Similar to:
(21523, 0.38805700005813276)
Richard Armitage Plans to Vote for Hillary Clinton - Breitbart
(16988, 0.3741657386773942)
13 Shot at House Party in Gun - Controlled Connecticut - Breitbart
 
51
Tech Giants Seem Invincible . That Worries Lawmakers . - The New York Times
Similar to:
(42870, 0.40422604172722154)
Solar power boom : Super - charge it ( Opinion )
(2464, 0.38348249442368526)
Photos of Jupiter From NASA Spacecraft , Both Near and Far - The New York Times
 
52
Chinese City Official Shoots 2 Others and Kills Himself , State Media Says - The New York Times
Similar to:
(7208, 0.8660254037844388)
Hezbollah Says Military Leader Died in Artillery Attack - The New York Times
(27128, 0.4564354645876385)
Clift : Cruz ’ Did n’t Bow ’ To ’ Ethanol Gods ’ ’ Because He ’s in the Pocket of Big Oil ’ - Breitbart
 
53
Drop in Gang Violence Drove New York City Shootings Below 1,000 in 2016 - The New York Times
Similar 