In [1]:
# Recommendation engine using spaCy
##
## initially trained on https://www.kaggle.com/datasets/snapcrack/all-the-news
## with fields: ,id,title,publication,author,date,year,month,url,content
## 
### using title only as a baseline model

In [2]:
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_sm')

In [3]:
articles = pd.read_csv("../articles1.csv")

In [4]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   50000 non-null  int64  
 1   id           50000 non-null  int64  
 2   title        50000 non-null  object 
 3   publication  50000 non-null  object 
 4   author       43694 non-null  object 
 5   date         50000 non-null  object 
 6   year         50000 non-null  float64
 7   month        50000 non-null  float64
 8   url          0 non-null      float64
 9   content      50000 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 3.8+ MB


In [5]:
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
# remove headlines with short title
articles = articles[articles['title'].apply( lambda x: len( x.split() ) > 4 )]
articles.shape[0]

49417

In [7]:
# remove duplicates
articles.sort_values( 'title', inplace=True, ascending=False )
duplicated_articles = articles.duplicated( 'title', keep=False )
news_articles = articles[~duplicated_articles]
news_articles.shape[0]

49295

In [8]:
news_articles.isna().sum()

Unnamed: 0         0
id                 0
title              0
publication        0
author          5862
date               0
year               0
month              0
url            49295
content            0
dtype: int64

In [9]:
# now taking inspiration from https://www.kaggle.com/code/zackakil/done-nlp-using-word-vectors-with-spacy-cldspn/notebook


In [10]:
all_stopwords = nlp.Defaults.stop_words
i = 1

def remove_stopwords(title):
    text_tokens = nlp( title )
    tokens_without_sw = [word.text for word in text_tokens if not word in all_stopwords]
    return( ' '.join( tokens_without_sw ))

def get_word_vectors(words):
    global i
    i = i + 1
    if i % 100 == 0:
        print( i )
    # converts a list of words into their word vectors
    return [nlp(word).vector for word in words]

In [11]:
# start small...
news_articles = news_articles[:1000]

In [12]:
# remove stopwords
news_articles["word_vectors"] = news_articles["title"].apply( lambda x: get_word_vectors( remove_stopwords( x )) )

100
200
300
400
500
600
700
800
900
1000


In [13]:
# to reduce to 2 dimensions
from sklearn.decomposition import PCA

pca = PCA( n_components=2 )

def reducer(word_vectors):
    pca.fit( word_vectors )
    word_vecs_2d = pca.transform( word_vectors )
    return( word_vecs_2d )


In [14]:
# reduce to 2 dimensions
news_articles["word_vectors_2d"] = news_articles["word_vectors"].apply( lambda x: reducer( x ) )

In [15]:
news_articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,word_vectors,word_vectors_2d
30936,30944,49709,“It’s the Bataan Death March”-Democratic White...,Breitbart,Breitbart News,2016-05-21,2016.0,5.0,,advertisement,"[[1.0420169, 0.12709321, 0.15162945, 0.2223444...","[[-0.888271043487767, -0.9503344111128138], [7..."
39636,41147,60019,’’Santas’ march against South Korea’s impeache...,CNN,,2016-12-24,2016.0,12.0,,"(CNN) Thousands of protesters, including some...","[[-1.0106193, 0.18087775, -0.03555146, 0.26973...","[[2.6020068553501243, 8.441647894107245], [2.6..."
42781,45425,64400,’Zombieland’ blows up in Detroit,CNN,,2015-07-11,2015.0,7.0,,(CNN)”Zombieland” blew up. Detroit’s Park Ave...,"[[-1.0106193, 0.18087775, -0.03555146, 0.26973...","[[1.7579184550029372, 9.411330182624862], [6.9..."
40246,41971,60913,’Zombie cat’ presumed dead found alive after b...,CNN,Stephanie Gallman,2015-01-28,2015.0,1.0,,(CNN) It’s like a plot line ripped from the s...,"[[-1.0106193, 0.18087775, -0.03555146, 0.26973...","[[1.6676626415800517, -2.4144524585043], [7.33..."
31249,31257,50022,’Zombie Mike’ Sought for Allegedly Raping a Wh...,Breitbart,Warner Todd Huston,2016-04-08,2016.0,4.0,,"A manhunt is under way for Michael Hawkins, kn...","[[-1.0106193, 0.18087775, -0.03555146, 0.26973...","[[1.0188343027372402, 2.006809156961334], [7.1..."


In [None]:
# ok let's save for safe keeping
# ... just getting started

news_articles.to_excel( 'article_2d_vectors.xlsx' )