In [1]:
# Syntactic Similarity
# sila Nov. 19 2022
# 
# Based on >>Blueprints for Text Analysis Using Python<<
# Jens Albrecht, Sidharth Ramachandran, Christian Winkler
# Chapter 5

In [2]:
# Simple count in a vector

In [3]:
import pandas as pd

# Four short example sentences; note that only the first starts with a capital "It".
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

# Plain whitespace tokenization, no lower-casing: "It" and "it" remain distinct tokens.
tokenized_sentences = [sentence.split() for sentence in sentences]

# Unique tokens over all sentences. Because this is a set, its iteration order
# (and so the position assigned to each word below) is not guaranteed to be
# the same after a kernel restart.
vocabulary = {token for tokens in tokenized_sentences for token in tokens}

# Show every word together with its position in the current iteration order.
[[word, position] for position, word in enumerate(vocabulary)]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


[['best', 0],
 ['It', 1],
 ['times', 2],
 ['it', 3],
 ['wisdom', 4],
 ['was', 5],
 ['the', 6],
 ['foolishness', 7],
 ['of', 8],
 ['age', 9],
 ['worst', 10]]

In [4]:
# One hot by hand

In [5]:
def onehot_encode(tokenized_sentence, vocab=None):
    """Return a binary presence vector for a tokenized sentence.

    Args:
        tokenized_sentence: iterable of hashable tokens, e.g. the result of
            ``str.split()``.
        vocab: iterable of words that defines the vector positions. When
            omitted, the notebook-level ``vocabulary`` is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        A list with one int per word of ``vocab``, in ``vocab``'s iteration
        order: 1 if the word occurs in the sentence, otherwise 0. Repeated
        tokens still yield 1, and tokens missing from ``vocab`` are ignored.
    """
    if vocab is None:
        # Looked up at call time so the function follows later re-definitions
        # of the notebook-level vocabulary.
        vocab = vocabulary
    # Build the lookup once instead of scanning the token list per vocabulary word.
    present = set(tokenized_sentence)
    return [1 if w in present else 0 for w in vocab]

# Encode every example sentence, then print one tokenized sentence, the
# vocabulary and that sentence's vector so the positions can be checked by eye.
onehot = list(map(onehot_encode, tokenized_sentences))
tokens = "It was the best of times".split()
print(tokens, vocabulary, onehot_encode(tokens), sep="\n")

['It', 'was', 'the', 'best', 'of', 'times']
{'best', 'It', 'times', 'it', 'wisdom', 'was', 'the', 'foolishness', 'of', 'age', 'worst'}
[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0]


In [6]:
# Print each one-hot vector followed by the sentence it encodes.
for sentence, oh in zip(sentences, onehot):
    print(f"{oh}: {sentence}")

[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0]: It was the best of times
[0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1]: it was the worst of times
[0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]: it was the age of wisdom
[0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0]: it was the age of foolishness


In [7]:
# Compare two vectors: with 0/1 entries the dot product counts the
# vocabulary words that occur in both sentences.
import numpy as np
first_vec, second_vec = onehot[0], onehot[1]
np.dot(first_vec, second_vec)

4

In [8]:
# Dot product of every sentence vector with the vector of the second sentence.
onehot_matrix = np.array(onehot)
onehot_matrix.dot(onehot[1])

array([4, 6, 4, 4])

In [9]:
# As expected vector 1 is most similar to itself

In [10]:
# Vector for a sentence with an out-of-vocabulary word ("is"):
# such a word has no position in the vocabulary and leaves no trace in the vector.
unseen_tokens = "the age of wisdom is the best of times".split()
onehot_encode(unseen_tokens)

[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; it is fitted in a later cell.
cv = CountVectorizer()

In [12]:
# Extend the example corpus with two sentences that share no words with the first four.
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,It was the best of times
1,it was the worst of times
2,it was the age of wisdom
3,it was the age of foolishness
4,John likes to watch movies. Mary likes movies ...
5,Mary also likes to watch football games.


In [13]:
# Learn the vocabulary from the extended corpus (nothing is transformed yet).
cv.fit(more_sentences)

In [14]:
# The learned vocabulary; these names are used as column labels further down.
print(cv.get_feature_names_out())

['age' 'also' 'best' 'foolishness' 'football' 'games' 'it' 'john' 'likes'
 'mary' 'movies' 'of' 'the' 'times' 'to' 'too' 'was' 'watch' 'wisdom'
 'worst']


In [15]:
# Turn every sentence into its vector of word counts over the fitted vocabulary.
dt = cv.transform(more_sentences)

In [16]:
# Dense, labelled view of the count matrix (one row per sentence, one column per word).
feature_names = cv.get_feature_names_out()
pd.DataFrame(dt.toarray(), columns=feature_names)

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the count vectors of sentences 0 and 1.
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [18]:
# Same comparison for sentences 2 and 3 ("... age of wisdom" / "... age of foolishness").
cosine_similarity(dt[2], dt[3])

array([[0.83333333]])

In [19]:
# Sentences 1 and 3 have fewer words in common, so the score is lower.
cosine_similarity(dt[1], dt[3])

array([[0.66666667]])

In [20]:
# Using TF/IDF

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts in `dt` with TF-IDF (transformer used with default settings).
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [22]:
# TF-IDF weights as a labelled table: sentences in rows, vocabulary words in columns.
tfidf_columns = cv.get_feature_names_out()
pd.DataFrame(tfidf_dt.toarray(), columns=tfidf_columns)

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [23]:
# Similarity of every sentence with every other sentence, based on the TF-IDF vectors.
pairwise_sim = cosine_similarity(tfidf_dt, tfidf_dt)
pd.DataFrame(pairwise_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


In [24]:
# The remaining cells require the file abcnews-date-text.csv:
# continue once it has been uploaded to Colab (or is available on the local machine)

In [25]:
# Load the headlines. The path is relative, so the CSV must be in the current
# working directory. `publish_date` is parsed as a date column while reading.
headlines = pd.read_csv('abcnews-date-text.csv', parse_dates=["publish_date"])
headlines.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [26]:
# Last rows of the frame: shows the final index value and the most recent dates.
headlines.tail()

Unnamed: 0,publish_date,headline_text
1082163,2017-06-30,when is it ok to compliment a womans smile a g...
1082164,2017-06-30,white house defends trumps tweet
1082165,2017-06-30,winter closes in on tasmania as snow ice falls
1082166,2017-06-30,womens world cup australia wins despite atapat...
1082167,2017-06-30,youtube stunt death foreshadowed by tweet


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: `tfidf` and `dt` are rebound here. From this cell on they refer to the
# headline vectorizer and the headline TF-IDF matrix, no longer to the
# TfidfTransformer / count matrix of the six example sentences.
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [28]:
# Vocabulary learned from the headlines (the printed array is abbreviated).
print(tfidf.get_feature_names_out())

['000' '000app' '002' ... 'zyngier' 'zz' 'zzz']


In [29]:
# (number of headlines, number of vocabulary entries)
print(dt.shape)

(1082168, 95999)


In [30]:
# Bytes used by the stored values array (`dt.data`) only.
# NOTE(review): presumably `dt` is a SciPy sparse matrix whose index arrays
# take additional memory not counted here — confirm.
print(dt.data.nbytes)

54644968


In [31]:
%%time
# Pairwise similarities restricted to the first 10,000 headlines;
# the result is a 10,000 x 10,000 array.
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: total: 1.05 s
Wall time: 1.42 s


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16871665,
        0.16767302],
       [0.        , 0.        , 0.        , ..., 0.16871665, 1.        ,
        0.33175557],
       [0.        , 0.        , 0.        , ..., 0.16767302, 0.33175557,
        1.        ]])

In [32]:
# Finding document most similar to made-up document

In [33]:
# NOTE(review): this repeats the exact fit_transform call from cell In [27] on the
# same vectorizer and the same column, so it re-does the full fit over all
# headlines. It is only needed if that earlier cell has not been run.
dt = tfidf.fit_transform(headlines["headline_text"])

In [34]:
# Vectorize the query with the already fitted vectorizer (transform only, no
# re-fit), so it is expressed in the same vocabulary as the headlines.
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

In [35]:
# Similarity of the single query vector to every headline; the scores are in sim[0].
sim = cosine_similarity(made_up, dt)

In [36]:
# One score per headline (abbreviated when printed); most entries are 0.
print(sim[0])

[0.         0.         0.         ... 0.         0.05526755 0.        ]


In [37]:
# The five headlines most similar to the made-up document, best match first.
ranking = np.argsort(sim[0])[::-1]  # argsort is ascending; reversed -> highest score first
top_rows = headlines.iloc[ranking[0:5]]
top_rows[["publish_date", "headline_text"]]

Unnamed: 0,publish_date,headline_text
633411,2011-08-17,new zealand apple imports
633410,2011-08-17,new zealand apple import
633412,2011-08-17,new zealand apple industry hurting
299505,2007-04-21,highlights australia v new zealand
299543,2007-04-21,podcast australia v new zealand


In [38]:
# Removing stop words and the top 10,000 words (from the Google index), working on lemmas only,
# etc. will probably improve performance.
# See chapter 5 in the Blueprints book for details.

In [39]:
# Another example of finding syntactic similarity with this code:

In [40]:
# Second query, again vectorized with the fitted headline vectorizer.
new_made_up = tfidf.transform(["Trump tweet from White house"])

In [41]:
# Overwrites `sim` with the scores of the second query against every headline.
sim = cosine_similarity(new_made_up, dt)

In [42]:
# The ten headlines most similar to the second query, best match first.
ranking = np.argsort(sim[0])[::-1]  # argsort is ascending; reversed -> highest score first
top_rows = headlines.iloc[ranking[0:10]]
top_rows[["publish_date", "headline_text"]]

Unnamed: 0,publish_date,headline_text
1082164,2017-06-30,white house defends trumps tweet
712816,2012-08-01,to tweet or not to tweet
703365,2012-06-19,tweet tweet tweeting
1052759,2016-11-11,obama and trump meet at the white house
960635,2015-06-17,donald trump announces run for the white house
960915,2015-06-18,donald trump joins race for the white house
1069653,2017-03-18,trump and merkel meet at white house
1062978,2017-01-30,donald trump white house defends us immigratio...
1055951,2016-12-03,trump invites duterte to the white house
1051677,2016-11-05,trump clinton clash over economy in white hous...
