In [1]:
import spacy
import pandas as pd

# Import project components
from helpers import get_articles
from esclient import ElasticsearchClient

In [2]:
# Turn off Jedi-based tab completion in IPython's Completer for this session.
%config Completer.use_jedi = False

In [3]:
# Load spaCy's small English pipeline once and keep it in a notebook-level handle.
# NOTE(review): NLP is not referenced by any cell visible in this part of the
# notebook — confirm it is used further down, otherwise this load can go.
NLP = spacy.load("en_core_web_sm")

In [4]:
# Host handed to the project's Elasticsearch client wrapper.
ES_HOST = '10.0.0.35'

print("Starting...")
es = ElasticsearchClient(ES_HOST)  # wrapper imported from the local esclient module
print("Connected")

Starting...
Connected


In [5]:
# Time window for the fetch: 2021-02-18 from 00:00:00 up to 23:59:00,
# expressed as timestamp strings and bundled as a (start, end) tuple.
from_, to_ = '2021-02-18T00:00:00.000', '2021-02-18T23:59:00.000'
timeframe = (from_, to_)

In [7]:
# Call the project helper with the client and the time window, then report
# how many items came back.
print("Fetching articles...")
articles = get_articles(es, timeframe)
n_fetched = len(articles)
print(f'Total articles fetched: {n_fetched}')

Fetching articles...
Total articles fetched: 701


In [8]:
# Tabulate the fetched items; the two positional columns (0 and 1) get
# descriptive names.
column_names = {0: 'datetime', 1: 'article'}
data = pd.DataFrame(articles).rename(columns=column_names)

In [9]:
# Peek at the first rows: a timestamp column plus the raw article dict.
data.head()

Unnamed: 0,datetime,article
0,2021-02-18 14:22:06+00:00,"{'@version': '1', 'bylined_article': 'No', 're..."
1,2021-02-18 02:00:00+00:00,"{'@version': '1', 'bylined_article': 'No', 're..."
2,2021-02-18 00:30:21+00:00,"{'@version': '1', 'bylined_article': 'No', 're..."
3,2021-02-18 13:56:11+00:00,"{'@version': '1', 'bylined_article': 'No', 're..."
4,2021-02-18 12:00:28+00:00,"{'@version': '1', 'bylined_article': 'No', 're..."


In [10]:
# Expand every article dict into one column per key and attach the result
# next to the existing columns.
# NOTE(review): .apply(pd.Series) builds a Series per row and gets slow on large
# frames; pd.json_normalize may be a faster alternative — confirm output parity
# before switching.
article_fields = data['article'].apply(pd.Series)
data = data.join(article_fields)

In [11]:
# The dict column is redundant once its keys have been expanded into columns.
data = data.drop(columns="article")

In [12]:
# Inspect the flattened frame: one column per article field.
data.head()

Unnamed: 0,datetime,@version,bylined_article,references_count,@timestamp,text,polarity,word_count,source_name,subjectivity,...,url,click_baitness,origin,readability,lang,id,favorite_count,reply_count,quote_count,retweet_count
0,2021-02-18 14:22:06+00:00,1,No,6,2021-02-18T14:22:13.419Z,An NHS employee looks over the vaccination bay...,Neutral,853,The Indian Express,Very objective,...,https://indianexpress.com/article/explained/ex...,,twitter,Very confusing,en,1362407046398631937,,,,
1,2021-02-18 02:00:00+00:00,1,No,25,2021-02-18T02:00:19.267Z,As debates rage across the country over whethe...,Neutral,1939,NBC Nightly News with Lester Holt,Neutral,...,https://www.nbcnews.com/news/education/remote-...,,twitter,Difficult,en,1362220291154452482,11.0,0.0,0.0,6.0
2,2021-02-18 00:30:21+00:00,1,No,14,2021-02-18T00:30:32.975Z,"In a recent press conference, Pentagon officia...",Neutral,751,National Interest,Very objective,...,https://nationalinterest.org/blog/skeptics/afg...,,twitter,Fairly difficult,en,1362197730001625088,5.0,0.0,0.0,4.0
3,2021-02-18 13:56:11+00:00,1,No,16,2021-02-18T13:56:18.201Z,"In early March 1992, a foundational U.S. strat...",Neutral,1210,Lawfare,Very objective,...,https://www.lawfareblog.com/indo-pacific-strat...,,twitter,Difficult,en,1362400523329167363,3.0,0.0,0.0,2.0
4,2021-02-18 12:00:28+00:00,1,No,46,2021-02-18T12:00:36.803Z,Double masking has emerged as a crowd-sourced ...,Positive,1003,Calgary Herald,Neutral,...,https://www.healthing.ca/diseases-and-conditio...,,twitter,Difficult,en,1362371402737848327,,1.0,,


In [13]:
# List every column available after flattening.
data.columns

Index(['datetime', '@version', 'bylined_article', 'references_count',
       '@timestamp', 'text', 'polarity', 'word_count', 'source_name',
       'subjectivity', 'title', 'publish_datetime', 'top_image',
       'scientific_count', 'handle', 'references', 'tags', 'authors', 'url',
       'click_baitness', 'origin', 'readability', 'lang', 'id',
       'favorite_count', 'reply_count', 'quote_count', 'retweet_count'],
      dtype='object')

In [14]:
# Count articles per value of the `lang` column.
data["lang"].value_counts()

en    701
Name: lang, dtype: int64

In [15]:
# Shuffle the rows before the positional train/test split so the split does
# not depend on fetch order. The seed is fixed so that the shuffle — and with
# it the split and every downstream output — is the same on each
# "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [16]:
# Look at the first rows again to confirm the order changed after shuffling.
data.head()

Unnamed: 0,datetime,@version,bylined_article,references_count,@timestamp,text,polarity,word_count,source_name,subjectivity,...,url,click_baitness,origin,readability,lang,id,favorite_count,reply_count,quote_count,retweet_count
0,2021-02-18 02:22:01+00:00,1,No,6,2021-02-18T02:22:09.000Z,The winter weather crisis has turned into a wa...,Neutral,827,Dallas Morning News,Very objective,...,https://www.dallasnews.com/news/weather/2021/0...,,twitter,Fairly difficult,en,1362225829716099074,4.0,1.0,0.0,3.0
1,2021-02-18 20:06:55+00:00,1,No,6,2021-02-18T20:07:03.254Z,A flip-flop of Earth’s magnetic poles between ...,Neutral,1128,Science News,Very objective,...,https://www.sciencenews.org/article/earth-magn...,,twitter,Difficult,en,1362493823037890561,85.0,4.0,2.0,31.0
2,2021-02-18 02:45:02+00:00,1,Yes,12,2021-02-18T02:45:15.539Z,A drug used for treating parasites was briefly...,Positive,520,Stuff,Neutral,...,https://www.stuff.co.nz/national/health/corona...,,twitter,Difficult,en,1362231621651296264,,3.0,,
3,2021-02-18 17:45:54+00:00,1,Yes,13,2021-02-18T17:46:00.328Z,"On Thursday afternoon, Perseverance, NASA’s mo...",Neutral,625,WIRED,Very objective,...,https://www.wired.com/story/how-to-watch-nasas...,,twitter,Difficult,en,1362458331210780681,45.0,0.0,2.0,19.0
4,2021-02-18 01:00:05+00:00,1,No,1,2021-02-18T01:01:31.595Z,TORONTO -- A new study suggests that climate c...,Neutral,411,CTV News,Very subjective,...,https://www.ctvnews.ca/climate-and-environment...,,twitter,Very confusing,en,1362205211331350531,21.0,32.0,2.0,6.0


In [17]:
# (rows, columns) of the flattened, shuffled frame.
data.shape

(701, 28)

In [18]:
# Training portion: the first 601 shuffled rows (positional slice).
train = data.iloc[:601]

In [19]:
# Held-out portion: every row from position 601 onward.
test = data.iloc[601:]

In [20]:
# Only the article body is needed from here on; keep it as a one-column frame.
train = train["text"].to_frame()

In [21]:
# Same reduction for the held-out rows: a one-column frame with the text.
test = test["text"].to_frame()

In [22]:
# Renumber the held-out rows from 0 so label-based lookups start at 0.
test = test.reset_index(drop=True)

## Train embeddings

In [23]:
import fasttext

In [24]:
import numpy as np

In [25]:
# Write the training texts to disk so fastText can read the corpus from a file.
# The encoding is pinned to UTF-8 (the encoding fastText reads) instead of being
# left to the platform default: the articles contain non-ASCII punctuation such
# as curly quotes and em dashes.
np.savetxt(r'train.txt', train.values, fmt="%s", encoding="utf-8")

In [26]:
# Train unsupervised fastText word vectors on the dumped corpus; no
# hyper-parameters are overridden, so the library defaults apply.
model = fasttext.train_unsupervised("train.txt")

In [27]:
# Sanity check: word vectors come back as arrays that can be added element-wise.
model.get_word_vector("the") + model.get_word_vector("at")

array([-0.21318385,  0.19906057, -0.15394117, -0.0887067 , -0.0419652 ,
       -0.31530857, -0.28573543,  0.51959133, -0.71608436,  0.43242806,
       -0.50335526, -1.1146153 ,  0.44841817, -0.99372923,  0.15226609,
       -0.3852373 , -0.4807292 ,  0.18805593,  0.54104257,  0.16522384,
       -0.4632836 ,  0.11385529, -0.28500247, -0.16165611, -0.11779454,
        0.14798246,  0.180295  , -0.34493083, -0.5592813 ,  0.4127128 ,
       -0.08449039,  0.12075883, -0.45598993, -0.23989156, -0.0097229 ,
        0.21026285,  0.10734053,  0.10873559,  0.55075884,  0.22544804,
       -0.5258551 ,  0.99412626, -1.1888003 ,  0.7574045 , -0.25568762,
       -0.39299178,  0.9746808 ,  0.13862574, -0.6164155 , -0.0520665 ,
       -0.7151561 , -0.47654772, -0.3627184 , -0.02973231, -0.04797609,
        0.13369161, -0.37809736,  0.37672725,  0.16738409, -0.11890082,
        0.39174968, -0.48931363, -0.3017764 ,  0.38554397,  0.31566843,
       -0.18853906,  0.05569923, -0.01819134, -0.9476843 ,  1.29

In [28]:
# Text of the held-out row labelled 0, kept around as a sample string.
test_str = test.at[0, "text"]

In [29]:
def get_mean_embedding(text, embedder=None):
    """Return the average word vector of a whitespace-tokenised text.

    Parameters
    ----------
    text : str
        Raw text; tokens are obtained with ``str.split()``.
    embedder : optional
        Object exposing ``get_word_vector(word)`` and ``get_dimension()``.
        Defaults to the notebook-level ``model``, which keeps existing
        one-argument calls (e.g. ``Series.apply``) working unchanged.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors. When ``text`` holds no tokens
        (empty or whitespace-only) a zero vector of the embedder's
        dimensionality is returned instead of raising ``ZeroDivisionError``.
    """
    if embedder is None:
        embedder = model

    # Tokenise once; the token count doubles as the divisor for the mean.
    tokens = text.split()
    if not tokens:
        return np.zeros(embedder.get_dimension(), dtype=np.float32)

    # Sequential accumulation, in the same order as a plain running sum.
    total = sum(embedder.get_word_vector(token) for token in tokens)
    return total / len(tokens)

In [30]:
# Compute one mean word vector per held-out article text.
mean_embeddings = test["text"].apply(get_mean_embedding)

In [31]:
# Display the resulting Series (one vector per held-out article).
mean_embeddings

0     [-0.03745339, -0.049993336, -0.20754576, -0.08...
1     [-0.08984386, -0.16195257, -0.19512671, -0.016...
2     [-0.07810761, -0.062172163, -0.19828564, -0.08...
3     [0.056023937, 0.060261212, -0.29016793, 0.0713...
4     [0.023778055, 0.008605363, -0.19391066, -0.062...
                            ...                        
95    [-0.029408874, -0.04367957, -0.14207345, -0.16...
96    [-0.12605351, -0.12779108, -0.29760912, -0.067...
97    [-0.06918773, -0.08350389, -0.18238182, -0.109...
98    [-0.061607808, -0.04224319, -0.21548186, -0.13...
99    [-0.044436228, -0.05202745, -0.15339218, -0.21...
Name: text, Length: 100, dtype: object

In [32]:
# Store each article's mean vector alongside its text.
test = test.assign(embeddings=mean_embeddings)

In [33]:
# Show the held-out frame, now with the text and its embedding side by side.
test

Unnamed: 0,text,embeddings
0,Facebook also announced that it would start re...,"[-0.03745339, -0.049993336, -0.20754576, -0.08..."
1,WASHINGTON — Vice President Kamala Harris stre...,"[-0.08984386, -0.16195257, -0.19512671, -0.016..."
2,A waitress in New York City was fired from her...,"[-0.07810761, -0.062172163, -0.19828564, -0.08..."
3,Life expectancy in the US dropped a full year ...,"[0.056023937, 0.060261212, -0.29016793, 0.0713..."
4,This year’s flu season has been historically m...,"[0.023778055, 0.008605363, -0.19391066, -0.062..."
...,...,...
95,"There's no silver bullet to living longer, say...","[-0.029408874, -0.04367957, -0.14207345, -0.16..."
96,Here's what to know about the mission and how ...,"[-0.12605351, -0.12779108, -0.29760912, -0.067..."
97,\n• The CDC last week released its guidance on...,"[-0.06918773, -0.08350389, -0.18238182, -0.109..."
98,A restaurant in Florida went viral after shari...,"[-0.061607808, -0.04224319, -0.21548186, -0.13..."


In [34]:
def cosine_similarity(x, y):
    """Cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned rather than dividing by zero
        (which would produce NaN together with a RuntimeWarning).
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

In [35]:
# Score every ordered pair of held-out articles (self-pairs and both orderings
# included), producing (text_a, text_b, cosine similarity) tuples in row order.
# Columns are read by name: looking up row[0] / row[1] on a row Series relies on
# pandas' deprecated positional fallback for integer keys, and nested iterrows()
# rebuilds a Series for every row on every pass.
text_embedding_pairs = list(zip(test["text"], test["embeddings"]))
similarities = [
    (text_a, text_b, cosine_similarity(emb_a, emb_b))
    for text_a, emb_a in text_embedding_pairs
    for text_b, emb_b in text_embedding_pairs
]

In [36]:
# Preview the first ten (text, text, similarity) tuples.
similarities[:10]

[("Facebook also announced that it would start removing posts that contain false claims about vaccines. It had previously made it more difficult to see vaccine misinformation by “downranking” it, which makes it less visible in news feeds.\n\nAmid the years-long spat between the tech giants, Facebook CEO Mark Zuckerberg in 2018 didn’t mince words when talking about rival Apple.\n\n“We need to inflict pain,” he told his team, pointing out how terribly Apple was treating the company, according to people who were familiar with the exchange.\n\nThe Wall Street Journal added that particular squabble involved Apple’s “glib” comments regarding the highly sensitive topic of the company’s data collection practices.\n\nToday, the most recent feud has branched out to privacy notifications that Apple plans to include in the iOS 14 operating system for iPhones. Those notifications prompt users to give permissions to apps that want to track their activity—a change that Facebook fears would hurt its a

In [37]:
# Location of the AllSides CSV, relative to the notebook's working directory.
ALLSIDES_CSV = "./archive/allsides.csv"
allsides = pd.read_csv(ALLSIDES_CSV)

In [38]:
# Display the loaded AllSides table.
allsides

Unnamed: 0,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree,name,total_votes
0,21052,1.680530,Agrees,https://www.allsides.com/news-source/abc-news-...,left-center,12527,ABC News (Online),33579
1,80,2.857143,Strongly Agrees,https://www.allsides.com/news-source/above-law...,left-center,28,Above The Law,108
2,223,1.581560,Agrees,https://www.allsides.com/news-source/abridge-n...,allsides,141,Abridge News,364
3,263,2.481132,Strongly Agrees,https://www.allsides.com/news-source/aj-media-...,left,106,AJ+,369
4,4217,0.631382,Disagrees,https://www.allsides.com/news-source/al-jazeer...,left-center,6679,Al Jazeera,10896
...,...,...,...,...,...,...,...,...
295,268,1.212670,Somewhat Agrees,https://www.allsides.com/news-source/independe...,left-center,221,The Independent,489
296,2711,2.755081,Strongly Agrees,https://www.allsides.com/news-source/intercept,left,984,The Intercept,3695
297,195,1.010363,Somewhat Agrees,https://www.allsides.com/news-source/jerusalem...,center,193,The Jerusalem Post,388
298,2,2.000000,Agrees,https://www.allsides.com/news-source/juggernau...,left,1,The Juggernaut,2
