# Word embeddings for biased outlets





In this script, we:
- train word embeddings for HuffPost and Breitbart
- extract seed biased words from the words close to the words describing contentious topics

To run this script, the following data files are needed:
- left_news_dates.csv (-)
- right_news_dates.csv (-)
- wordsim353.tsv (-)
- men.txt (-)
- questions-words.txt (-)

Saved models:
- huff_simp_bi_trigr10.model (+)
- breitbart10.model (+)

(attached: +, not attached: -)

In [1]:
import os
# for jupyter notebook
# os.chdir('/Users/ladarudnitckaia/Desktop/CSS/Project/queries from ccnc')

# for colaboratory gpu

from google.colab import drive
drive.mount('/content/gdrive')
import sys

os.chdir("/content/gdrive/My Drive/CSS Project")

Mounted at /content/gdrive


In [3]:
# data
import pandas as pd
import numpy as np
import csv

# misc
import ast
import time
import re
from random import sample
import statistics
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE

# nlp
import string
import inflect
infl = inflect.engine()
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('words')
import warnings  
warnings.filterwarnings(action = 'ignore')
import gensim 
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
from gensim.corpora import Dictionary
from gensim.utils import tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.models.phrases import original_scorer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


## 1 HuffPost

### 1.1 Data Exploration

In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Data")
left_news = pd.read_csv("left_news_dates.csv")
huffpost = left_news[left_news.source_domain == "www.huffingtonpost.com"]
del(left_news)
huffpost.head()

Unnamed: 0,date_publish,lang,source_domain,title,text
2384,2018-07-23 13:44:16,en,www.huffingtonpost.com,"Yes, Democrats Need To Run Left - On Economics",Spencer Platt via Getty Images Rising progress...
2385,2017-09-27 22:45:42,en,www.huffingtonpost.com,Trump's Explanation For Removing Sudan From Hi...,"At a press briefing on Wednesday afternoon, U...."
2386,2017-10-02 17:56:46,en,www.huffingtonpost.com,It's Completely Legal To Walk Around Las Vegas...,Sunday’s horrific mass shooting in Las Vegas o...
2387,2017-08-01 14:55:48,en,www.huffingtonpost.com,Dylan Sprouse Is Acting Again For The First Ti...,Dylan Sprouse is returning to screens after ta...
2388,2016-12-30 16:47:06,en,www.huffingtonpost.com,How to Tell if You've Really Found Your Soulmate,How can you be sure the person you're going to...


In [None]:
print('Number of articles:', huffpost.shape[0]) # 101108

Number of articles: 101108


In [None]:
# Select only articles published in or later than 2010
huffpost['year_publish'] = huffpost.apply(lambda row: int(row.date_publish[:4]) if (pd.notnull(row.date_publish)) else row.date_publish, axis=1)

print("The number of articles published in or later than 2010:",
      len(huffpost[huffpost.year_publish >= 2010]), ",",
      round(len(huffpost[huffpost.year_publish >= 2010])/len(huffpost[pd.notnull(huffpost.year_publish)])*100,2),
      "%")
print("The number of articles published earlier than 2010:",
      len(huffpost[huffpost.year_publish < 2010]), ",",
      round(len(huffpost[huffpost.year_publish < 2010])/len(huffpost[pd.notnull(huffpost.year_publish)])*100,2),
      "%")

The number of articles published in or later than 2010: 101101 , 100.0 %
The number of articles published earlier than 2010: 1 , 0.0 %


In [None]:
print("The number of articles with missing publish date:", huffpost.date_publish.isnull().sum())

The number of articles with missing publish date: 6


Articles with unknown publish date are kept due to the scarcity of the data. Basing on the rest of the data, I assume that majority of them were published later than 2010.

In [None]:
huffpost = pd.concat([huffpost[huffpost.year_publish >= 2010],
                        huffpost[huffpost.year_publish.isnull() == True]])

In [None]:
print('Is there missing values?', huffpost.isnull().values.any())
print('Number of missing values:\n', huffpost.isnull().sum())
# Delete the rows with missing article text:
huffpost = huffpost.dropna(subset=['text'])

Is there missing values? True
Number of missing values:
 date_publish      6
lang              0
source_domain     0
title             0
text             10
year_publish      6
dtype: int64


In [None]:
print('Number of articles after filtering:', huffpost.shape[0]) # 101097

Number of articles after filtering: 101097


### 1.2 Pre-processing

In [None]:
huffpost['text_preprocessed'] = huffpost["title"].astype(str) + ". " + huffpost["text"]

In [None]:
# Remove links and nicknames
http_number, www_number, pic_number, at_number = 0, 0, 0, 0
for row in huffpost.text_preprocessed:
  if "http" in row:
    http_number += 1
  if 'www' in row:
    www_number += 1
  if 'pic.' in row:
    pic_number += 1
  if '@' in row:
    at_number += 1

print('The number of articles containing "http" in the text', http_number)
print('The number of articles containing "www" in the text', www_number)
print('The number of articles containing "pic." in the text', pic_number)
print('The number of articles containing "@" in the text', at_number)

The number of articles containing "http" in the text 7075
The number of articles containing "www" in the text 2870
The number of articles containing "pic." in the text 9497
The number of articles containing "@" in the text 18196


In [None]:
huffpost['text_preprocessed'] = huffpost.apply(lambda row: re.sub(r'http\S+', '', row.text_preprocessed), axis=1) # including https
huffpost['text_preprocessed'] = huffpost.apply(lambda row: re.sub(r'www\.\S+', '', row.text_preprocessed), axis=1)
huffpost['text_preprocessed'] = huffpost.apply(lambda row: re.sub(r'pic\.\S+', '', row.text_preprocessed), axis=1)
huffpost['text_preprocessed'] = huffpost.apply(lambda row: re.sub(r'@\S+', '', row.text_preprocessed), axis=1)

In [None]:
start_time = time.time()
huffpost['tokens'] = huffpost.apply(lambda row: gensim.utils.simple_preprocess(row.text_preprocessed, 
                                                                               deacc=False, min_len=1, max_len=28),
                                    axis=1)
end_time = time.time()
print("huffpost gensim simple_preprocess", round((end_time - start_time),2), "seconds")

huffpost gensim simple_preprocess 66.7 seconds


In [None]:
huffpost['words'] = huffpost.apply(lambda row: len(row.tokens), axis=1)

print("Total number of words for training:", sum(huffpost['words']))
print("Average number of words per article:", round(statistics.mean(huffpost['words']),0))

Total number of words for training: 68172239
Average number of words per article: 674.0


### 1.3 Word embeddings

#### 1.3.1 Bigrams generation

In [None]:
# Check the scores of a sample of real bigrams
print("Very frequent:")
print("new_york", original_scorer(bigram_transformer.vocab[b'new'], bigram_transformer.vocab[b'york'], 
                    bigram_transformer.vocab[b'new_york'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("climate_change", original_scorer(bigram_transformer.vocab[b'climate'], bigram_transformer.vocab[b'change'], 
                    bigram_transformer.vocab[b'climate_change'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("left_wing", original_scorer(bigram_transformer.vocab[b'left'], bigram_transformer.vocab[b'wing'], 
                    bigram_transformer.vocab[b'left_wing'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("civil_rights", original_scorer(bigram_transformer.vocab[b'civil'], bigram_transformer.vocab[b'rights'], 
                    bigram_transformer.vocab[b'civil_rights'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("human_rights", original_scorer(bigram_transformer.vocab[b'human'], bigram_transformer.vocab[b'rights'], 
                    bigram_transformer.vocab[b'human_rights'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("sexual_harassment", original_scorer(bigram_transformer.vocab[b'sexual'], bigram_transformer.vocab[b'harassment'], 
                    bigram_transformer.vocab[b'sexual_harassment'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("anti_semitic", original_scorer(bigram_transformer.vocab[b'anti'], bigram_transformer.vocab[b'semitic'], 
                    bigram_transformer.vocab[b'anti_semitic'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("anti_lgbtq", original_scorer(bigram_transformer.vocab[b'anti'], bigram_transformer.vocab[b'lgbtq'], 
                    bigram_transformer.vocab[b'anti_lgbtq'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("Medium frequent:")
print("self_promotion", original_scorer(bigram_transformer.vocab[b'self'], bigram_transformer.vocab[b'promotion'], 
                    bigram_transformer.vocab[b'self_promotion'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("tax_fraud", original_scorer(bigram_transformer.vocab[b'tax'], bigram_transformer.vocab[b'fraud'], 
                    bigram_transformer.vocab[b'tax_fraud'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("bike_share", original_scorer(bigram_transformer.vocab[b'bike'], bigram_transformer.vocab[b'share'], 
                    bigram_transformer.vocab[b'bike_share'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("illegal_aliens", original_scorer(bigram_transformer.vocab[b'illegal'], bigram_transformer.vocab[b'aliens'], 
                    bigram_transformer.vocab[b'illegal_aliens'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("pro_choice", original_scorer(bigram_transformer.vocab[b'pro'], bigram_transformer.vocab[b'choice'], 
                    bigram_transformer.vocab[b'pro_choice'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("Rare:")
print("limited_liability", original_scorer(bigram_transformer.vocab[b'limited'], bigram_transformer.vocab[b'liability'], 
                    bigram_transformer.vocab[b'limited_liability'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("audio_recordings", original_scorer(bigram_transformer.vocab[b'audio'], bigram_transformer.vocab[b'recordings'], 
                    bigram_transformer.vocab[b'audio_recordings'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("auto_pilot", original_scorer(bigram_transformer.vocab[b'auto'], bigram_transformer.vocab[b'pilot'], 
                    bigram_transformer.vocab[b'auto_pilot'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("westminster_abbey", original_scorer(bigram_transformer.vocab[b'westminster'], bigram_transformer.vocab[b'abbey'], 
                    bigram_transformer.vocab[b'westminster_abbey'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

Very frequent:
new_york 154.48516218496692
climate_change 250.0298531202226
left_wing 101.62679355675907
civil_rights 374.46785059152614
human_rights 163.22080595400888
sexual_harassment 532.5095309904502
anti_semitic 1353.1335189491524
anti_lgbtq 102.27561044843367
Medium frequent:
self_promotion 36.65566785642789
tax_fraud 17.75379120420146
bike_share 41.50921181045425
illegal_aliens 662.5123260623196
pro_choice 116.91339719655505
Rare:
limited_liability 53.260706472796244
audio_recordings 268.56713763834347
auto_pilot 91.39903352407279
westminster_abbey 6614.475009278733


In [None]:
# Check the scores of a sample of wrong bigrams
print("Very frequent:")
print("told_huffpost", original_scorer(bigram_transformer.vocab[b'told'], bigram_transformer.vocab[b'huffpost'], 
                    bigram_transformer.vocab[b'told_huffpost'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("last_week", original_scorer(bigram_transformer.vocab[b'last'], bigram_transformer.vocab[b'week'], 
                    bigram_transformer.vocab[b'last_week'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("make_sure", original_scorer(bigram_transformer.vocab[b'make'], bigram_transformer.vocab[b'sure'], 
                    bigram_transformer.vocab[b'make_sure'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("reminds_us", original_scorer(bigram_transformer.vocab[b'reminds'], bigram_transformer.vocab[b'us'], 
                    bigram_transformer.vocab[b'reminds_us'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("every_day", original_scorer(bigram_transformer.vocab[b'every'], bigram_transformer.vocab[b'day'], 
                    bigram_transformer.vocab[b'every_day'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("Medium frequent:")
print("moving_parts", original_scorer(bigram_transformer.vocab[b'moving'], bigram_transformer.vocab[b'parts'], 
                    bigram_transformer.vocab[b'moving_parts'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("former_employer", original_scorer(bigram_transformer.vocab[b'former'], bigram_transformer.vocab[b'employer'], 
                    bigram_transformer.vocab[b'former_employer'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("rising_temperatures", original_scorer(bigram_transformer.vocab[b'rising'], bigram_transformer.vocab[b'temperatures'], 
                    bigram_transformer.vocab[b'rising_temperatures'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("documents_related", original_scorer(bigram_transformer.vocab[b'documents'], bigram_transformer.vocab[b'related'], 
                    bigram_transformer.vocab[b'documents_related'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("Rare:")
print("incorrect_information", original_scorer(bigram_transformer.vocab[b'incorrect'], bigram_transformer.vocab[b'information'], 
                    bigram_transformer.vocab[b'incorrect_information'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("christmas_presents", original_scorer(bigram_transformer.vocab[b'christmas'], bigram_transformer.vocab[b'presents'], 
                    bigram_transformer.vocab[b'christmas_presents'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("primary_objective", original_scorer(bigram_transformer.vocab[b'primary'], bigram_transformer.vocab[b'objective'], 
                    bigram_transformer.vocab[b'primary_objective'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

print("rental_income", original_scorer(bigram_transformer.vocab[b'rental'], bigram_transformer.vocab[b'income'], 
                    bigram_transformer.vocab[b'rental_income'], len(bigram_transformer.vocab), 
                    bigram_transformer.min_count, bigram_transformer.corpus_word_count))

Very frequent:
told_huffpost 146.77794285285
last_week 93.78138720959296
make_sure 79.4013327979774
reminds_us 75.16414435876845
every_day 40.89132798249094
Medium frequent:
moving_parts 23.061485884263444
former_employer 21.37805797530887
rising_temperatures 186.08611863155303
documents_related 34.38382672744311
Rare:
incorrect_information 20.219643476422558
christmas_presents 25.367923561439092
primary_objective 25.254077557914783
rental_income 19.52055580758316


In [None]:
stopwords = stopwords.words("english")
print('I will use', len(stopwords), 'stop words from nltk')
common_terms = frozenset(stopwords)

I will use 179 stop words from nltk


In [None]:
start_time = time.time()
# threshold for forming the phrases - higher means fewer phrases
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value
bigram_transformer = Phrases(huffpost.tokens,
                             min_count = 25, 
                             delimiter = b'_',
                             common_terms = common_terms,
                             scoring = 'default',
                             threshold = 90.0)
end_time = time.time()

print("Bigram transformer (Huffington Post) for", round((end_time - start_time),2), "seconds")

Bigram transformer (Huffington Post) for 81.32 seconds


In [None]:
start_time = time.time()
bigram_transformer_second = Phrases(bigram_transformer[huffpost.tokens],
                             min_count = 25, 
                             delimiter = b'_',
                             common_terms = common_terms,
                             scoring = 'default',
                             threshold = 120.0)
end_time = time.time()

print("Bigram transformer (Huffington Post) for", round((end_time - start_time),2), "seconds")

Bigram transformer (Huffington Post) for 261.58 seconds


#### 1.3.2 Training

In [None]:
start_time = time.time()
huff_simp_bi_trigr10 = Word2Vec(bigram_transformer_second[huffpost['tokens']],
                      size = 300, window = 8, min_count = 25, workers = 4, sample = 0.00001,
                      sg = 1, compute_loss = True, callbacks = (), iter = 10)
end_time = time.time()

os.chdir("/content/gdrive/My Drive/CSS Project/Models")
huff_simp_bi_trigr10.save("huff_simp_bi_trigr10.model")
print("Word2Vec for left news (Huffington Post) for", round((end_time - start_time),2), "seconds")

Word2Vec for left news (Huffington Post) for 3128.95 seconds


#### 1.3.3 Analysis of quality

In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Models")
huff_simp_bi_trigr10 = Word2Vec.load("huff_simp_bi_trigr10.model")

In [None]:
print("Vocabulary length:", len(huff_simp_bi_trigr10.wv.vocab), "words")

Vocabulary length: 53223 words


In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Evaluation data")
wordsim353 = huff_simp_bi_trigr10.evaluate_word_pairs('wordsim353.tsv')
# simverb3500 = huff_simp_bi_trigr10.evaluate_word_pairs('simverb-3500.txt')
men = huff_simp_bi_trigr10.evaluate_word_pairs('men.txt')
# rw = huff_simp_bi_trigr10.evaluate_word_pairs('rw.txt')

print("Test dataset WordSim-353 Pearson:", round(wordsim353[0][0],2), ", Spearman:", round(wordsim353[1][0],2))
# print("Test dataset SimVerb-3500 Pearson:", round(simverb3500[0][0],2), ", Spearman:", round(simverb3500[1][0],2))
print("Test dataset MEN Pearson:", round(men[0][0],2), ", Spearman:", round(men[1][0],2))
# print("Test dataset RW Pearson:", round(rw[0][0],2), ", Spearman:", round(rw[1][0],2))

Test dataset WordSim-353 Pearson: 0.65 , Spearman: 0.68
Test dataset SimVerb-3500 Pearson: 0.22 , Spearman: 0.24
Test dataset MEN Pearson: 0.71 , Spearman: 0.72
Test dataset RW Pearson: 0.47 , Spearman: 0.49


In [None]:
analogy_google = huff_simp_bi_trigr10.wv.evaluate_word_analogies('questions-words.txt')
#analogy_SemEval = huff_simp_bi_trigr10.wv.evaluate_word_analogies('semeval.txt')

print("Test dataset Google:", round(analogy_google[0],2))
#print("Test dataset SemEval-2012:", round(analogy_SemEval[0],2))

Test dataset Google: 0.5


## 2 Breitbart

### 2.1 Data exploration

In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Data")
right_news = pd.read_csv("right_news_dates.csv")
breitbart = right_news[right_news.source_domain == "www.breitbart.com"]
del(right_news)
breitbart.head()

Unnamed: 0,date_publish,lang,source_domain,title,text
0,2019-07-30 11:42:14,en,www.breitbart.com,Boko Haram Kills Up to 70 Mourners at Funeral ...,The death toll from a suspected Boko Haram att...
1,2019-09-08 14:31:38,en,www.breitbart.com,I'll Help Johnson Get 100-Seat Majority in Ele...,Brexit Party leader Nigel Farage has said that...
2,2017-01-12 04:47:38,en,www.breitbart.com,ESPN: Chargers plan to announce move to LA,SIGN UP FOR OUR NEWSLETTER NEW YORK (AP) — The...
3,2019-07-18 10:54:40,en,www.breitbart.com,Iran Claims Revolutionary Guard Seize Foreign ...,Iran’s state TV claims marine elements of the ...
4,2019-07-03 21:35:37,en,www.breitbart.com,MTV to Air Raunchy Nicki Minaj Concert in Saud...,"DUBAI, United Arab Emirates (AP) — Saudi Arabi..."


In [None]:
print("Number of articles:", len(breitbart)) # 81018

Number of articles: 81018


In [None]:
# Select only articles published in or later than 2010
breitbart['year_publish'] = breitbart.apply(lambda row: int(row.date_publish[:4]) if (pd.notnull(row.date_publish)) else row.date_publish, axis=1)

print("The number of articles published in or later than 2010:",
      len(breitbart[breitbart.year_publish >= 2010]), ",",
      round(len(breitbart[breitbart.year_publish >= 2010])/len(breitbart[pd.notnull(breitbart.year_publish)])*100,2),
      "%")
print("The number of articles published earlier than 2010:",
      len(breitbart[breitbart.year_publish < 2010]), ",",
      round(len(breitbart[breitbart.year_publish < 2010])/len(breitbart[pd.notnull(breitbart.year_publish)])*100,2),
      "%")

The number of articles published in or later than 2010: 81016 , 100.0 %
The number of articles published earlier than 2010: 0 , 0.0 %


In [None]:
print("The number of articles with missing publish date:", breitbart.date_publish.isnull().sum())

The number of articles with missing publish date: 2


Articles with unknown publish date are kept due to the scarcity of the data. Basing on the rest of the data, I assume that majority of them were published later than 2010.

In [None]:
print('Is there missing values?', breitbart.isnull().values.any())
print('Number of missing values:\n', breitbart.isnull().sum())
# Delete the rows with missing article text:
breitbart = breitbart.dropna(subset=['text'])

Is there missing values? True
Number of missing values:
 date_publish      2
lang              0
source_domain     0
title             0
text             34
year_publish      2
dtype: int64


In [None]:
print('Number of articles after filtering:', breitbart.shape[0]) # 80984

Number of articles after filtering: 80984


### 2.2 Pre-processing

In [None]:
breitbart['text_preprocessed'] = breitbart["title"].astype(str) + ". " + breitbart["text"]

In [None]:
# Remove links and nicknames
http_number, www_number, pic_number, at_number = 0, 0, 0, 0
for row in breitbart.text_preprocessed:
  if "http" in row:
    http_number += 1
  if 'www' in row:
    www_number += 1
  if 'pic.' in row:
    pic_number += 1
  if '@' in row:
    at_number += 1

print('The number of articles containing "http" in the text', http_number)
print('The number of articles containing "www" in the text', www_number)
print('The number of articles containing "pic." in the text', pic_number)
print('The number of articles containing "@" in the text', at_number)

The number of articles containing "http" in the text 11063
The number of articles containing "www" in the text 333
The number of articles containing "pic." in the text 10125
The number of articles containing "@" in the text 46296


In [None]:
breitbart['text_preprocessed'] = breitbart.apply(lambda row: re.sub(r'http\S+', '', row.text_preprocessed), axis=1) # including https
breitbart['text_preprocessed'] = breitbart.apply(lambda row: re.sub(r'www\.\S+', '', row.text_preprocessed), axis=1)
breitbart['text_preprocessed'] = breitbart.apply(lambda row: re.sub(r'pic\.\S+', '', row.text_preprocessed), axis=1)
breitbart['text_preprocessed'] = breitbart.apply(lambda row: re.sub(r'@\S+', '', row.text_preprocessed), axis=1)

In [None]:
start_time = time.time()
breitbart['tokens'] = breitbart.apply(lambda row: gensim.utils.simple_preprocess(row.text_preprocessed, 
                                                                               deacc=False, min_len=1, max_len=28),
                                    axis=1)
end_time = time.time()
print("breitbart gensim simple_preprocess", round((end_time - start_time),2), "seconds")

breitbart gensim simple_preprocess 48.32 seconds


In [None]:
breitbart['words'] = breitbart.apply(lambda row: len(row.tokens), axis=1)

print("Total number of words for training:", sum(breitbart['words']))
print("Average number of words per article:", round(statistics.mean(breitbart['words']),0))

Total number of words for training: 39348959
Average number of words per article: 486.0


### 2.3 Word embeddings

#### 2.3.1 Bigrams generation

In [None]:
stopwords = stopwords.words("english")
print('I will use', len(stopwords), 'stop words from nltk')
common_terms = frozenset(stopwords)
stopwords

In [None]:
start_time = time.time()
# threshold for forming the phrases - higher means fewer phrases
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value
bigram_transformer = Phrases(breitbart.tokens,
                             min_count = 25, 
                             delimiter = b'_',
                             common_terms = common_terms,
                             scoring = 'default',
                             threshold = 10.0)
end_time = time.time()

print("Bigram transformer (Huffington Post) for", round((end_time - start_time),2), "seconds")

Bigram transformer (Huffington Post) for 57.06 seconds


In [None]:
start_time = time.time()
bigram_transformer_second = Phrases(bigram_transformer[breitbart.tokens],
                             min_count = 25, 
                             delimiter = b'_',
                             common_terms = common_terms,
                             scoring = 'default',
                             threshold = 30.0)
end_time = time.time()

print("Bigram transformer (Huffington Post) for", round((end_time - start_time),2), "seconds")

Bigram transformer (Huffington Post) for 173.76 seconds


#### 2.3.2 Training

In [None]:
start_time = time.time()
breitbart10 = Word2Vec(bigram_transformer_second[breitbart['tokens']],
                      size = 300, window = 8, min_count = 25, workers = 4, sample = 0.00001,
                      sg = 1, compute_loss = True, callbacks = (), iter = 10)
end_time = time.time()

os.chdir("/content/gdrive/My Drive/CSS Project/Models")
breitbart10.save("breitbart10.model")
print("Word2Vec for breitbart for", round((end_time - start_time),2), "seconds")

Word2Vec for breitbart for 2044.3 seconds


#### 2.3.3 Analysis of quality

In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Models")
breitbart10 = Word2Vec.load("breitbart10.model")

In [None]:
print("Vocabulary length:", len(breitbart10.wv.vocab), "words")

Vocabulary length: 44783 words


In [None]:
os.chdir("/content/gdrive/My Drive/CSS Project/Evaluation data")
wordsim353 = breitbart10.evaluate_word_pairs('wordsim353.tsv')
simverb3500 = breitbart10.evaluate_word_pairs('simverb-3500.txt')
men = breitbart10.evaluate_word_pairs('men.txt')
rw = breitbart10.evaluate_word_pairs('rw.txt')

print("Test dataset WordSim-353 Pearson:", round(wordsim353[0][0],2), ", Spearman:", round(wordsim353[1][0],2))
print("Test dataset SimVerb-3500 Pearson:", round(simverb3500[0][0],2), ", Spearman:", round(simverb3500[1][0],2))
print("Test dataset MEN Pearson:", round(men[0][0],2), ", Spearman:", round(men[1][0],2))
print("Test dataset RW Pearson:", round(rw[0][0],2), ", Spearman:", round(rw[1][0],2))

Test dataset WordSim-353 Pearson: 0.58 , Spearman: 0.59
Test dataset SimVerb-3500 Pearson: 0.16 , Spearman: 0.19
Test dataset MEN Pearson: 0.59 , Spearman: 0.59
Test dataset RW Pearson: 0.44 , Spearman: 0.49


In [None]:
analogy_google = breitbart10.wv.evaluate_word_analogies('questions-words.txt')
analogy_SemEval = breitbart10.wv.evaluate_word_analogies('semeval.txt')

print("Test dataset Google:", round(analogy_google[0],2))
print("Test dataset SemEval-2012:", round(analogy_SemEval[0],2))

Test dataset Google: 0.37
Test dataset SemEval-2012: 0.02


In [None]:
incorrect_answers = []

for question in range(len(incorrects)):
  try:
    pos1 = incorrects[question][1].lower()
    pos2 = incorrects[question][2].lower()
    neg = incorrects[question][0].lower()
    answer = breitbart10.wv.most_similar(positive=[pos1,pos2], negative=[neg], topn=1)[0][0]
    incorrect_answers.append((neg,pos1,pos2,answer))
  except:
    continue
    
incorrect_answers

[('athens', 'greece', 'baghdad', 'krg'),
 ('athens', 'greece', 'hanoi', 'hanoi_vietnam'),
 ('athens', 'greece', 'kabul', 'afghan_government'),
 ('athens', 'greece', 'london', 'uk'),
 ('baghdad', 'iraq', 'kabul', 'taliban'),
 ('baghdad', 'iraq', 'london', 'mayor_sadiq'),
 ('beijing', 'china', 'hanoi', 'hanoi_vietnam'),
 ('beijing', 'china', 'helsinki', 'putin_in_helsinki'),
 ('beijing', 'china', 'kabul', 'taliban'),
 ('beijing', 'china', 'london', 'mayor_sadiq'),
 ('berlin', 'germany', 'hanoi', 'south_korea'),
 ('berlin', 'germany', 'kabul', 'afghan_government'),
 ('berlin', 'germany', 'london', 'uk'),
 ('cairo', 'egypt', 'hanoi', 'north_korea'),
 ('cairo', 'egypt', 'helsinki', 'putin_in_helsinki'),
 ('cairo', 'egypt', 'kabul', 'afghan_government'),
 ('cairo', 'egypt', 'london', 'mayor_sadiq'),
 ('canberra', 'australia', 'kabul', 'afghan_government'),
 ('canberra', 'australia', 'london', 'uk'),
 ('hanoi', 'vietnam', 'kabul', 'afghan_government'),
 ('hanoi', 'vietnam', 'london', 'mayor_s

## 3 Extract seed biased words

In [4]:
os.chdir("/content/gdrive/My Drive/CSS Project/Models")
left_model = Word2Vec.load("huff_simp_bi_trigr10.model")
right_model = Word2Vec.load("breitbart10.model")

In [None]:
seed_words = ['regulation', 'regulations', 'involvement', 'control', 'unregulated',
              'government', 'centralization', 'law',
              'tax', 'taxes', 'taxation', 'funding', 'spending',
              'corporation', 'corporations', 'business', 'businesses', 'economy',
              'equality', 'inequality', 'rights', 'equal_rights', 'wealth', 'living_wage', 'welfare', 'welfare_state',
              'services', 'government_services', 'social_security', 'benefit', 'benefits', 'help',
              'student', 'students', 'loan', 'loans', 'student_loan', 'student_loans', 'education', 'healthcare',
              'individual', 'personal_responsibility', 'collective',
              'security', 'military', 'military_force', 'defense', 'intervention', 'protect', 'protection',
              'border', 'border_security', 'migration', 'migrant', 'migrants', 'immigration', 'immigrant', 'immigrants',
              'terror', 'terrorism', 'terrorist', 'terrorists',
              'tradition', 'norms', 'cultural_norms', 'progress', 'change', 'changes',
              'race', 'racism',
              'gender', 'man', 'woman', 'he', 'she',
              'sexual', 'orientation', 'sexual_orientation', 'gay', 'lesbian', 'homosexuality', 'homosexual',
              'identity',
              'religion', 'islam',
              'tolerance', 'multiculturalism', 'family', 'values', 'family_values', 'bible', 'constitution',
              'freedom', 'speech', 'freedom_of_speech', 'free_speech', 'hate_speech', 'gun', 'guns', 'gun_owner', 'gun_owners',
              'abortion', 'environment', 'media']
              
print("The number of seed words:", len(seed_words))

In [None]:
close_to_seed_huffpost = {}

for seed_word in seed_words:
  if seed_word not in left_model.wv.vocab:
    close_to_seed_huffpost[seed_word] = ['not in the vocabulary']
  else:
    close_to_seed_huffpost[seed_word] = left_model.most_similar(positive=seed_word, topn=20)
    
close_to_seed_huffpost_pd = pd.DataFrame(columns = ["seed_word", "close_word", "cos_sim"])

for seed_word in seed_words:
  if seed_word not in left_model.wv.vocab:
    row = pd.DataFrame([[seed_word, 'not in the vocabulary', None]], 
                       columns=["seed_word", "close_word", "cos_sim"])
    close_to_seed_huffpost_pd = close_to_seed_huffpost_pd.append(row)

  else:
    for tuple_num in range(len(left_model.most_similar(positive=seed_word, topn=20))):
      close_word = left_model.most_similar(positive=seed_word, topn=20)[tuple_num][0]
      cos_sim = left_model.most_similar(positive=seed_word, topn=20)[tuple_num][1]
      row = pd.DataFrame([[seed_word, close_word, cos_sim]], 
                         columns=["seed_word", "close_word", "cos_sim"])
      close_to_seed_huffpost_pd = close_to_seed_huffpost_pd.append(row)

In [None]:
# os.chdir("/content/gdrive/My Drive/CSS Project/Lexicon analysis")
# close_to_seed_huffpost_pd.to_csv(r'close_to_seed_huffpost.csv', index = False, header = True)

In [None]:
close_to_seed_breitbart = {}

for seed_word in seed_words:
  if seed_word not in right_model.wv.vocab:
    close_to_seed_breitbart[seed_word] = ['not in the vocabulary']
  else:
    close_to_seed_breitbart[seed_word] = right_model.most_similar(positive=seed_word, topn=20)
    
close_to_seed_breitbart_pd = pd.DataFrame(columns = ["seed_word", "close_word", "cos_sim"])

for seed_word in seed_words:
  if seed_word not in right_model.wv.vocab:
    row = pd.DataFrame([[seed_word, 'not in the vocabulary', None]], 
                       columns=["seed_word", "close_word", "cos_sim"])
    close_to_seed_breitbart_pd = close_to_seed_breitbart_pd.append(row)

  else:
    for tuple_num in range(len(right_model.most_similar(positive=seed_word, topn=20))):
      close_word = right_model.most_similar(positive=seed_word, topn=20)[tuple_num][0]
      cos_sim = right_model.most_similar(positive=seed_word, topn=20)[tuple_num][1]
      row = pd.DataFrame([[seed_word, close_word, cos_sim]], 
                         columns=["seed_word", "close_word", "cos_sim"])
      close_to_seed_breitbart_pd = close_to_seed_breitbart_pd.append(row)

In [None]:
# os.chdir("/content/gdrive/My Drive/CSS Project/Lexicon analysis")
# close_to_seed_breitbart_pd.to_csv(r'close_to_seed_breitbart.csv', index = False, header = True)