In [1]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# Load the five exported article-body tables (real, fake, establishment,
# right-wing, left-wing), in that order, one DataFrame per file.
real_wordvec, fake_wordvec, est_wordvec, right_wordvec, left_wordvec = map(
    pd.read_csv,
    (
        './data/wordvec_data/real_wordvec.csv',
        './data/wordvec_data/fake_wordvec.csv',
        './data/wordvec_data/est_wordvec.csv',
        './data/wordvec_data/right_wordvec.csv',
        './data/wordvec_data/left_wordvec.csv',
    ),
)

In [3]:
# Creating processed corpora for Word2Vec analysis

def _build_corpus(frame, column='0'):
    """Tokenise every entry of ``frame[column]`` with ``simple_preprocess``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding one text per row.
    column : str, optional
        Name of the text column; defaults to ``'0'``, the column every
        corpus in this notebook is built from.

    Returns
    -------
    list
        One ``simple_preprocess`` result per row, in row order.
    """
    # Iterate over the values rather than ``frame[column][i]``: the latter is a
    # label lookup and only works while the index happens to be 0..n-1.
    # Missing entries (NaN) are replaced by '' so a non-string value is never
    # handed to simple_preprocess and the row count is preserved.
    return [simple_preprocess(text) for text in frame[column].fillna('')]


real_corpus = _build_corpus(real_wordvec)
fake_corpus = _build_corpus(fake_wordvec)
est_corpus = _build_corpus(est_wordvec)
right_corpus = _build_corpus(right_wordvec)
left_corpus = _build_corpus(left_wordvec)

In [5]:
# Creating Word2Vec model for 'real news' article bodies

real_W2V_model = Word2Vec(
    sentences=real_corpus,  # tokenised 'real news' article bodies
    size=100,               # dimensionality of each word vector
    window=8,               # amount of context words
    min_count=1,            # words below this count are ignored
    sg=1,                   # 1 selects skip-grams
    workers=4,
)

In [6]:
# Creating Word2Vec model for 'fake news' article bodies

# Same hyper-parameters as the 'real news' model, trained on fake_corpus.
fake_W2V_model = Word2Vec(
    sentences=fake_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [7]:
# Creating Word2Vec model for establishment article bodies

# Same hyper-parameters as the 'real news' model, trained on est_corpus.
est_W2V_model = Word2Vec(
    sentences=est_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [8]:
# Creating Word2Vec model for right-wing article bodies

# Trained on right_corpus (the right-wing article bodies).
right_W2V_model = Word2Vec(
    sentences=right_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [9]:
# Creating Word2Vec model for left-wing article bodies

# Trained on left_corpus (the left-wing article bodies).
left_W2V_model = Word2Vec(
    sentences=left_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [87]:
# Nearest neighbours of 'trump' in the left-wing model, shown as a labelled table.
# Queried through `.wv`: calling most_similar on the model object itself is the
# deprecated path (it was removed in gensim 4).
pd.DataFrame(left_W2V_model.wv.most_similar(positive='trump'), columns=['term', 'cos_sim'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,term,cos_sim
0,donald,0.85385
1,presidency,0.744218
2,groundless,0.743976
3,cred,0.727032
4,backchannel,0.72078
5,collude,0.717873
6,unequivocally,0.717174
7,moderating,0.716822
8,deviated,0.716177
9,thankyoumaddow,0.713455


In [76]:
def get_words(positive):
    """Print the nearest neighbours of ``positive`` in each of the five models.

    The real, fake, establishment, right-wing and left-wing models are queried
    in that order. For each one a heading line is printed, followed by a table
    with columns ``_____`` (neighbouring word) and ``cos_sim``; sections are
    separated by a blank line.

    Parameters
    ----------
    positive : str or list of str
        Word(s) passed to ``most_similar`` as the ``positive`` argument.

    Notes
    -----
    If a model raises ``KeyError`` for the query (the word is missing from its
    vocabulary), a one-line notice is printed for that model and the remaining
    models are still queried.
    """
    # Resolved at call time so a model re-trained in a later cell is picked up.
    sources = (
        ('Real News', real_W2V_model),
        ('Fake News', fake_W2V_model),
        ('Establishment News', est_W2V_model),
        ('Rightwing News', right_W2V_model),
        ('Leftwing News', left_W2V_model),
    )
    for position, (label, model) in enumerate(sources):
        if position:
            print()  # blank line between sections, none after the last
        print(f'{label}: {positive}')
        try:
            # `.wv` is the supported access path; most_similar on the model
            # object itself is deprecated.
            neighbours = model.wv.most_similar(positive=positive)
        except KeyError:
            print(f"{positive!r} is not in this model's vocabulary")
            continue
        print(pd.DataFrame(neighbours, columns=['_____', 'cos_sim']))

In [77]:
# Compare the neighbours of 'clinton' across the five models.
get_words(positive='clinton')

Real News: clinton
          _____   cos_sim
0       hillary  0.961000
1           mrs  0.843157
2  infidelities  0.793667
3        rodham  0.786545
4       sanders  0.769354
5      clintons  0.739765
6       minions  0.736551
7       vouched  0.735214
8         robby  0.727439
9      anzalone  0.727180

Fake News: clinton
         _____   cos_sim
0      hillary  0.954179
1     clintonâ  0.830854
2       rodham  0.808131
3   incestuous  0.801925
4  consigliere  0.790763
5      herself  0.787489
6       flotus  0.779230
7        seedy  0.777108
8    broadened  0.775403
9   contending  0.774095

Establishment News: clinton
             _____   cos_sim
0          hillary  0.946615
1              mrs  0.892097
2          sanders  0.782815
3          crooked  0.769432
4            robby  0.747385
5    indiscretions  0.735414
6  trustworthiness  0.734313
7    untrustworthy  0.723849
8             mook  0.720662
9      excoriating  0.712704

Rightwing News: clinton
            _____   cos_sim

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys
  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


In [66]:
def definition(like, dislike):
    """Print the 'real news' words similar to ``like`` but dissimilar to ``dislike``.

    Parameters
    ----------
    like : str
        Word passed to ``most_similar`` as the single positive term.
    dislike : str
        Word passed to ``most_similar`` as the single negative term.

    The matches from ``real_W2V_model`` are printed as an unlabelled two-column
    table; nothing is returned.
    """
    # `.wv` is the supported access path; most_similar on the model object
    # itself is deprecated.
    neighbours = real_W2V_model.wv.most_similar(positive=[like], negative=[dislike])
    print(pd.DataFrame(neighbours))

In [71]:
# Words close to 'bernie' once 'male' is subtracted, in the 'real news' model.
definition(like='bernie', dislike='male')

               0         1
0        sanders  0.584468
1        vermont  0.468204
2            sen  0.438878
3  berniesanders  0.425464
4        senator  0.424054
5          kaine  0.414550
6     democratic  0.403020
7       opponent  0.402263
8        primary  0.396550
9       delegate  0.396331


  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
def get_analogy(x1, x2, y1, topn=3):
    """Print the best completions of "x1 is to x2 as y1 is to ?".

    Queries ``real_W2V_model`` with ``x2`` and ``y1`` as positive terms and
    ``x1`` as the negative term, then prints the matches as a table with
    columns ``^_____^`` and ``cos_sim``. Nothing is returned.

    Parameters
    ----------
    x1, x2, y1 : str
        The three known words of the analogy.
    topn : int, optional
        Number of matches requested; defaults to 3, the value that used to be
        hard-coded.
    """
    # `.wv` is the supported access path; most_similar on the model object
    # itself is deprecated.
    matches = real_W2V_model.wv.most_similar(positive=[x2, y1],
                                             negative=[x1],
                                             topn=topn)
    print(pd.DataFrame(matches, columns=['^_____^', 'cos_sim']))

In [62]:
# "clinton is to democrat as trump is to ?"
get_analogy(x1='clinton', x2='democrat', y1='trump')

      ^_____^   cos_sim
0      cardin  0.678174
1      cantor  0.668918
2  disavowing  0.666665


  after removing the cwd from sys.path.


In [55]:
# Define analogy function.

def analogy(x1, x2, y1, model=None):
    """Return candidate completions of "x1 is to x2 as y1 is to ?".

    Parameters
    ----------
    x1, x2, y1 : str
        The three known words; ``x2`` and ``y1`` are passed to ``most_similar``
        as positive terms and ``x1`` as the negative term.
    model : optional
        Trained model to query through its ``.wv`` attribute. When omitted,
        ``real_W2V_model`` is used.

    Returns
    -------
    The value returned by ``most_similar`` — in the recorded output of this
    notebook, a list of ``(word, similarity)`` tuples.
    """
    if model is None:
        model = real_W2V_model

    # Find the words y2 closest to $x_2 - x_1 + y_1$ (positive terms are
    # added, the negative term is subtracted). `.wv` is the supported access
    # path; most_similar on the model object itself is deprecated.
    y2 = model.wv.most_similar(positive=[x2, y1],
                               negative=[x1])

    # Return the result.
    return y2

In [64]:
# "trump is to cruz as clinton is to ?" — top three matches, printed as a table.
get_analogy(x1='trump', x2='cruz', y1='clinton')

   ^_____^   cos_sim
0  hillary  0.722974
1  sanders  0.720340
2   bernie  0.708326


  after removing the cwd from sys.path.


In [70]:
# "trump is to cruz as clinton is to ?" — full list of matches from the 'real news' model.
analogy(x1='trump', x2='cruz', y1='clinton')

  import sys


[('bernie', 0.7321078777313232),
 ('sanders', 0.7271015644073486),
 ('ted', 0.7093454599380493),
 ('hillary', 0.6965794563293457),
 ('caucuses', 0.6947556138038635),
 ('vermont', 0.6914641857147217),
 ('iowa', 0.6832163333892822),
 ('kasich', 0.6774814128875732),
 ('toomey', 0.6667734384536743),
 ('superdelegates', 0.6628011465072632)]

In [None]:
# predict_output_word needs the list of context words to predict from; calling it
# with no arguments raises TypeError. The context below is only an example (both
# words appear in the 'real news' neighbour tables above) — swap in the words of interest.
real_W2V_model.predict_output_word(['hillary', 'clinton'], topn=10)

In [73]:
# Creating Word2Vec model for 'fake news' article bodies

# Train on `fake_corpus`, the tokenised corpus built earlier in this notebook.
# The name previously used here, `fake_corpora`, is not defined in any visible
# cell, so this cell raised NameError on a fresh kernel.
fake_W2V_model = Word2Vec(fake_corpus,
                          size=100,
                          window=8,
                          min_count=1,
                          sg=1,
                          workers=4)

In [74]:
# Define analogy function.

def fake_analogy(x1, x2, y1):
    """Return candidate completions of "x1 is to x2 as y1 is to ?" from the 'fake news' model.

    ``x2`` and ``y1`` are passed to ``most_similar`` as positive terms and
    ``x1`` as the negative term; the result of that call on
    ``fake_W2V_model`` is returned unchanged.
    """
    # Find the words y2 closest to $x_2 - x_1 + y_1$. `.wv` is the supported
    # access path; most_similar on the model object itself is deprecated.
    y2 = fake_W2V_model.wv.most_similar(positive=[x2, y1],
                                        negative=[x1])

    # Return the result.
    return y2

In [77]:
# Nearest neighbours of 'antifa' in the 'fake news' model. Queried through `.wv`;
# most_similar on the model object itself is deprecated.
fake_W2V_model.wv.most_similar('antifa')

  """Entry point for launching an IPython kernel.


[('slumbering', 0.9556564092636108),
 ('morphing', 0.8669085502624512),
 ('nay', 0.8609015345573425),
 ('fainting', 0.8596563339233398),
 ('unceremoniously', 0.8568920493125916),
 ('winger', 0.8543405532836914),
 ('misbehave', 0.8541936874389648),
 ('fuselage', 0.8540734052658081),
 ('spotting', 0.8513401746749878),
 ('hammerstrike', 0.851270318031311)]

In [78]:
# Nearest neighbours of 'antifa' in the 'real news' model. Queried through `.wv`;
# most_similar on the model object itself is deprecated.
real_W2V_model.wv.most_similar('antifa')

  """Entry point for launching an IPython kernel.


[('antifascist', 0.8936547040939331),
 ('identitarian', 0.8721088767051697),
 ('inflames', 0.869750440120697),
 ('tribunes', 0.8575892448425293),
 ('effeminate', 0.8562133312225342),
 ('reframing', 0.8506422638893127),
 ('misinterpret', 0.8481342792510986),
 ('consort', 0.8456528186798096),
 ('bootleg', 0.8432168960571289),
 ('accross', 0.8430099487304688)]

In [87]:
# NOTE(review): same call and settings as the establishment training cell earlier
# in this notebook; running both trains the model twice and this fit replaces the
# earlier one. Consider keeping only one of them.
est_W2V_model = Word2Vec(
    sentences=est_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [88]:
# Define analogy function.

def est_analogy(x1, x2, y1):
    """Return candidate completions of "x1 is to x2 as y1 is to ?" from the establishment model.

    ``x2`` and ``y1`` are passed to ``most_similar`` as positive terms and
    ``x1`` as the negative term; the result of that call on
    ``est_W2V_model`` is returned unchanged.
    """
    # Find the words y2 closest to $x_2 - x_1 + y_1$. `.wv` is the supported
    # access path; most_similar on the model object itself is deprecated.
    y2 = est_W2V_model.wv.most_similar(positive=[x2, y1],
                                       negative=[x1])

    # Return the result.
    return y2

In [90]:
# "obama is to biden as trump is to ?" in the 'fake news' model.
fake_analogy(x1='obama', x2='biden', y1='trump')

  import sys


[('donald', 0.6837036609649658),
 ('lewdly', 0.6469045281410217),
 ('groping', 0.6282551288604736),
 ('kellyanne', 0.624221920967102),
 ('pence', 0.622225284576416),
 ('gingrich', 0.6208186745643616),
 ('trumpâ', 0.6175230145454407),
 ('kyei', 0.6087804436683655),
 ('conway', 0.6012117862701416),
 ('groper', 0.5994833707809448)]

In [91]:
# NOTE(review): same call and settings as the right_W2V_model training cell earlier
# in this notebook; running both trains the model twice and this fit replaces the
# earlier one. Consider keeping only one of them.
right_W2V_model = Word2Vec(
    sentences=right_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [92]:
# NOTE(review): same call and settings as the left_W2V_model training cell earlier
# in this notebook; running both trains the model twice and this fit replaces the
# earlier one. Consider keeping only one of them.
left_W2V_model = Word2Vec(
    sentences=left_corpus,
    size=100,
    window=8,
    min_count=1,
    sg=1,
    workers=4,
)

In [93]:
# Define analogy function.

def left_analogy(x1, x2, y1):
    """Return candidate completions of "x1 is to x2 as y1 is to ?" from the left-wing model.

    ``x2`` and ``y1`` are passed to ``most_similar`` as positive terms and
    ``x1`` as the negative term; the result of that call on
    ``left_W2V_model`` is returned unchanged.
    """
    # Find the words y2 closest to $x_2 - x_1 + y_1$. `.wv` is the supported
    # access path; most_similar on the model object itself is deprecated.
    y2 = left_W2V_model.wv.most_similar(positive=[x2, y1],
                                        negative=[x1])

    # Return the result.
    return y2

In [94]:
# "obama is to biden as trump is to ?" in the left-wing model.
left_analogy(x1='obama', x2='biden', y1='trump')

  import sys


[('officeholder', 0.6380375623703003),
 ('oz', 0.6140663623809814),
 ('bolduan', 0.604180634021759),
 ('retweeting', 0.6039628386497498),
 ('referencing', 0.6028333902359009),
 ('donald', 0.5978373289108276),
 ('donaldjtrumpjr', 0.5957256555557251),
 ('erstwhile', 0.5955708622932434),
 ('retweet', 0.5948001146316528),
 ('skewered', 0.5912362337112427)]

In [95]:
# Define analogy function.

def right_analogy(x1, x2, y1):
    """Return candidate completions of "x1 is to x2 as y1 is to ?" from the right-wing model.

    ``x2`` and ``y1`` are passed to ``most_similar`` as positive terms and
    ``x1`` as the negative term; the result of that call on
    ``right_W2V_model`` is returned unchanged.
    """
    # Find the words y2 closest to $x_2 - x_1 + y_1$. `.wv` is the supported
    # access path; most_similar on the model object itself is deprecated.
    y2 = right_W2V_model.wv.most_similar(positive=[x2, y1],
                                         negative=[x1])

    # Return the result.
    return y2

In [96]:
# "obama is to biden as trump is to ?" in the right-wing model.
right_analogy(x1='obama', x2='biden', y1='trump')

  import sys


[('donald', 0.717384397983551),
 ('veep', 0.6595349907875061),
 ('presumptive', 0.6592819094657898),
 ('nominee', 0.6483389139175415),
 ('broadside', 0.6461474895477295),
 ('joe', 0.6460766792297363),
 ('mittromney', 0.6367663145065308),
 ('frontrunner', 0.6346907019615173),
 ('scarborough', 0.6263604164123535),
 ('barletta', 0.6211426258087158)]