# Define Environment

In [1]:
import os
# os.chdir('C:/Users/Mike/Desktop/word_embeddings-master') # windows dir
os.chdir('/home/mike/Desktop/Word Embeddings') # Linux dir
import pandas as pd
import pickle
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import random
from functools import reduce

os.chdir('./Text Processing')
from TextPrep import TextPrep

os.chdir('../Meta Data')
from key_words import key_words, key_synonyms, base_words, base_synonyms, agree_words, agree_synonyms
from stop_words import stop_words
os.chdir('..')

In [2]:
# define functions for analyzing the word vectors
# returns the cosine similarity of two words
def cosine_sim(parser, keyword, text, labels, relative = False):
    """Train a Word2Vec model on label-tagged text and compare the two tagged forms of a keyword.

    Every document is passed, together with its label, through
    ``parser.tag_keywords``; the tagged documents are lemmatized with
    ``parser.multi_lemmatizer``; tokens of length 1 are dropped; and a fresh
    Word2Vec model (``window=10``, ``sg=1``) is trained on the result. The
    similarity is then looked up between the tokens ``keyword + '_r'`` and
    ``keyword + '_d'``.

    Parameters
    ----------
    parser : object
        Must provide ``tag_keywords(keyword, document, label)`` and
        ``multi_lemmatizer(documents, threads=...)``.
    keyword : str
        Word whose two tagged variants are compared.
    text : sequence
        Documents to tag. Paired with ``labels`` by position.
    labels : sequence
        One label per document, in the same order as ``text``. A pandas Series
        is paired by position, not by index label, so a filtered Series that
        was never re-indexed still lines up with ``text``.
    relative : bool, default False
        If truthy, return ``wv.relative_cosine_similarity(..., topn=10)``;
        otherwise return ``wv.similarity(...)``.

    Returns
    -------
    The value returned by the selected gensim similarity method.

    Raises
    ------
    ValueError
        If ``text`` and ``labels`` differ in length.

    Notes
    -----
    Documents for which ``tag_keywords`` raises are reported on stdout and
    skipped (best effort). A new model is trained on every call and is not
    written to disk.
    """
    if len(text) != len(labels):
        raise ValueError('text and labels must have the same length')

    # tagged forms of the keyword that are looked up in the trained model;
    # presumably these are the tokens parser.tag_keywords emits -- TODO confirm
    keyword_r = keyword + '_r'
    keyword_d = keyword + '_d'

    ptweets = []
    # zip pairs documents and labels positionally; labels[i] on a pandas Series
    # would be a label-based lookup and fail on any non-default index
    for i, (document, label) in enumerate(zip(text, labels)):
        try:
            ptweets.append(parser.tag_keywords(keyword, document, label))
        except Exception as e:
            # best effort: report the document that could not be tagged and carry on
            print(e)
            print('failed at '+ keyword + str(i))

    # lemmatize
    ptweets = parser.multi_lemmatizer(ptweets, threads = 6)

    # drop single letters
    ptweets = [[word for word in tweet if len(word) > 1] for tweet in ptweets]

    # train word2vec (the model lives only for the duration of this call)
    pmodel = Word2Vec(ptweets, window = 10, sg = 1)

    # return the requested similarity between the two tagged keywords
    if relative:
        return pmodel.wv.relative_cosine_similarity(keyword_r, keyword_d, topn=10)
    return pmodel.wv.similarity(keyword_r, keyword_d)

In [3]:
# read the meta data and the aggregated tweets from disk
meta_data = pd.read_csv('Meta Data/meta_data.csv')
tweet_df = pd.read_csv('Data/aggregated_tweets.csv')

# keep tweets whose 'created' value lies between 2019-11-06 and 2019-12-16
# (both bounds inclusive; the column is compared exactly as read from the CSV)
in_window = tweet_df['created'].between('2019-11-06', '2019-12-16')
tweet_df = tweet_df[in_window]

# attach the meta data by user_id, then keep only rows whose party is R or D
tweet_df = tweet_df.merge(meta_data, how = 'inner', on = 'user_id')
is_major_party = tweet_df['party'].isin(['R', 'D'])
tweet_df = tweet_df[is_major_party].reset_index(drop=True)

# columns consumed by the rest of the notebook
tweets = tweet_df['text']
labels = tweet_df['party']

# one parser per word list: key words, base words and agree words
keyprep = TextPrep(stopwords = stop_words, key_words = key_words, key_synonyms = key_synonyms)
baseprep = TextPrep(stopwords = stop_words, key_words = base_words, key_synonyms = base_synonyms)
agreeprep = TextPrep(stopwords = stop_words, key_words = agree_words, key_synonyms = agree_synonyms)

In [4]:
%%time
# preprocess text
# NOTE: `tweets` is replaced by a list holding keyprep.twitter_preprocess(tweet) for
# every tweet, so re-running this cell feeds already-preprocessed text through
# twitter_preprocess a second time.
tweets = [keyprep.twitter_preprocess(tweet) for tweet in tweets]

CPU times: user 734 ms, sys: 4.1 ms, total: 738 ms
Wall time: 736 ms


# Get cosine sim for key words

In [5]:
%%time
# get cosine similarity for all words in the key word list
# keysim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
keysim = []
keysim_words = []
for word in key_words:
    try:
        cosine = cosine_sim(parser = keyprep, keyword = word, text = tweets, labels = labels)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        keysim.append(cosine)
        keysim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
keysimdf = pd.DataFrame(data=list(zip(keysim_words, keysim)), columns = ['word', 'similarity'])
keysimdf.to_csv('Analysis/keyword_similarity.csv', index = False)

CPU times: user 37min 5s, sys: 11.6 s, total: 37min 16s
Wall time: 27min 42s


# Get cosine sim for base words

In [6]:
%%time
# get cosine similarity for all words in the base word list
# basesim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
basesim = []
basesim_words = []
for word in base_words:
    try:
        cosine = cosine_sim(parser = baseprep, keyword = word, text = tweets, labels = labels)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        basesim.append(cosine)
        basesim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
basesimdf = pd.DataFrame(data=list(zip(basesim_words, basesim)), columns = ['word', 'similarity'])
basesimdf.to_csv('Analysis/baseword_similarity.csv', index = False)

CPU times: user 56min 48s, sys: 18.2 s, total: 57min 7s
Wall time: 41min 52s


# Get cosine sim for agree words

In [7]:
%%time
# get cosine similarity for all words in the agree word list
# agreesim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
agreesim = []
agreesim_words = []
for word in agree_words:
    try:
        cosine = cosine_sim(parser = agreeprep, keyword = word, text = tweets, labels = labels)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        agreesim.append(cosine)
        agreesim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
agreesimdf = pd.DataFrame(data=list(zip(agreesim_words, agreesim)), columns = ['word', 'similarity'])
agreesimdf.to_csv('Analysis/agreeword_similarity.csv', index = False)

CPU times: user 11min 36s, sys: 3.75 s, total: 11min 40s
Wall time: 8min 34s


# Compare results

In [8]:
# mean of the cosine similarities collected in keysim (raises ZeroDivisionError if keysim is empty)
sum(keysim)/len(keysim)

0.570209825861043

In [9]:
# mean of the cosine similarities collected in basesim (raises ZeroDivisionError if basesim is empty)
sum(basesim)/len(basesim)

0.6012013243867996

In [10]:
# mean of the cosine similarities collected in agreesim (raises ZeroDivisionError if agreesim is empty)
sum(agreesim)/len(agreesim)

0.7672532081604004

# Relative Cosine similarity tests

Key words

In [11]:
%%time
# get relative cosine similarity for all words in the key word list
# keyrelsim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
keyrelsim = []
keyrelsim_words = []
for word in key_words:
    try:
        cosine = cosine_sim(parser = keyprep, keyword = word, text = tweets, labels = labels, relative = True)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        keyrelsim.append(cosine)
        keyrelsim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
keyrelsimdf = pd.DataFrame(data=list(zip(keyrelsim_words, keyrelsim)), columns = ['word', 'relative_similarity'])
keyrelsimdf.to_csv('Data/keyword_relative_similarity.csv', index = False)

CPU times: user 34min 10s, sys: 31.4 s, total: 34min 41s
Wall time: 25min 9s


In [17]:
# mean of the relative cosine similarities collected in keyrelsim (raises ZeroDivisionError if it is empty)
sum(keyrelsim)/len(keyrelsim)

0.07482782923600337

Base words

In [18]:
%%time
# get relative cosine similarity for all words in the base word list
# baserelsim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
baserelsim = []
baserelsim_words = []
for word in base_words:
    try:
        cosine = cosine_sim(parser = baseprep, keyword = word, text = tweets, labels = labels, relative = True)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        baserelsim.append(cosine)
        baserelsim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
baserelsimdf = pd.DataFrame(data=list(zip(baserelsim_words, baserelsim)), columns = ['word', 'relative_similarity'])
baserelsimdf.to_csv('Data/baseword_relative_similarity.csv', index = False)

CPU times: user 58min 44s, sys: 52.2 s, total: 59min 36s
Wall time: 43min 26s


In [19]:
# mean of the relative cosine similarities collected in baserelsim (raises ZeroDivisionError if it is empty)
sum(baserelsim)/len(baserelsim)

0.09539504181767948

Agree words

In [20]:
%%time
# get relative cosine similarity for all words in the agree word list
# agreerelsim_words records which words actually produced a score, so that a word
# that fails cannot shift later scores onto the wrong word in the dataframe
agreerelsim = []
agreerelsim_words = []
for word in agree_words:
    try:
        cosine = cosine_sim(parser = agreeprep, keyword = word, text = tweets, labels = labels, relative = True)
    except Exception as e:
        # best effort: report the failing word and continue with the next one
        print(e)
        print('failed at ' + word)
    else:
        agreerelsim.append(cosine)
        agreerelsim_words.append(word)

# Convert to dataframe (only words that were scored get a row)
agreerelsimdf = pd.DataFrame(data=list(zip(agreerelsim_words, agreerelsim)), columns = ['word', 'relative_similarity'])
agreerelsimdf.to_csv('Data/agreeword_relative_similarity.csv', index = False)

CPU times: user 12min 44s, sys: 10.8 s, total: 12min 55s
Wall time: 9min 19s


In [21]:
# mean of the relative cosine similarities collected in agreerelsim (raises ZeroDivisionError if it is empty)
sum(agreerelsim)/len(agreerelsim)

0.09952264188657398

## Combine to a single DF

In [24]:
# Tag every result frame (in place) with the word group it belongs to
for frame, group_name in (
    (keysimdf, 'disagree'),
    (basesimdf, 'base'),
    (agreesimdf, 'agree'),
    (keyrelsimdf, 'disagree'),
    (baserelsimdf, 'base'),
    (agreerelsimdf, 'agree'),
):
    frame['label'] = group_name

# stack the cosine frames and the relative cosine frames separately
cosine_frames = pd.concat([keysimdf, basesimdf, agreesimdf])
relative_frames = pd.concat([keyrelsimdf, baserelsimdf, agreerelsimdf])

# merge into a single df: every cosine row, plus its relative score where one exists
df_merged = cosine_frames.merge(relative_frames, how='left', on = ['word', 'label'])

df_merged.to_csv("word_similarities.csv", index = False)

# testing and dev

In [9]:
import re

In [7]:
# pair each tweet with its party label, then split the text by party
countdf = pd.DataFrame(list(zip(tweets, labels)), columns = ['text', 'party'])

# Democrat rows and their text
is_dem = countdf['party'] == 'D'
dems = countdf[is_dem]
demtext = dems['text']

# Republican rows and their text
is_rep = countdf['party'] == 'R'
reps = countdf[is_rep]
reptext = reps['text']

In [10]:
# For each word, count the Democrat tweets in which the word is found.
# NOTE(review): key_words_small is not defined in any cell shown in this
# notebook -- confirm where it comes from before running on a fresh kernel.
for word in key_words_small:
    # run the synonym replacement only when the word is one of the synonym-map values
    count_text = [
        keyprep.replace_synonyms(word, tweet)
        if word in keyprep.key_synonyms.values()
        else tweet
        for tweet in list(demtext)
    ]
    # the word is used as a regular-expression pattern against the lower-cased tweet
    n_matches = sum(1 for tweet in count_text if re.search(word, tweet.lower()))
    dcount = str(n_matches)
    print(word + ':' + ' ' + dcount)

abortion: 44
administration: 568
border: 110
conservative: 12
corrupt: 273
climatechange: 293
democrat: 806
daca: 289
economy: 327
gun: 614
healthcare: 688
impeach: 983
immigration: 99
insurance: 210
liberal: 60
mcconnell: 465
oil: 103
president: 1740
pelosi: 142
police: 146
republican: 851
russia: 173
scotus: 295
tax: 316
trump: 2350
usmca: 29
wall: 98
wealth: 83
welfare: 21
whitehouse: 236


In [11]:
# For each word, count the Republican tweets in which the word is found.
# NOTE(review): key_words_small is not defined in any cell shown in this
# notebook -- confirm where it comes from before running on a fresh kernel.
for word in key_words_small:
    # run the synonym replacement only when the word is one of the synonym-map values
    count_text = [
        keyprep.replace_synonyms(word, tweet)
        if word in keyprep.key_synonyms.values()
        else tweet
        for tweet in list(reptext)
    ]
    # the word is used as a regular-expression pattern against the lower-cased tweet
    n_matches = sum(1 for tweet in count_text if re.search(word, tweet.lower()))
    rcount = str(n_matches)
    print(word + ':' + ' ' + rcount)

abortion: 22
administration: 116
border: 133
conservative: 25
corrupt: 101
climatechange: 7
democrat: 1363
daca: 6
economy: 319
gun: 42
healthcare: 143
impeach: 1706
immigration: 33
insurance: 25
liberal: 62
mcconnell: 17
oil: 36
president: 878
pelosi: 385
police: 116
republican: 355
russia: 117
scotus: 22
tax: 231
trump: 1501
usmca: 513
wall: 70
wealth: 16
welfare: 6
whitehouse: 92


In [12]:
# For each agree word, count the Republican tweets in which the word is found.
for word in agree_words:
    # run the synonym replacement only when the word is one of the synonym-map values
    count_text = [
        agreeprep.replace_synonyms(word, tweet)
        if word in agreeprep.key_synonyms.values()
        else tweet
        for tweet in list(reptext)
    ]
    # the word is used as a regular-expression pattern against the lower-cased tweet
    n_matches = sum(1 for tweet in count_text if re.search(word, tweet.lower()))
    rcount = str(n_matches)
    print(word + ':' + ' ' + rcount)

cancer: 43
education: 77
infrastructure: 105
isis: 115
kurd: 6
service: 646
terrorism: 80
veteran: 855


In [13]:
# For each agree word, count the Democrat tweets in which the word is found.
for word in agree_words:
    # run the synonym replacement only when the word is one of the synonym-map values
    count_text = [
        agreeprep.replace_synonyms(word, tweet)
        if word in agreeprep.key_synonyms.values()
        else tweet
        for tweet in list(demtext)
    ]
    # the word is used as a regular-expression pattern against the lower-cased tweet
    n_matches = sum(1 for tweet in count_text if re.search(word, tweet.lower()))
    dcount = str(n_matches)
    print(word + ':' + ' ' + dcount)

cancer: 87
education: 304
infrastructure: 124
isis: 451
kurd: 32
service: 1248
terrorism: 61
veteran: 1284


In [8]:
# For each base word, print how many Republican and Democrat tweets contain it
# as a plain substring (no lower-casing or synonym replacement in this cell).
for word in base_words:
    rep_hits = sum(1 for tweet in reptext if word in tweet)
    dem_hits = sum(1 for tweet in demtext if word in tweet)
    rcount = str(rep_hits)
    dcount = str(dem_hits)
    print(word + ':' + ' ' + rcount +', ' + dcount)

answer: 148, 250
annual: 54, 125
able: 617, 1694
bring: 185, 313
cancer: 33, 71
come: 374, 740
chance: 48, 119
 day : 318, 447
entire: 82, 98
far: 346, 296
find: 99, 310
go: 1374, 2398
get: 801, 1421
hear: 909, 1372
help: 697, 1348
host: 138, 281
hold: 246, 606
 join : 244, 464
look: 275, 374
long: 315, 577
like: 414, 778
live: 551, 1096
month: 181, 321
matter: 65, 166
member: 503, 863
morning: 261, 353
meet: 285, 490
night: 212, 318
near: 108, 242
opportunity: 172, 320
open: 172, 406
plan: 147, 545
place: 134, 437
phone: 145, 135
read: 281, 464
receive: 156, 199
recent: 133, 150
sure: 556, 1349
send: 81, 147
share: 110, 245
small: 175, 271
staff: 129, 224
shut: 39, 54
thanksgiving: 9, 6
 thank : 230, 289
think: 129, 198
take: 357, 833
today: 944, 1530
talk: 273, 303
weekend: 81, 139
 week : 235, 273
yesterday: 158, 174


["Despite all the challenges facing our nation, Democrats have focused on nothing but getting rid of @realDonaldTrump since the day he was elected. \r\n\r\n❌They failed with collusion. \r\n❌They failed with the Mueller Report. \r\n❌And they're going to fail again with their #ShamImpeachment https://t.co/RC2x7EMLfr",
 "Devastating headline for Democrats:\r\n\r\n🚨 ‘No One Believes Anything’: Voters Worn Out by a Fog of Political News 🚨\r\n\r\nFrom collusion to obstruction of justice to abuse of power to quid pro quo to bribery/extortion, Democrats have cried 'wolf' too many times. https://t.co/aD7SSdbMdD",
 'The past three years have been marked by false and misleading stories and narratives to perpetuate the Democrats’ Russian collusion hoax and Ukrainian phone call hoax.\r\n\r\nIt’s time to end this sham against @realDonaldTrump and those who support him. https://t.co/6lh5wPA18s',
 '🚨 More collusion between Dems &amp; their star witnesses revealed:\r\n\r\nYovanovitch communicated with 

In [27]:
# Scratch check: is 'administration' present in prep.key_synonyms?
# NOTE(review): `prep` is not defined in any cell shown in this notebook (the
# parsers created above are keyprep, baseprep and agreeprep) -- confirm which
# object is meant before running on a fresh kernel.
'administration' in prep.key_synonyms

False

In [30]:
# Scratch check: print the first tweet with synonyms replaced (only when
# 'administration' is in prep.key_synonyms), then print the tweet unchanged.
# NOTE(review): `prep` is not defined in any cell shown in this notebook --
# confirm which parser is meant before running on a fresh kernel.
if 'administration' in prep.key_synonyms:
    print(prep.replace_synonyms('administration', tweets[0]))
print(tweets[0])


im live this morning on kfor with laceylett great talk on reducing prescription drug costs and my upcoming community conversations in oklahoma
