# Define Environment

In [1]:
import os
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this directory exists. Consider a configurable project root.
os.chdir('C:/Users/Mike/Desktop/word_embeddings-master')
import pandas as pd
# NOTE(review): pickle, PCA and random are not referenced in the cells shown
# here — confirm they are still needed.
import pickle
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import random

# Step into each project folder before importing from it; presumably the
# imports resolve against the current working directory — confirm.
os.chdir('./Text Processing')
from TextPrep import TextPrep

os.chdir('../Meta Data')
from key_words import key_words_small, key_synonyms, base_words, base_synonyms
from stop_words import stop_words
# Back to the project root: later cells read 'Meta Data/...' and 'Data/...'
# through relative paths.
os.chdir('..')

In [2]:
# define functions for analyzing the word vectors
# returns the cosine similarity of two words
def cosine_sim(parser, keyword, text, labels, threads = 16):
    """Train Word2Vec on label-tagged text and compare the two tagged keyword forms.

    Every document in ``text`` is passed, together with the label at the same
    position in ``labels``, to ``parser.tag_keywords``. The tagged documents are
    lemmatized with ``parser.multi_lemmatizer``, single-character tokens are
    dropped, and a Word2Vec model (``window = 10``, ``sg = 1``) is trained on
    the result.

    Parameters
    ----------
    parser : object
        Must provide ``tag_keywords(keyword, document, label)`` and
        ``multi_lemmatizer(documents, threads = ...)``. The lemmatizer is
        expected to return one iterable of tokens per document.
    keyword : str
        Word whose tagged forms ``keyword + '_r'`` and ``keyword + '_d'`` are
        compared. Presumably ``tag_keywords`` produces these suffixes from the
        labels — confirm against TextPrep.
    text : iterable
        Documents to tag. Paired with ``labels`` by position, so a pandas
        Series with any index works.
    labels : iterable
        One label per document, in the same order as ``text``.
    threads : int, optional
        Thread count forwarded to ``parser.multi_lemmatizer`` (default 16, the
        value that used to be hard-coded).

    Returns
    -------
    The value of ``pmodel.wv.similarity(keyword + '_r', keyword + '_d')``.

    Raises
    ------
    Whatever ``pmodel.wv.similarity`` raises when a tagged form is missing from
    the trained vocabulary; a saved run of this notebook printed
    "word 'collusion_d' not in vocabulary".
    """
    # Tagged keyword forms. TODO: to generalize, derive one form per unique label.
    keyword_r = keyword + '_r'
    keyword_d = keyword + '_d'

    # Materialise both inputs so documents and labels are paired by position.
    # Indexing a pandas Series with text[i] is label-based, so a Series whose
    # index is not 0..n-1 used to fail on every row and be silently skipped.
    documents = list(text)
    tags = list(labels)

    ptweets = []
    for i, document in enumerate(documents):
        try:
            ptweets.append(parser.tag_keywords(keyword, document, tags[i]))
        except Exception as e:
            # best effort: report the document that could not be tagged and go on
            print(e)
            print('failed at '+ keyword + str(i))

    # lemmatize
    ptweets = parser.multi_lemmatizer(ptweets, threads = threads)

    # drop single letters
    ptweets = [[word for word in tokens if len(word) > 1] for tokens in ptweets]

    # train word2vec (the model is used only for this comparison; it is not saved)
    pmodel = Word2Vec(ptweets, window = 10, sg = 1)

    # return cosine similarity between the two tagged forms
    return pmodel.wv.similarity(keyword_r, keyword_d)

In [3]:
# Read the per-user meta data first, then build the working tweet table in one chain.
meta_data = pd.read_csv('Meta Data/meta_data.csv')
tweet_df = (
    pd.read_csv('Data/aggregated_tweets.csv')
    # keep rows whose 'created' value compares >= '2019-11-06'
    # NOTE(review): an earlier comment here said "after oct 29" — confirm the intended cutoff
    .loc[lambda frame: frame['created'] >= '2019-11-06']
    # attach the meta data; only users present in both tables survive
    .merge(meta_data, how = 'inner', on = 'user_id')
    # restrict to the two parties being compared and renumber the rows
    .loc[lambda frame: frame['party'].isin(['R', 'D'])]
    .reset_index(drop=True)
)

# text and party label columns used by the similarity cells
tweets, labels = tweet_df['text'], tweet_df['party']

# one parser configured with the key words, one with the base words
keyprep = TextPrep(
    stopwords = stop_words,
    key_words = key_words_small,
    key_synonyms = key_synonyms,
)
baseprep = TextPrep(
    stopwords = stop_words,
    key_words = base_words,
    key_synonyms = base_synonyms,
)

In [4]:
%%time
# preprocess text
# Read from tweet_df['text'] (the column `tweets` was taken from) instead of
# from `tweets` itself, so re-running this cell does not preprocess text that
# has already been preprocessed.
tweets = [keyprep.twitter_preprocess(tweet) for tweet in tweet_df['text']]

Wall time: 676 ms


# Get cosine similarity for key words and base words

In [16]:
# Show which key word(s) the [5:6] slice of key_words_small selects.
for selected_word in key_words_small[5:6]:
    print(selected_word)

collusion


In [19]:
%%time
# get cosine similarity for the selected slice of the key word list
keysim = []
# Words whose similarity was actually computed, kept in step with keysim so
# that a failed word cannot shift later scores onto the wrong word.
keysim_words = []
for word in key_words_small[5:6]:
    try:
        cosine = cosine_sim(parser = keyprep, keyword = word, text = tweets, labels = labels)
    except Exception as e:
        # best effort: report the failure and carry on with the next word
        print(e)
        print('failed at ' + word)
    else:
        keysim.append(cosine)
        keysim_words.append(word)

# Convert to dataframe (left disabled, as before; pairs each score with the
# word it was computed for rather than with a fixed slice of the word list)
#keysimdf = pd.DataFrame(data=list(zip(keysim_words, keysim)), columns = ['word', 'similarity'])
#keysimdf.to_csv('keyword_similarity.csv', index = False)

"word 'collusion_d' not in vocabulary"
failed at collusion
Wall time: 1min 9s


In [28]:
# Display the similarities collected by the key word loop.
# NOTE(review): the saved output below holds five values, while the visible
# loop covers a single key word slice and its saved output reports a failure;
# the output appears to come from a different run. Re-run to confirm.
keysim

[0.6810423, 0.51987475, 0.54324013, 0.82438076, 0.70095146]

In [8]:
# Cosine similarity for the first five base words, in list order.
basesim = []
for word in base_words[0:5]:
    basesim.append(
        cosine_sim(parser = baseprep, keyword = word, text = tweets, labels = labels)
    )

# Pair every word with its score and write the table to CSV without the index.
basesimdf = pd.DataFrame(
    data=list(zip(base_words[0:5], basesim)),
    columns = ['word', 'similarity'],
)
basesimdf.to_csv('baseword_similarity.csv', index = False)

# Testing and development

In [5]:
import re

In [6]:
# Split the merged frame by party ('D' first, then 'R') and pull out each side's tweet text.
dems, reps = (tweet_df[tweet_df['party'] == party] for party in ('D', 'R'))
demtext, reptext = dems['text'], reps['text']

In [7]:
# For each key word, count the democrat tweets that mention it.
# A tweet is counted once when the word (used as a regex pattern) matches
# anywhere in its lower-cased text, so this is a tweet count, not an
# occurrence count.
for word in key_words_small:
    # Whether the word has synonyms depends only on the word, so check once
    # per word instead of once per tweet.
    if word in keyprep.key_synonyms.values():
        count_text = [keyprep.replace_synonyms(word, text) for text in demtext]
    else:
        count_text = list(demtext)
    dcount = str(sum(1 for text in count_text if re.search(word, text.lower())))
    print(word + ':' + ' ' + dcount)

abortion: 44
administration: 557
border: 104
biden: 87
conservative: 12
collusion: 0
clinton: 26
campaignfinance: 7
corrupt: 271
court: 309
climatechange: 242
cnn: 81
democrat: 546
daca: 289
economy: 317
electoralcollege: 0
education: 302
fox: 35
gun: 611
god: 32
gerrymander: 11
healthcare: 622
impeach: 972
immigration: 96
insurance: 205
liberal: 32
mueller: 12
mikepence: 0
mcconnell: 412
mexico: 100
media: 175
news: 392
obama: 37
oil: 100
president: 1720
pelosi: 140
police: 81
republican: 612
russia: 173
religion: 14
racist: 56
refugee: 40
sanders: 30
socialist: 0
scotus: 134
tax: 314
trump: 2239
usmca: 29
wall: 97
wealth: 81
welfare: 21
warren: 31
whitehouse: 19


In [8]:
# For each key word, count the republican tweets that mention it.
# A tweet is counted once when the word (used as a regex pattern) matches
# anywhere in its lower-cased text, so this is a tweet count, not an
# occurrence count.
for word in key_words_small:
    # Whether the word has synonyms depends only on the word, so check once
    # per word instead of once per tweet.
    if word in keyprep.key_synonyms.values():
        count_text = [keyprep.replace_synonyms(word, text) for text in reptext]
    else:
        count_text = list(reptext)
    rcount = str(sum(1 for text in count_text if re.search(word, text.lower())))
    print(word + ':' + ' ' + rcount)

abortion: 22
administration: 116
border: 133
biden: 91
conservative: 25
collusion: 33
clinton: 32
campaignfinance: 0
corrupt: 99
court: 116
climatechange: 6
cnn: 26
democrat: 1141
daca: 6
economy: 313
electoralcollege: 0
education: 76
fox: 221
gun: 48
god: 126
gerrymander: 0
healthcare: 136
impeach: 1686
immigration: 33
insurance: 25
liberal: 59
mueller: 24
mikepence: 0
mcconnell: 16
mexico: 102
media: 137
news: 590
obama: 95
oil: 38
president: 869
pelosi: 377
police: 59
republican: 188
russia: 115
religion: 9
racist: 1
refugee: 3
sanders: 5
socialist: 32
scotus: 2
tax: 226
trump: 1290
usmca: 508
wall: 69
wealth: 16
welfare: 6
warren: 17
whitehouse: 35


In [8]:
# For every base word, print how many republican and democrat tweets contain it
# (plain, case-sensitive substring test on the tweet text).
for word in base_words:
    rcount = str(sum(1 for tweet in reptext if word in tweet))
    dcount = str(sum(1 for tweet in demtext if word in tweet))
    print(word + ':' + ' ' + rcount +', ' + dcount)

answer: 148, 250
annual: 54, 125
able: 617, 1694
bring: 185, 313
cancer: 33, 71
come: 374, 740
chance: 48, 119
 day : 318, 447
entire: 82, 98
far: 346, 296
find: 99, 310
go: 1374, 2398
get: 801, 1421
hear: 909, 1372
help: 697, 1348
host: 138, 281
hold: 246, 606
 join : 244, 464
look: 275, 374
long: 315, 577
like: 414, 778
live: 551, 1096
month: 181, 321
matter: 65, 166
member: 503, 863
morning: 261, 353
meet: 285, 490
night: 212, 318
near: 108, 242
opportunity: 172, 320
open: 172, 406
plan: 147, 545
place: 134, 437
phone: 145, 135
read: 281, 464
receive: 156, 199
recent: 133, 150
sure: 556, 1349
send: 81, 147
share: 110, 245
small: 175, 271
staff: 129, 224
shut: 39, 54
thanksgiving: 9, 6
 thank : 230, 289
think: 129, 198
take: 357, 833
today: 944, 1530
talk: 273, 303
weekend: 81, 139
 week : 235, 273
yesterday: 158, 174


["Despite all the challenges facing our nation, Democrats have focused on nothing but getting rid of @realDonaldTrump since the day he was elected. \r\n\r\n❌They failed with collusion. \r\n❌They failed with the Mueller Report. \r\n❌And they're going to fail again with their #ShamImpeachment https://t.co/RC2x7EMLfr",
 "Devastating headline for Democrats:\r\n\r\n🚨 ‘No One Believes Anything’: Voters Worn Out by a Fog of Political News 🚨\r\n\r\nFrom collusion to obstruction of justice to abuse of power to quid pro quo to bribery/extortion, Democrats have cried 'wolf' too many times. https://t.co/aD7SSdbMdD",
 'The past three years have been marked by false and misleading stories and narratives to perpetuate the Democrats’ Russian collusion hoax and Ukrainian phone call hoax.\r\n\r\nIt’s time to end this sham against @realDonaldTrump and those who support him. https://t.co/6lh5wPA18s',
 '🚨 More collusion between Dems &amp; their star witnesses revealed:\r\n\r\nYovanovitch communicated with 

In [27]:
# Check whether 'administration' is a key of the parser's key_synonyms mapping.
# NOTE(review): `prep` is not defined in any cell shown here (the parsers
# created above are `keyprep` and `baseprep`); presumably left over from an
# earlier session. Confirm which parser is meant.
'administration' in prep.key_synonyms

False

In [30]:
# When 'administration' is a key of key_synonyms, print the result of
# replace_synonyms for the first tweet; the first tweet itself is printed
# either way.
# NOTE(review): `prep` is not defined in any cell shown here (the parsers
# created above are `keyprep` and `baseprep`). Confirm which parser is meant.
if 'administration' in prep.key_synonyms:
    print(prep.replace_synonyms('administration', tweets[0]))
print(tweets[0])


im live this morning on kfor with laceylett great talk on reducing prescription drug costs and my upcoming community conversations in oklahoma
