# Define Environment

In [3]:
# NOTE(review): absolute, machine-specific project path -- the notebook only
# runs as-is on a machine where this directory exists.
import os
os.chdir('/home/mike/Desktop/Word Embeddings')
import pandas as pd
import pickle
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import random

# The working directory is moved into each project sub-folder before importing
# from it -- presumably so the bare module names below resolve from the current
# directory; the order of chdir/import statements therefore matters.
os.chdir('./Text Processing')
from TextPrep import TextPrep

os.chdir('../Meta Data')
from key_words import key_words_small, key_synonyms, base_words, base_synonyms
from stop_words import stop_words
# step back up to the project root; later cells use paths relative to it
os.chdir('..')

In [4]:
# define functions for analyzing the word vectors

# gets most similar words
def similar_words(word, model, topn):
    """Return the words most similar to ``word``, followed by ``word`` itself.

    Parameters
    ----------
    word : str
        Query token handed to ``model.wv.most_similar``.
    model : object
        Anything exposing ``wv.most_similar(word, topn=...)`` that returns a
        sequence of ``(word, score)`` pairs (e.g. a trained Word2Vec model).
    topn : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to ``topn`` neighbour words in the order the model returned them,
        with ``word`` appended as the last element.
    """
    # Query the model once. The previous version repeated the identical query
    # for every rank and raised IndexError whenever the model returned fewer
    # than ``topn`` neighbours.
    neighbours = model.wv.most_similar(word, topn = topn)
    return [pair[0] for pair in neighbours[:topn]] + [word]

# returns a dictionary mapping each word to a label: the matching entry of `labels` for words found in only one list, 'Both' for words found in both lists
def similar_dict(words1, words2, labels):
    """Label every word by the list(s) it appears in.

    Parameters
    ----------
    words1, words2 : list of hashable
        The two word lists to compare.
    labels : sequence of length 2
        ``labels[0]`` is assigned to words found only in ``words1`` and
        ``labels[1]`` to words found only in ``words2``.

    Returns
    -------
    dict
        ``{word: label}``; words present in both lists are labelled ``'Both'``.
        Insertion order is: words only in ``words1``, words only in
        ``words2``, then the shared words (in ``words1`` order).

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two entries. Any other length
        previously produced misaligned labels or a late IndexError.
    """
    if len(labels) != 2:
        raise ValueError('labels must contain exactly two entries, got %d' % len(labels))
    first_label, second_label = labels

    # set lookups keep the membership tests linear instead of quadratic
    in_first = set(words1)
    in_second = set(words2)

    dictionary = {}
    for word in words1:
        if word not in in_second:
            dictionary[word] = first_label
    for word in words2:
        if word not in in_first:
            dictionary[word] = second_label
    for word in words1:
        if word in in_second:
            dictionary[word] = 'Both'
    return dictionary

# Converts pca model to a data frame for plotting
def pca2df(pcamodel, embedding, dictionary, colors=None):
    """Build a plotting data frame from 3-component PCA output.

    Parameters
    ----------
    pcamodel : array-like with three columns
        Component values, one row per vocabulary word. Rows are assumed to be
        in the same order as iteration over the embedding's vocabulary
        mapping -- TODO confirm against the code that produces them.
    embedding : object
        Model exposing ``wv.vocab`` (gensim < 4) or ``wv.key_to_index``
        (gensim >= 4).
    dictionary : dict
        ``{word: label}``; only words that are keys of this mapping are kept.
    colors : dict, optional
        ``{label: colour}``. Defaults to the built-in party / A-B palette.

    Returns
    -------
    pandas.DataFrame
        Columns ``pc1``, ``pc2``, ``pc3``, ``word``, ``label``, ``color``.

    Raises
    ------
    KeyError
        If a label in ``dictionary`` has no entry in ``colors``.
    """
    if colors is None:
        colors = {'Democrat': 'blue', 'Republican':'red', 'Both': 'purple', 'A': 'blue', 'B':'red'}

    # convert the pca element to a df
    pc_df = pd.DataFrame(data = pcamodel, columns = ['pc1', 'pc2', 'pc3'])

    # gensim 4 removed ``wv.vocab`` (accessing it raises AttributeError) in
    # favour of ``wv.key_to_index``; prefer the legacy attribute when present
    # so behaviour on older versions is unchanged.
    keyed_vectors = embedding.wv
    vocab = getattr(keyed_vectors, 'vocab', None)
    if vocab is None:
        vocab = keyed_vectors.key_to_index
    pc_df['word'] = list(vocab)

    # keep only the components whose word has a label (dict keys are already unique)
    pc_df = pc_df[pc_df['word'].isin(list(dictionary))].reset_index(drop=True)
    pc_df['label'] = pc_df['word'].map(dictionary)
    pc_df['color'] = [colors[label] for label in pc_df['label']]
    return pc_df

# trains a Word2Vec model on the keyword-tagged text and returns the cosine similarity between the keyword's '_r' and '_d' variants
def cosine_sim(parser, keyword, text, labels):
    """Cosine similarity between the ``_r`` and ``_d`` variants of ``keyword``.

    Each document is passed through ``parser.tag_keywords`` together with its
    label, the tagged documents are lemmatized, one-character tokens are
    dropped, and a fresh Word2Vec model is fitted on the result.

    Parameters
    ----------
    parser : object providing ``tag_keywords`` and ``multi_lemmatizer``
    keyword : str
        Base keyword; the compared tokens are ``keyword + '_r'`` and
        ``keyword + '_d'`` (presumably produced by ``tag_keywords`` from the
        labels -- verify against TextPrep).
    text : indexable, sized collection of documents
    labels : collection indexed in parallel with ``text``

    Returns
    -------
    The value of ``model.wv.similarity`` for the two tagged tokens.

    Notes
    -----
    Documents whose tagging raises are reported on stdout and skipped.
    The fitted model is not stored anywhere.
    """
    # TODO: generalize by deriving one tagged token per unique label
    tagged_r = keyword + '_r'
    tagged_d = keyword + '_d'

    tagged_docs = []
    for idx in range(len(text)):
        try:
            tagged = parser.tag_keywords(keyword, text[idx], labels[idx])
        except Exception as err:
            # best effort: report the failing document and carry on
            print(err)
            print('failed at '+ keyword + str(idx))
        else:
            tagged_docs.append(tagged)

    # lemmatize
    lemmatized = parser.multi_lemmatizer(tagged_docs, threads = 6)

    # strip one-character tokens from every document, in place
    for pos, tokens in enumerate(lemmatized):
        lemmatized[pos] = [token for token in tokens if len(token) > 1]

    # fit a skip-gram model (sg = 1) on the tagged corpus
    model = Word2Vec(lemmatized, window = 10, sg = 1)

    return model.wv.similarity(tagged_r, tagged_d)

In [5]:
# read the account meta data and the aggregated tweets
meta_data = pd.read_csv('Meta Data/meta_data.csv')
tweet_df = pd.read_csv('Data/aggregated_tweets.csv')

# keep tweets created on or after 2019-11-06 and attach the meta data by user_id
# NOTE(review): an earlier comment here said "after oct 29" -- confirm the intended cutoff
tweet_df = tweet_df.loc[tweet_df['created'] >= '2019-11-06'].merge(meta_data, how = 'inner', on = 'user_id')

# restrict to accounts whose party is R or D and renumber the rows
tweet_df = tweet_df.loc[tweet_df['party'].isin(['R', 'D'])].reset_index(drop=True)

tweets, labels = tweet_df['text'], tweet_df['party']

# one parser per word list: key words and base words
keyprep = TextPrep(stopwords = stop_words, key_words = key_words_small, key_synonyms = key_synonyms)
baseprep = TextPrep(stopwords = stop_words, key_words = base_words, key_synonyms = base_synonyms)

In [6]:
%%time
# run every tweet through the key-word parser's twitter preprocessing
# (rebinds `tweets`, so re-running this cell preprocesses already-processed text)
tweets = list(map(keyprep.twitter_preprocess, tweets))

CPU times: user 598 ms, sys: 487 µs, total: 598 ms
Wall time: 597 ms


# Get cosine similarity for key and base words

In [7]:
%%time
# cosine similarity for the first five key words (one Word2Vec fit per word)
keysim = []
for word in key_words_small[0:5]:
    cosine = cosine_sim(keyprep, word, tweets, labels)
    keysim.append(cosine)

# pair each word with its score and write the table to disk
keysimdf = pd.DataFrame(list(zip(key_words_small[0:5], keysim)), columns = ['word', 'similarity'])
keysimdf.to_csv('keyword_similarity.csv', index = False)

CPU times: user 4min 25s, sys: 14 s, total: 4min 39s
Wall time: 3min 19s


In [8]:
# cosine similarity for the first five base words (one Word2Vec fit per word)
basesim = []
for word in base_words[0:5]:
    cosine = cosine_sim(baseprep, word, tweets, labels)
    basesim.append(cosine)

# pair each word with its score and write the table to disk
basesimdf = pd.DataFrame(list(zip(base_words[0:5], basesim)), columns = ['word', 'similarity'])
basesimdf.to_csv('baseword_similarity.csv', index = False)

# Testing and development

In [27]:
# NOTE(review): `prep` is not defined in any cell visible in this notebook
# (only `keyprep` and `baseprep` are) -- this fails on a fresh kernel; confirm
# which parser was meant.
'administration' in prep.key_synonyms

False

In [30]:
# NOTE(review): `prep` is not defined in any cell visible in this notebook
# (only `keyprep` and `baseprep` are) -- confirm which parser was meant.
# Print the synonym-replaced first tweet only when 'administration' is a key
# synonym; the first tweet itself is printed unconditionally.
if 'administration' in prep.key_synonyms:
    print(prep.replace_synonyms('administration', tweets[0]))
print(tweets[0])


im live this morning on kfor with laceylett great talk on reducing prescription drug costs and my upcoming community conversations in oklahoma
