# Define Environment

In [26]:
pip freeze

anaconda-client==1.7.2
anaconda-navigator==1.9.7
asn1crypto==1.2.0
astroid==2.2.5
atomicwrites==1.3.0
attrs==19.3.0
autopep8==1.4.4
backcall==0.1.0
backports.functools-lru-cache==1.5
backports.tempfile==1.0
backports.weakref==1.0.post1
beautifulsoup4==4.8.1
bleach==3.1.0
blis==0.2.4
boto==2.49.0
boto3==1.9.222
botocore==1.12.222
certifi==2019.9.11
cffi==1.13.1
chardet==3.0.4
Click==7.0
clyent==1.2.2
conda==4.7.12
conda-build==3.18.8
conda-package-handling==1.6.0
conda-verify==3.4.2
cryptography==2.8
cycler==0.10.0
cymem==2.0.2
decorator==4.4.1
defusedxml==0.6.0
demoji==0.1.5
docutils==0.15.2
en-core-web-sm==2.1.0
entrypoints==0.3
filelock==3.0.12
funcy==1.13
future==0.17.1
fuzzyset==0.0.19
gensim==3.8.0
glob2==0.7
idna==2.8
importlib-metadata==0.23
ipykernel==5.1.3
ipython==7.9.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
isort==4.3.21
jedi==0.14.1
Jinja2==2.10.3
jmespath==0.9.4
joblib==0.13.2
json5==0.8.5
jsonschema==3.1.1
jupyter-client==5.3.4
jupyter-core==4.6.0
jupyterlab==1.1.4
jup

In [1]:
# Imports and working-directory setup for the notebook.
import os
# Move into the project root; the relative chdir calls below and the relative
# data paths used by later cells are resolved against this directory.
# NOTE(review): absolute, machine-specific path -- the notebook will not run on
# another machine without editing this line.
os.chdir('/home/mike/Desktop/Word Embeddings')
import pandas as pd
import pickle
from gensim.models import Word2Vec
from mpl_toolkits.mplot3d import Axes3D
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import random

# TextPrep is imported while the working directory is 'Text Processing';
# presumably this is what makes the local module importable -- confirm.
os.chdir('./Text Processing')
from TextPrep import TextPrep

# Same approach for the keyword and stop-word lists kept in 'Meta Data'.
os.chdir('../Meta Data')
from key_words import key_words_small, key_synonyms, base_words, base_synonyms
from stop_words import stop_words
# Step back up to the project root set at the top of the cell.
os.chdir('..')

In [2]:
# define functions for analyzing the word vectors

# gets most similar words
def similar_words(word, model, topn):
    """Return the words most similar to `word`, followed by `word` itself.

    Parameters
    ----------
    word : str
        Query word passed to ``model.wv.most_similar``.
    model : object
        Any object exposing ``wv.most_similar(word, topn=...)`` that returns a
        ranked sequence whose entries hold the neighbour word at index 0.
    topn : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to `topn` neighbour words in ranked order, with `word` appended as
        the final element. If the model returns fewer than `topn` neighbours,
        the available ones are returned instead of raising ``IndexError``.
    """
    # Nothing to look up: keep the original behaviour of not querying the model.
    if topn <= 0:
        return [word]
    # Query once; previously the full similarity search was repeated on every
    # loop iteration just to read a single entry out of it.
    neighbours = model.wv.most_similar(word, topn = topn)
    return [entry[0] for entry in neighbours[:topn]] + [word]

# returns a dictionary mapping each word (key) to its label (value): the first label for words only in words1, the second label for words only in words2, and 'Both' for words found in both lists
def similar_dict(words1, words2, labels):
    """Map every word from the two lists to a group label.

    Words found only in `words1` get ``labels[0]``, words found only in
    `words2` get ``labels[1]``, and words present in both lists get ``'Both'``.
    The `labels` list passed in is not modified.
    """
    shared = [w for w in words1 if w in words2]
    only_first = [w for w in words1 if w not in shared]
    only_second = [w for w in words2 if w not in shared]
    groups = [only_first, only_second, shared]

    all_labels = labels + ['Both']
    mapping = {}
    # Pair each label with the group stored at the same position
    for position, label in enumerate(all_labels):
        mapping.update(dict.fromkeys(groups[position], label))
    return mapping

# Converts pca model to a data frame for plotting
def pca2df(pcamodel, embedding, dictionary):
    """Build a plotting data frame from three principal components.

    `pcamodel` supplies the three component columns (one row per vocabulary
    entry of `embedding.wv.vocab`, in the same order). Only rows whose word is
    a key of `dictionary` are kept; each kept row gets a ``label`` looked up in
    `dictionary` and a ``color`` looked up from that label.
    """
    palette = {'Democrat': 'blue', 'Republican':'red', 'Both': 'purple', 'A': 'blue', 'B':'red'}

    # one row per vocabulary word, components first and the word alongside
    frame = pd.DataFrame(data = pcamodel, columns = ['pc1', 'pc2', 'pc3'])
    frame['word'] = list(embedding.wv.vocab)

    # restrict to the words we have a label for
    wanted = list(dictionary)
    frame = frame[frame['word'].isin(wanted)].reset_index(drop=True)

    frame['label'] = frame['word'].map(dictionary)
    frame['color'] = [palette[label] for label in frame['label']]
    return frame

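# tags the keyword in every text by its label, trains a Word2Vec model on the result, and returns the cosine similarity between the keyword's '_r' and '_d' variants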
def cosine_sim(parser, keyword, text, labels):
    """Train a Word2Vec model on keyword-tagged text and compare the two tags.

    Every entry of `text` is passed with its label to ``parser.tag_keywords``;
    entries that raise are reported on stdout and skipped. The tagged texts are
    lemmatized with ``parser.multi_lemmatizer``, single-character tokens are
    dropped, and a skip-gram Word2Vec model (window 10) is trained on them.

    Returns the model's similarity between ``keyword + '_r'`` and
    ``keyword + '_d'``.
    """
    # the two tagged spellings of the keyword that get compared at the end
    tagged_r = keyword + '_r'
    tagged_d = keyword + '_d'

    tagged_docs = []
    for idx in range(len(text)):
        try:
            tagged = parser.tag_keywords(keyword, text[idx], labels[idx])
            tagged_docs.append(tagged)
        except Exception as err:
            # best effort: report the failing entry and carry on with the rest
            print(err)
            print('failed at '+ keyword + str(idx))

    # lemmatize
    lemma_docs = parser.multi_lemmatizer(tagged_docs, threads = 6)

    # strip out single-character tokens, document by document
    for pos, doc in enumerate(lemma_docs):
        lemma_docs[pos] = [token for token in doc if len(token) > 1]

    # fit the embedding and read off the similarity of the tagged pair
    embedding = Word2Vec(lemma_docs, window = 10, sg = 1)
    return embedding.wv.similarity(tagged_r, tagged_d)

In [3]:
# read the account meta data, then build the tweet frame in one chain
meta_data = pd.read_csv('Meta Data/meta_data.csv')
tweet_df = (
    pd.read_csv('Data/aggregated_tweets.csv')
    # keep tweets whose 'created' value is on or after 2019-11-06
    .loc[lambda frame: frame['created'] >= '2019-11-06']
    # attach the meta data for each account
    .merge(meta_data, how = 'inner', on = 'user_id')
    # keep only accounts labelled R or D
    .loc[lambda frame: frame.party.isin(['R', 'D'])]
    .reset_index(drop=True)
)

tweets = tweet_df['text']
labels = tweet_df['party']

# one parser per word list: key words and base words
keyprep = TextPrep(stopwords = stop_words, key_words = key_words_small, key_synonyms = key_synonyms)
baseprep = TextPrep(stopwords = stop_words, key_words = base_words, key_synonyms = base_synonyms)

In [4]:
%%time
# run the twitter preprocessing over every tweet and keep the results as a list
tweets = list(map(keyprep.twitter_preprocess, tweets))

NameError: name 'prep' is not defined

# Get cosine similarity for key and base words

In [5]:
%%time
# cosine similarity for the first five words of the key word list
key_subset = key_words_small[0:5]
keysim = [
    cosine_sim(parser = keyprep, keyword = word, text = tweets, labels = labels)
    for word in key_subset
]

# pair each word with its similarity and write the table to disk
keysimdf = pd.DataFrame(data=list(zip(key_subset, keysim)), columns = ['word', 'similarity'])
keysimdf.to_csv('keyword_similarity.csv', index = False)

CPU times: user 11min 32s, sys: 36 s, total: 12min 8s
Wall time: 4min 5s


In [8]:
# cosine similarity for the first five words of the base word list
base_subset = base_words[0:5]
basesim = [
    cosine_sim(parser = baseprep, keyword = word, text = tweets, labels = labels)
    for word in base_subset
]

# pair each word with its similarity and write the table to disk
basesimdf = pd.DataFrame(data=list(zip(base_subset, basesim)), columns = ['word', 'similarity'])
basesimdf.to_csv('baseword_similarity.csv', index = False)

# testing and dev

In [27]:
# `prep` is not defined anywhere in this notebook; use the key-word parser
# (`keyprep`) that the tweets were preprocessed with.
# NOTE(review): confirm keyprep (rather than baseprep) is the intended parser.
'administration' in keyprep.key_synonyms

False

In [30]:
# `prep` is not defined anywhere in this notebook; use the key-word parser
# (`keyprep`) that the tweets were preprocessed with.
# NOTE(review): confirm keyprep (rather than baseprep) is the intended parser.
if 'administration' in keyprep.key_synonyms:
    print(keyprep.replace_synonyms('administration', tweets[0]))
print(tweets[0])


im live this morning on kfor with laceylett great talk on reducing prescription drug costs and my upcoming community conversations in oklahoma
