In [1]:
import os
# Move into the project root so the relative paths used in this notebook resolve.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
os.chdir('/home/mike/Desktop/Word Embeddings')
import pandas as pd
from gensim.models import Word2Vec
import random

# Step into the folder holding the local TextPrep module before importing it.
# NOTE(review): presumably relies on the current directory being on sys.path -- confirm.
os.chdir('./Text Processing')
from TextPrep import TextPrep

# Same pattern for the local keyword / stop-word modules under 'Meta Data'.
os.chdir('../Meta Data')
from key_words import key_words, key_synonyms
from stop_words import stop_words
# Return to the project root; later cells read and write paths relative to it.
os.chdir('..')

In [2]:
# Cosine similarity between the two label-tagged variants of a keyword.
# NOTE: relies on the notebook-level `prep` (TextPrep instance), which must be
# created before this function is called.
# TODO: decide whether the most similar words should be returned as well.
# TODO: generalize beyond the two hard-coded '_r' / '_d' tags by looping over
# the unique labels.

def cosine_sim(keyword, text, labels, threads = 6):
    """Train a Word2Vec model on label-tagged tweets and compare the tagged keyword.

    Every tweet is passed through ``prep.tag_keywords`` together with its label,
    the tagged tweets are lemmatized, single-character tokens are dropped, and a
    skip-gram Word2Vec model (``sg = 1``, ``window = 10``) is trained on the
    result. The return value is the model's similarity between
    ``keyword + '_r'`` and ``keyword + '_d'``.

    Parameters
    ----------
    keyword : str
        Keyword to tag; the compared tokens are ``keyword + '_r'`` and
        ``keyword + '_d'``.
        NOTE(review): presumably ``prep.tag_keywords`` produces these tokens
        from the labels -- confirm against TextPrep.
    text : sequence
        Tweets, indexed by position ``0 .. len(text) - 1``.
    labels : sequence
        One label per tweet, indexed the same way as ``text``.
    threads : int, optional
        Thread count forwarded to ``prep.multi_lemmatizer`` (default 6, the
        value that was previously hard-coded).

    Returns
    -------
    The value of ``wv.similarity(keyword + '_r', keyword + '_d')`` from the
    trained model.

    Notes
    -----
    A tweet whose tagging raises is reported and skipped rather than aborting
    the run. Only ``Exception`` subclasses are caught, so an interrupt
    (Ctrl-C / kernel interrupt) still stops the loop.
    """
    keyword_r = keyword + '_r'
    keyword_d = keyword + '_d'

    ptweets = []
    for i in range(len(text)):
        try:
            ptweets.append(prep.tag_keywords(keyword, text[i], labels[i]))
        except Exception as err:
            # best effort: skip this tweet but say why it was dropped
            print('failed at ' + str(i) + ': ' + repr(err))

    # lemmatize
    ptweets = prep.multi_lemmatizer(ptweets, threads = threads)

    # drop single letters
    # (assumes each lemmatized tweet is an iterable of tokens -- TODO confirm)
    ptweets = [[word for word in tweet if len(word) > 1] for tweet in ptweets]

    # train word2vec (skip-gram, context window of 10)
    pmodel = Word2Vec(ptweets, window = 10, sg = 1)

    # similarity between the two tagged variants of the keyword
    psim = pmodel.wv.similarity(keyword_r, keyword_d)
    return psim

In [3]:
# Load the per-user meta data, then build the tweet table in one chain:
#   1. read the aggregated tweets
#   2. keep rows whose 'created' value is >= '2019-11-06' (string comparison)
#      NOTE(review): an earlier comment described this cutoff as "oct 29" --
#      confirm which date is intended.
#   3. inner-join the meta data on 'user_id'
#   4. keep only rows whose party is 'R' or 'D' and renumber the index
meta_data = pd.read_csv('Meta Data/meta_data.csv')
tweet_df = (
    pd.read_csv('Data/aggregated_tweets.csv')
    .loc[lambda df: df['created'] >= '2019-11-06']
    .merge(meta_data, how = 'inner', on = 'user_id')
    .loc[lambda df: df['party'].isin(['R', 'D'])]
    .reset_index(drop = True)
)

# tweet text and party label columns used by the later cells
tweets, plabels = tweet_df['text'], tweet_df['party']

In [4]:
# Build the TextPrep helper from the imported stop-word / keyword tables and
# run its twitter_preprocess over every tweet.
# NOTE: `tweets` is overwritten here, so re-running this cell without
# re-running the loading cell preprocesses already-processed text.
prep = TextPrep(
    stopwords = stop_words,
    key_words = key_words,
    key_synonyms = key_synonyms,
)
tweets = list(map(prep.twitter_preprocess, tweets))

In [5]:
%%time
# run the permutation test
# Each round shuffles the party labels across the tweets, retrains the model
# through cosine_sim, and records the resulting similarity for 'trump'.
N_PERMUTATIONS = 2      # number of label shuffles to run
PERMUTATION_SEED = 0    # fixes the sequence of label shuffles between runs

# Dedicated generator: reproducible shuffles without touching the global
# `random` state. NOTE: cosine_sim does not pass a seed to Word2Vec, so the
# similarity values themselves may still differ slightly from run to run.
perm_rng = random.Random(PERMUTATION_SEED)
label_pool = list(plabels)  # converted once instead of on every iteration

sim = []
for i in range(N_PERMUTATIONS):
    # permute the labels (sampling the whole pool without replacement)
    rlabels = perm_rng.sample(label_pool, len(label_pool))
    # Get cosine similarity
    rcosine = cosine_sim(keyword = 'trump', text = tweets, labels = rlabels)
    # append to list
    sim.append(rcosine)

CPU times: user 1min 56s, sys: 640 ms, total: 1min 57s
Wall time: 1min 27s


In [16]:
# Persist the collected similarities as a one-column CSV (no index column),
# written relative to the current working directory.
result_columns = ['cosine similarity']
simdf = pd.DataFrame(data = sim, columns = result_columns)
simdf.to_csv(path_or_buf = 'test.csv', index = False)