In [10]:
import GetDependencyParse as GP
# Word2vec
import gensim
# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

import numpy as np
import pandas as pd
import re
import os

In [15]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [13]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [109]:
#stemmer = WordNetLemmatizer()

def text_process(text):
    # Remove all the special characters
    #document = re.sub(r'\W', ' ', str(text))
    document = re.sub(r'https?:\S+|http?:\S', '', str(text))
    # remove all single characters
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    #document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # remove first space
    document = re.sub(r'^\s+', '', document)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # change covid 19 to covid19
    document = re.sub(r'covid(\s+|\-|\_)19', 'covid19', document)
    # change covid to covid19
    document = re.sub(r'covid(\s+|\W|$)', 'covid19', document)
    return(document)

In [111]:
text_process("covid 19 and covid-19 and covid_19 and covid?")

'covid19 and covid19 and covid19 and covid19'

In [39]:
# read dataset
df = pd.read_csv("data/sample4metaphor_jan24-may25_text_filtered.csv", encoding = "ISO-8859-1")

In [40]:
df.text2 = df.text.apply(lambda x: preprocess(x))

In [92]:
df.text3 = df.text.apply(lambda x: text_process(x))

## Word2Vec

In [41]:
%%time
documents = [_text.split() for _text in df.text2] 

CPU times: user 1.9 ms, sys: 0 ns, total: 1.9 ms
Wall time: 1.91 ms


In [42]:
w2v_model = gensim.models.word2vec.Word2Vec(size=200, 
                                            window=5, 
                                            min_count=2, 
                                            workers=8)

In [43]:
w2v_model.build_vocab(documents)
words = w2v_model.wv.vocab.keys()
#print(len(w2v_model.wv["u"]))
#print(list(w2v_model.wv.vocab.items())[0:5])
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 2376


In [44]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=32)

CPU times: user 770 ms, sys: 24.1 ms, total: 795 ms
Wall time: 513 ms


(423345, 615456)

In [45]:
w2v_model.most_similar("covid19")

[('distance', 0.9995676279067993),
 ('amazing', 0.9995567798614502),
 ('everything', 0.9995455145835876),
 ('much', 0.9995318055152893),
 ('listen', 0.9994614124298096),
 ('taken', 0.9994546175003052),
 ('follow', 0.9994485974311829),
 ('near', 0.9994305372238159),
 ('nice', 0.9994035959243774),
 ('scared', 0.9994033575057983)]

## Noun Pair Similarity

In [93]:
df.text3

0       powerful #perspectives shared here: "diary of ...
1       markets bet fed is pushed to cut rates in coro...
2       georgia postponing march 24 presidential prima...
3       don't ask me: just caught a glimpse of richard...
4       icheoku says if regular flu kills about 60,000...
                              ...                        
1240    "we have the best [insert industry/thing] in t...
1241    we are still processing building projects/perm...
1242    in four u.s. state prisons, nearly 3,300 inmat...
1243    i really hate to do this but i just lost one o...
1244    @kfile i just hope trump gets coronavirus and ...
Name: text, Length: 1245, dtype: object

In [96]:
file1 = open("data/sample_text.txt","w") 
for t in df.text3:
    #t + "\n"
    file1.writelines(t+"\n")
    #file1.write("\n")
file1.close()

In [114]:
nlist = []
for sen in df.text3:
    print(sen)
    try:
        __, nouns = GP.dependency_parse(sen)
        nlist.append(nouns)
    except IndexError:
        nlist.append(["error"])

 f496))
                        (-RRB- >)))))))))))
    (NP (PRP i))
    (VP (VBP hope)
      (NP (PRP you))
      (SBAR (IN like)
        (S
          (NP (PRP it))
          (VP (VBP @michaelbuble)
            (NP (NN @lulopilato))))))))
---
well detroit made national news as the new hot spot in the country #coronavirus #21daylockdown
(ROOT
  (S
    (INTJ (UH well))
    (NP (NN detroit))
    (VP (VBD made)
      (NP (JJ national) (NN news))
      (SBAR (IN as)
        (S
          (NP
            (NP (DT the) (JJ new) (JJ hot) (NN spot))
            (PP (IN in)
              (NP (DT the) (NN country))))
          (VP (VBZ #coronavirus)
            (NP (NN #) (NNS 21daylockdown))))))))
---
@hrddonna @meghanmccain @rodney_is_ouat3 does the president who downplayed coronavirus for two months really deserve your vote?
(ROOT
  (S
    (NP (NN @hrddonna) (NN @meghanmccain) (NNS @rodney_is_ouat3))
    (VP (VBZ does)
      (NP
        (NP (DT the) (NN president))
        (SBAR
          (WHNP

In [121]:
filepath = "data/sample_text.txt"
parser_output = os.popen("stanford-parser-full-2020-11-17" + '/lexparser.sh "' + filepath + '"').read()
parser_output = parser_output.split("\n\n")

NameError: name 'folder' is not defined

In [120]:
os.popen("stanford-parser-full-2020-11-17" + '/lexparser.sh "' + "data/sample_text.txt" + '"').read()

''

In [118]:
len(parser_output)
print(parser_output[0])




In [115]:
nlist

],
 ['error'],
 ['savage', 'coronavirus', 'f', 'n'],
 ['error'],
 ['error'],
 ['sec', 'mnuchin', 'stimulus', 'checks', 'weeks'],
 ['coronavirus', 'mail', 'online', 'customs', 'passengers', 'spots'],
 ['face', 'coronavirus', 't', 'trump'],
 ['square',
  'district',
  'station',
  'midtown',
  'face',
  'times',
  'oculus',
  'masks'],
 ['matt', 'gaetz', 'gas', 'mask', 'coronavirus', 'concerns'],
 ['coronavirus', 'tests'],
 ['cdc', 'race', 'ethnicity', 'caucuses', 'data', 'cases'],
 ['ca', 'college', 'access', 'fgli', 'professionals', 'students'],
 ['error'],
 ['hope', 'hydroxychloroquine', 'covid', 'â', 'consequences'],
 ['error'],
 ['bill', 'plan', 'round', 'event', 'gates'],
 [],
 ['covid', 'opportunity', 'flowbee'],
 ['k', 'nevada', '6th', 'schools', 'concerns'],
 ['coronavirus', 'state', 'conspiracy'],
 ['irony'],
 ['moments'],
 ['cuomo'],
 ['error'],
 ['brian', 'kemp', 'coronavirus', 'georgia', 'cases'],
 ['volunteer', 'link', 'opportunities'],
 [],
 ['winter', 'md', 'symptoms'],
 