In [None]:
import nltk
import re
import multiprocessing as mp
import plotly.express as px

In [1]:
def writeToFile(filename, left_tokenize, right_tokenize, query):
    obj=[query],left_tokenize,right_tokenize

    with open(filename, 'a', encoding='utf8') as fin:
        fin.write(str(obj) + "\n")

In [1]:
# Creates a new file, takes the base concordance file and runs POS tagging.
# This code strips the concordance down to left and right of the query (word concordance query)
def stripConcordance(query):
    stripped_string = "".lower()
    with open('concordance_'+ query + '.txt', 'r', encoding='utf8') as f:
        for line in f.readlines():
            
            # Strips the left and right side of the concordance of unwanted parts.
            left = line[line.find("left_print="):line.find("right_")].strip("left_print=")
            left_re = re.sub('[^A-Za-z0-9 ]+', '', left)
            right = line[line.find("right_print="):line.find("line=")].strip("right_print=")
            right_re = re.sub('[^A-Za-z0-9 ]+', '', right)
            
            # Combines the left and right side of the concordance into a single string.
            stripped_string += left_re
            stripped_string += " "
            stripped_string += right_re
            
            left_tokenize = nltk.pos_tag(nltk.word_tokenize(left_re))
            right_tokenize = nltk.pos_tag(nltk.word_tokenize(right_re))
            writeToFile('POS-concordance-'+ query +'.txt', left_tokenize, right_tokenize, query)
        
        return posTag(stripped_string, query)

In [1]:
# Removes words used to obtain data as well as any words that have snuck into the concordance.
def posStripper(stripped_string):
    stripped_string = stripped_string.replace("conspiracy", "")
    stripped_string = stripped_string.replace("propaganda", "")
    stripped_string = stripped_string.replace("misinformation", "")
    stripped_string = stripped_string.replace("https", "")
    return stripped_string       

In [1]:
def posTag(stripped_string, query):
    # calls function to remove certain words used to obtain data.
    stripped_string = posStripper(stripped_string)
    
    # POS tag the entire concordance.
    token = nltk.pos_tag(nltk.word_tokenize(stripped_string))
    pos_frq = nltk.FreqDist(tag for (word, tag) in token)
    nouns = getNouns(token, query)
    verbs = getVerbs(token, query)

In [1]:
# get all nouns and return the 20 most common
def getNouns(token, query):
    most_common_nouns = []
    for i, j in token:
        if 'NN' in j and len(i) > 4:
            most_common_nouns.append(i)
    noun_frq = nltk.FreqDist(noun for noun in most_common_nouns)
    most_common_nouns = noun_frq.most_common(20)
    print(query + " NOUNS")
    return most_common_nouns

In [1]:
# get all verbs and return the 20 most common
def getVerbs(token, query):
    most_common_verbs = []
    for i, j in token:
        if 'VB' in j and len(i) > 4:
            most_common_verbs.append(i)
    verb_frq = nltk.FreqDist(verb for verb in most_common_verbs)
    most_common_verbs = verb_frq.most_common(20)
    print(query + " VERBS")
    return most_common_verbs

In [148]:
print(stripConcordance("I"))

I NOUNS
[('people', 1509), ('Trump', 811), ('realDonaldTrump', 728), ('media', 690), ('theories', 628), ('theory', 501), ('state', 378), ('virus', 378), ('thing', 371), ('guess', 368), ('China', 335), ('theorist', 313), ('think', 307), ('anything', 296), ('something', 273), ('tweet', 258), ('things', 252), ('someone', 234), ('anyone', 233), ('President', 226)]
I VERBS
[('think', 2290), ('believe', 1341), ('going', 595), ('thought', 534), ('watch', 450), ('being', 410), ('saying', 378), ('agree', 319), ('spreading', 314), ('doing', 308), ('watching', 294), ('understand', 259), ('heard', 236), ('trying', 236), ('getting', 209), ('talking', 196), ('watched', 196), ('remember', 196), ('wonder', 193), ('listen', 192)]
None


In [149]:
print(stripConcordance("me"))

me NOUNS
[('people', 260), ('theories', 172), ('Trump', 167), ('theory', 122), ('theorist', 107), ('media', 106), ('Please', 92), ('realDonaldTrump', 83), ('virus', 83), ('someone', 81), ('thing', 73), ('state', 67), ('please', 61), ('President', 56), ('China', 56), ('something', 55), ('country', 54), ('video', 51), ('anyone', 49), ('anything', 49)]
me VERBS
[('makes', 168), ('believe', 119), ('think', 111), ('telling', 75), ('going', 69), ('being', 65), ('trying', 63), ('spreading', 57), ('saying', 55), ('making', 51), ('blocked', 46), ('talking', 43), ('kidding', 43), ('sending', 41), ('tells', 41), ('doing', 41), ('called', 36), ('spread', 35), ('seems', 35), ('started', 33)]
None


In [150]:
print(stripConcordance("you"))

you NOUNS
[('people', 2339), ('realDonaldTrump', 1667), ('Trump', 1330), ('media', 1076), ('Thank', 953), ('China', 573), ('state', 546), ('virus', 529), ('President', 520), ('truth', 511), ('disinformation', 478), ('anything', 476), ('facts', 460), ('country', 421), ('world', 410), ('nothing', 401), ('something', 398), ('thing', 394), ('theories', 392), ('everything', 374)]
you VERBS
[('think', 1761), ('believe', 1287), ('spreading', 1176), ('doing', 738), ('going', 635), ('being', 589), ('spread', 531), ('saying', 415), ('talking', 405), ('lying', 354), ('trying', 340), ('makes', 318), ('understand', 308), ('watch', 298), ('called', 290), ('listen', 267), ('realDonaldTrump', 266), ('telling', 244), ('getting', 228), ('realize', 218)]
None


In [151]:
print(stripConcordance("we"))

we NOUNS
[('people', 731), ('Trump', 455), ('media', 389), ('realDonaldTrump', 306), ('China', 273), ('country', 232), ('state', 230), ('virus', 225), ('NEVER', 217), ('world', 206), ('EVERYONE', 197), ('DemocratsNOW', 194), ('TWEET', 194), ('REPUBLICAN', 194), ('truth', 188), ('President', 172), ('thing', 166), ('information', 147), ('People', 145), ('facts', 137)]
we VERBS
[('believe', 305), ('think', 292), ('going', 274), ('being', 217), ('doing', 187), ('testing', 169), ('spreading', 147), ('trust', 115), ('listen', 113), ('saying', 110), ('watch', 95), ('fight', 95), ('getting', 93), ('spread', 86), ('forget', 84), ('realDonaldTrump', 84), ('talking', 83), ('continue', 76), ('expect', 75), ('start', 73)]
None


In [152]:
print(stripConcordance("they"))

they NOUNS
[('people', 1211), ('media', 655), ('Trump', 641), ('disinformation', 539), ('China', 430), ('virus', 367), ('realDonaldTrump', 331), ('state', 295), ('world', 283), ('truth', 260), ('anything', 255), ('Twitter', 245), ('nothing', 237), ('everything', 206), ('State', 205), ('money', 205), ('country', 202), ('President', 192), ('ANGEL', 179), ('government', 155)]
they VERBS
[('think', 594), ('believe', 517), ('doing', 388), ('expose', 369), ('going', 318), ('being', 271), ('spread', 251), ('spreading', 243), ('saying', 220), ('trying', 198), ('lying', 159), ('getting', 155), ('called', 136), ('using', 126), ('telling', 119), ('report', 110), ('started', 109), ('watch', 106), ('making', 99), ('talking', 99)]
None


In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=go.Bar(y=[2, 3, 1]))
fig.show()

In [1]:
import plotly;
plotly.__version__ 

'5.1.0'

In [154]:
print(stripConcordance("us"))

us NOUNS
[('China', 431), ('Trump', 303), ('media', 254), ('people', 243), ('virus', 201), ('coronavirus', 151), ('government', 127), ('state', 122), ('world', 120), ('truth', 112), ('State', 100), ('President', 87), ('president', 84), ('Please', 82), ('COVID19', 81), ('realDonaldTrump', 80), ('disinformation', 77), ('economy', 74), ('country', 72), ('theories', 69)]
us VERBS
[('telling', 132), ('going', 126), ('think', 119), ('trying', 112), ('believe', 103), ('trust', 102), ('choose', 90), ('giving', 81), ('doing', 79), ('wants', 70), ('being', 70), ('spreading', 65), ('tells', 58), ('escalates', 55), ('blame', 53), ('killing', 50), ('spread', 49), ('using', 47), ('working', 43), ('saying', 40)]
None


In [None]:
#reference for nltk multi processing
#https://datascience.blog.wzb.eu/2017/06/19/speeding-up-nltk-with-parallel-processing/
corpus = {f_id: nltk.corpus.gutenberg.raw(f_id)
          for f_id in nltk.corpus.gutenberg.fileids()}

def tokenize_and_pos_tag(pair):
    f_id, doc = pair
    return f_id, nltk.pos_tag(nltk.word_tokenize(doc))

if __name__ == '__main__':
    # automatically uses mp.cpu_count() as number of workers
    # mp.cpu_count() is 4 -> use 4 jobs
    with mp.Pool() as pool:
        tokens = pool.map(tokenize_and_pos_tag, corpus.items())