In [44]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math
from glob import glob
import requests
import json

In [68]:
docs = []
en_list = glob('doc/us/*.txt')
for file in en_list:
    with open (file, 'r', encoding='utf-8') as f:
        doc = "\n".join(f.readlines())
        docs.append(doc)

In [24]:
def remove_string_special_characters(s):
    stripped = re.sub('[^\w\s]', '', s)
    stripped = re.sub('_', '', stripped)
    stripped = re.sub('\s+', ' ', stripped)
    stripped = stripped.strip()
    return stripped

In [25]:
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCount)
    return tfDict

def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
        
    return idfDict

def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [30]:
inverted_index = {}
word_counts = []
i = 0
for doc in docs:
    doc = remove_string_special_characters(doc)
    bow = word_tokenize(doc)
    total_terms = len(set(bow))
    
    word_counts.append(total_terms)
    
    for word in bow:
        if word not in inverted_index:
            temp = {}
            temp[i] = {'count': 1}
            inverted_index[word] = temp
        elif i not in inverted_index[word]:
            inverted_index[word][i] = {'count': 1}
        else:
            inverted_index[word][i]['count'] += 1
    
    i += 1
    
for word in inverted_index:
    for docid in inverted_index[word]:
        TF = inverted_index[word][docid]['count'] / word_counts[docid]
        IDF = math.log(len(docs)/len(inverted_index[word]))
        TFIDF  = TF * IDF
        inverted_index[word][docid]['TFIDF'] = TFIDF
        

In [75]:
word_count_dict = {}
count = 0
for doc in docs:
    count += 1
    print(count)
    text = remove_string_special_characters(doc)
    url = "http://localhost:2222/rest/annotate"

    payload = f"confidence=0&text={text}".encode('utf-8')
    headers = {
        'Accept': "application/json",
        'Content-Type': "application/x-www-form-urlencoded"
        }

    res = requests.request("POST", url, data=payload, headers=headers)
    parsed_res = json.loads(res.text)['Resources']
    
    for word in set([r['@surfaceForm'] for r in parsed_res]):
        if word.isdigit(): continue
        w_count = len(re.findall(word, doc))
        if word not in word_count_dict:
            word_count_dict[word] = 0
        word_count_dict[word] += w_count


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [76]:
word_count_dict

{'Rodrick': 1,
 'queer': 1,
 'choice': 21,
 'turn': 26,
 'social ranks': 1,
 'human societies': 1,
 'Sex': 6,
 'protection': 13,
 'statistical dispersion': 1,
 'tax': 63,
 'Diseases': 1,
 'enforcing': 1,
 'prejudice': 2,
 'Social inequality': 5,
 'federal prison': 1,
 'receive': 59,
 'moves': 2,
 'minority category': 1,
 'policies': 24,
 'lead': 47,
 'wage': 54,
 'GDP': 20,
 'documents': 1,
 'meaning': 8,
 'voter fraud': 1,
 'engender': 1,
 'seek': 15,
 'billions': 4,
 'beliefs': 10,
 'hard': 8,
 'openness': 2,
 'imply': 8,
 'administrations': 1,
 'civic': 2,
 'cost': 8,
 'opportunities': 24,
 'high': 220,
 'income': 355,
 'gendered': 8,
 'address': 9,
 'authority': 3,
 'Castes': 1,
 'rising': 16,
 'initial': 2,
 'feminism': 2,
 'natural resources': 2,
 'USD': 6,
 'belonging': 4,
 'direct payments': 1,
 'The leaders': 1,
 'structural': 12,
 'Social Security': 4,
 'economic': 219,
 'incarcerated': 1,
 'produce': 10,
 'Egalitarian': 2,
 'younger': 5,
 'dependent': 7,
 'separations': 1,
 

In [96]:
for word in sorted(word_count_dict, key = lambda x: word_count_dict[x], reverse=True):
    weight = int((word_count_dict[word] + 1)/500 * 30)
    if weight < 4:
        weight = weight * 2
    print(weight, word)

128 al
52 men
40 The
37 equal
29 equality
28 In
27 inequality
21 income
20 age
19 health
17 education
14 social
14 care
13 gender
13 high
13 economic
10 male
10 This
10 women
10 child
9 rate
9 population
9 system
8 student
8 school
8 iv
8 nation
8 time
7 students
7 individual
7 access
7 distribution
7 man
6 level
6 As
6 higher
6 It
6 political
5 countries
5 United States
5 earn
5 global
5 job
5 life
5 data
5 US
5 For
5 American
5 sex
5 fact
5 educational
5 So
5 lack
5 gap
4 result
4 report
4 patient
4 group
4 individuals
4 workers
4 provide
4 wealth
4 top
4 average
4 number
4 difference
4 government
4 ability
4 led
4 experience
4 measure
4 differ
4 factor
4 support
4 rich
4 study
4 America
4 males
4 change
4 based
4 ill
4 We
4 legal
4 But
6 income inequality
6 problem
6 long
6 ratio
6 tax
6 factors
6 work
6 patients
6 There
6 jobs
6 discrimination
6 healthcare
6 grow
6 receive
6 power
6 groups
6 These
6 An
6 house
6 person
6 schools
6 differences
6 Gender
6 Africa
6 wage
6 disparities


0 savior
0 Facebook
0 car
0 kinds
0 democracy
0 step
0 promoting
0 responsible
0 Australia
0 deep
0 reality
0 ont
0 write
0 effects
0 human capital
0 population growth
0 suburban
0 LGAs
0 married couple
0 computation
0 inflation
0 E2
0 communicate
0 measurement
0 E3
0 E4
0 E1
0 planned
0 segments
0 North
0 Thus
0 World Economic Forum
0 industrialize
0 public education
0 speaking
0 Individuals
0 Race
0 interpreter
0 wards
0 incentives
0 visit
0 The United States
0 American Indian
0 Students
0 implement
0 standardized test
0 courses
0 dropout
0 employ
0 fail
0 gender wage gap
0 glass ceiling
0 trait
0 dress
0 nominated
0 chores
0 voting behavior
0 dolls
0 voter turnout
0 inclusiveness
0 Robert Dahl
0 vacancies
0 March
0 billions
0 belonging
0 Social Security
0 cutting
0 Health inequalities
0 Indonesia
0 identical
0 patterns
0 economic development
0 governmental
0 express
0 reaching
0 duration
0 permanent
0 numerous
0 limitations
0 management
0 decision making
0 impacted
0 Scholars
0 orga

0 Steven Yeun
0 set
0 Ring
0 Moon
0 cinema
0 Bong Joon Ho
0 South Korean film
0 East Asian studies
0 Fairness
0 real income
0 absolute poverty
0 propensity
0 subnational
0 Social justice
0 progressive taxation
0 Wealth distribution
0 Capabilities approach
0 Trade
0 safety nets
0 economic mobility
0 Taxes
0 mutual
0 investigated
0 justified
0 prejudices
0 imperfect
0 socialists
0 entrepreneurs
0 A major
0 relevant
0 economic equality
0 Economic development
0 Income inequality metrics
0 Progressive taxation
0 Journal
0 Billionaire
0 Argentina
0 Information technology
0 fellowship
0 Argument
0 Jeff Bezos
0 surging
0 Patrick
0 continental
0 aversion
0 sections
0 Marxist
0 structural adjustment
0 Earned Income Tax Credit
0 Malthusian
0 Kuznets curve
0 real estate
0 wealth gap
0 posits
0 United Nations University
0 IPS
0 Colombia
0 global warming
0 Anthony Giddens
0 defines
0 Marxism
0 Organisation
0 Finance
0 Diamond
0 patent
0 Measurements
0 Capital
0 Thomas Malthus
0 advancement
0 top qui

0 fundraiser
0 onwards
0 Anna
0 sluts
0 normalised
0 instinct
0 incest
0 February
0 abortion referendum
0 centred
0 Ulster
0 hosting
0 Stanton
0 maximum sentence
0 dead babies
0 Identifying
0 venues
0 Irish government
0 prohibitions
0 PR
0 terminations
0 abnormality
0 unwanted pregnancies
0 brainwashed
0 legal cases
0 brainwashing
0 procuring
0 Ponce
0 Sentencing
0 physical illness
0 track
0 hostile environment
0 exhausted
0 Fri
0 pitted
0 officer
0 Graeme
0 work capability assessment
0 Estelle
0 hedge
0 utility
0 Lord
0 headlines
0 bullying
0 neighbourhood
0 upper tribunal
0 fuel
0 headed
0 perfect storm
0 cold
0 iceberg
0 Punishment
0 confidence
0 Civil
0 pound
0 legal system
0 Legal Aid
0 trouble
0 contact
0 sleeping tablets
0 Hong Kong
0 paranoid schizophrenia
0 vandalism
0 benefit cap
0 charity
0 housing crisis
0 stressful
0 bedroom tax
0 talking
0 social anxiety
0 Henry Brooke
0 manager
0 Rights
0 Welsh
0 overturned
0 separation processes
0 sheds
0 victimised
0 credit card
0 stru

0 NIH
0 echelons
0 processed
0 legacy
0 chronic conditions
0 Declaration
0 sex characteristics
0 cognizant
0 lesbians
0 occupational hazards
0 schedule
0 authorities
0 postpone
0 accidents
0 adverse events
0 urinary tract infections
0 gender identity disorder
0 Legal
0 attends
0 cultural competence
0 Fair
0 sterilization
0 herpes
0 unique
0 United States federal law
0 Public Health England
0 Monitoring
0 breast cancer
0 EuroHealthNet
0 Doctors
0 longevity
0 gradient
0 healthcare reform
0 Sierra Leone
0 Middle East
0 hindered
0 inflexibility
0 continuously
0 replaced
0 physical exercise
0 legal obligation
0 hospitalized
0 modify
0 US history
0 modifies
0 Problems
0 Human Services
0 clinical trials
0 anatomical
0 Quality
0 universal service
0 preventive health services
0 Sexuality
0 primary care
0 villages
0 plans
0 diverges
0 In Europe
0 heterosexuality
0 vaccine
0 tuberculosis
0 arises
0 African continent
0 frequency
0 infanticide
0 shortcomings
0 helpful
0 preventive services
0 uncons