In [5]:
import spacy
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
from collections import Counter
from itertools import repeat
nlp = spacy.load('en_core_web_sm')

data = open('organisations_only.text', 'r').readlines()

# Get words
words_data = []
for l_idx, line in enumerate(data):
    line_words = line.split()
    for w_idx, word in enumerate(line_words):
        w_lower = word.lower()
        word_data = [word, w_lower, l_idx, w_idx]
        words_data.append(word_data)
for wd in words_data[:5]:
    print(wd)
    
print(words_data[-1][2])

['Academia', 'academia', 0, 0]
['Sinica', 'sinica', 0, 1]
['Académie', 'académie', 1, 0]
['de', 'de', 1, 1]
['Marine', 'marine', 1, 2]
1383


In [2]:
# SPACY TAGS

SPACY_TARGETS = [wd[0] for i, wd in enumerate(words_data)]
SPACY_TARGETS_lower = [wd[1] for i, wd in enumerate(words_data)]

spacy_items = []
spacy_entT = []

# Lemma, POS, TAG, Entity 
for doc in nlp.pipe(SPACY_TARGETS_lower, batch_size=50, n_threads=4):
    if doc.is_parsed:
        lemma = [n.lemma_ for n in doc][0]
        pos = [n.pos_ for n in doc][0]
        tag = [n.tag_ for n in doc][0]
        ent = [n.ent_type_ for n in doc][0]
        ent = [n.ent_type_ for n in doc][0]
        spacy_items.append([lemma, pos, tag, ent])
    else:
        spacy_items.append([None, None, None, None])

# Entity correction: passing not lowered down Entites list
for doc in nlp.pipe(SPACY_TARGETS, batch_size=50, n_threads=4):
    if doc.is_parsed:
        entT = [n.ent_type_ for n in doc][0]
        spacy_entT.append(entT)
    else:
        spacy_entT.append(None)
        
w_tok = []
for i in range(len(words_data)):
    word_tokenized = [words_data[i][0], # base_word    # 0
                      words_data[i][1], # w_lower      # 1
                      spacy_items[i][0], # lemma       # 2
                      spacy_items[i][1], # w_pos       # 3
                      spacy_items[i][2], # w_tag       # 4
                      spacy_items[i][3], # w_ent       # 5
                      spacy_entT[i], # w_entT          # 6
                      words_data[i][2], # l_idx        # 7
                      words_data[i][3] # w_idx         # 8
                     ]
    w_tok.append(word_tokenized)
    
# Output:
print("Nr of words:", len(w_tok))
for i in w_tok[:10]:
    print(i[0])

Nr of words: 5591
Academia
Sinica
Académie
de
Marine
Académie
des
Beaux-Arts
Académie
française


In [3]:
wtok = w_tok.copy()
wtok[:10]

[['Academia', 'academia', 'academia', 'NOUN', 'NN', '', '', 0, 0],
 ['Sinica', 'sinica', 'sinica', 'NOUN', 'NN', '', 'GPE', 0, 1],
 ['Académie', 'académie', 'académie', 'NOUN', 'NNS', '', 'GPE', 1, 0],
 ['de', 'de', 'de', 'X', 'FW', '', '', 1, 1],
 ['Marine', 'marine', 'marine', 'ADJ', 'JJ', '', '', 1, 2],
 ['Académie', 'académie', 'académie', 'NOUN', 'NNS', '', 'GPE', 2, 0],
 ['des', 'des', 'des', 'X', 'FW', '', '', 2, 1],
 ['Beaux-Arts', 'beaux-arts', 'beaux', 'ADJ', 'JJ', '', 'ORG', 2, 2],
 ['Académie', 'académie', 'académie', 'NOUN', 'NNS', '', 'GPE', 3, 0],
 ['française', 'française', 'française', 'NOUN', 'NN', '', '', 3, 1]]

In [97]:
# Word counts:
all_w = [w[0] for w in wtok]
all_words = len([w[0] for w in wtok])
uniq_words = len(set([w[0] for w in wtok]))
uniq_lower = len(set([w[1] for w in wtok]))
uniq_lemma = len(set([w[2] for w in wtok]))


# Entities counts
#words_ent = len([w[5] for w in wtok])
#words_entT = len([w[6] for w in wtok])


print("All words: {}".format( all_words))
print("Uniq words orig: {}".format( uniq_words))
print("Uniq words lower: {}".format( uniq_lower))
print("Uniq words lemma: {}".format( uniq_lemma))
#print("ent words: {}".format( words_ent))
#print("entT words: {}".format( words_entT))

All words: 5591
Uniq words orig: 1015
Uniq words lower: 1006
Uniq words lemma: 964


In [90]:
# Most popular words
df = pd.DataFrame(pd.Series(all_w), columns=['text'])
my_words = df['text'].tolist()
count_dict = dict()
for w in my_words:
    count = my_words.count(w)
    count_dict[w] = count
df["counts"] = df['text'].map(count_dict)
df = df.sort_values(by=['counts'], ascending=False)

popular_words = df.drop_duplicates()
popular_words[(popular_words['counts'] > 15)] #['text'].tolist()

Unnamed: 0,text,counts
3218,Society,487
2831,of,414
4729,for,224
2779,Institute,202
3381,Association,179
167,Mathematical,152
4012,International,125
4028,and,122
4269,Royal,103
5023,Studies,90


In [93]:
popular_words_map = {
 'Society': "Gathering",
 'Institute': "Institution",
 'Association': "Gathering",
 'Mathematical': "About_kind",
 'International': "Where_kind",
 'Royal': "Boost_kind",
 'Studies': "Institution",
 'American': "Where_kind",
 'Academy': "Institution",
 'Centre': "Institution",
 'Research': "About_kind",
 'British': "Where_kind",
 'Policy': "About_kind",
 'Sciences': "About_kind",
 'Foundation': "Institution",
 'European': "Where_kind",
 'Psychology': "About_kind",
 'Economic': "About_kind",
 'Science': "About_kind",
 'Center': "Institution",
 'Australian': "Where_kind",
 'Psychological': "About_kind",
 'Affairs': "About_kind",
 'Union': "Gathering",
 'Canadian': "Where_kind",
 'National': "Where_kind",
 'Development': "About_kind",
 'Social': "About_kind",
 'Council': "Gathering",
 'New': "Boost_kind",
 'Public': "About_kind",
 'Strategic': "How_kind",
 'Historical': "About_kind",
 'Applied': "How_kind",
 'Health': "About_kind",
 'Chinese' : "Where_kind",
 'Institution' : "Institution",
 'Bangladesh' : "Where_kind",
 'Political' : "About_kind",
 'Australia' : "Where_kind",
 'German' : "Where_kind",
 'College' : "Institution",
 'South' : "Where_kind",
 'Statistical' : "About_kind",
 'America' : "Where_kind",
 'Mathematics' : "About_kind",
 'Engineering' : "About_kind",
 'Security' : "About_kind",
 'London' : "Where_kind",
 'Finnish' : "Where_kind",
 'Zealand' : "Where_kind",
 'History' : "About_kind",
 'Computer' : "About_kind",
 'Canada' : "Where_kind",
 'Behavior' : "About_kind",
 'Pakistan' : "Where_kind",
 'Arts' : "About_kind",
 'Archaeological' : "About_kind",
 'Swedish' : "Where_kind",
 'Medical' : "About_kind",
 'Institut' : "Institution",
 'Federation' : "Gathering",
 'Foreign' : "About_kind",
 'Asiatic' : "About_kind",
 'Management' : "About_kind",
 'Academia' : "Institution",
 'Japan' : "Where_kind",
 'Australasian' : "Where_kind",
 'Education' : "Institution",
 'Experimental' : "How_kind",
 'Accademia' : "Institution",
 'Indian' : "Where_kind",
 'Law' : "About_kind",
 'China' : "Where_kind",
 'African' : "Where_kind",
 'Economics' : "About_kind",
 'University' : "Institution",
 'Israel' : "Where_kind",
 'Relations' : "About_kind",
 'Edinburgh' : "Where_kind",
 'Forum' : "Gathering",
 'Study' : "Gathering",
 'Académie' : "Institution",
 'Humanities' : "About_kind",
 'Chartered' : "About_kind",
 'Europe' : "Where_kind",
 'Literature' : "About_kind",
 'Technology' : "About_kind",
 'Real' : "Boost_kind",
 'Physics' : "About_kind",
 'Actuaries' : "Gathering",
 'Iranian' : "Where_kind",
 'Psychologists' : "Gathering",
 'Analysis' : "About_kind",
 'Group' : "Gathering",
 'Sri' : "Where_kind",
 'Physical' : "About_kind",
 'Lanka' : "Where_kind",
 'Astronomical' : "About_kind",
 'Nacional' : "Where_kind",
 'Meteorological' : "About_kind",
 'Chemical' : "About_kind",
 'Classical' : "About_kind",
 'North' : "Where_kind",
 'Legal' : "Boost_kind",
 'Spanish' : "Where_kind",
 'Service' : "How_kind",
 'Democracy' : "About_kind",
 'Anthropological' : "About_kind",
 'Korea' : "Where_kind",
 'Belgian' : "Where_kind",
 'Club' : "Gathering",
 'Behavioral' : "About_kind",
 'Africa' : "Where_kind",
 'United' : "Boost_kind",
 'Psychiatry' : "About_kind",
 'Argentine' : "Where_kind",
 'Instituto' : "Institution",
 'Industrial' : "About_kind",
 'Associations' : "Gathering",
 'Brazilian' : "Where_kind",
 'Austrian' : "Where_kind",
 'Information' : "About_kind",
 'Regional' : "Where_kind",
 'Informatics' : "About_kind",
 'Jerusalem' : "Where_kind",
 'Mathematicians' : "About_kind",
 'Psychoanalytic' : "About_kind",
 'Sustainable' : "About_kind",
 'Child' : "About_kind",
 'Argentina' : "Where_kind",
 'Peace' : "About_kind",
 'Human' : "About_kind",
 'Geological' : "About_kind",
 'School' : "Institution",
 'Intelligence' : "About_kind",
 'Western' : "Where_kind",
 'Engineers' : "About_kind",
 'Genetics' : "About_kind",
 'Gakkai' : "Where_kind",
 'Forensic' : "About_kind",
 'Mental' : "About_kind",
 'Moscow' : "Where_kind",
 'Crisis' : "About_kind",
 'Philosophy' : "About_kind",
 'Polish' : "Where_kind",
 'Norwegian' : "Where_kind",
 'Community' : "Gathering",
 'Czech' : "Where_kind",
 'Asian' : "Where_kind",
 'Nigerian' : "Where_kind",
 'Hellenic' : "About_kind",
 'Advanced' : "Boost_kind",
 'French' : "Where_kind",
 'Medicine' : "About_kind",
 'Atlantic' : "Where_kind",
 'Singapore' : "Where_kind",
 'Market' : "About_kind",
 'Defence' : "About_kind",
 'Scientific' : "Boost_kind",
 'Innovation' : "Boost_kind",
 'Advancement' : "Boost_kind",
 'Scotland' : "Where_kind",
 'Heraldry' : "About_kind",
 'World' : "Where_kind",
 'India' : "Where_kind",
 'Art' : "About_kind",
 'Historians' : "About_kind",
 'Albanian' : "Where_kind",
 'Britain' : "Where_kind",
 'Philosophical' : "About_kind",
 'Physicians' : "Gathering",
 'Communication' : "About_kind",
 'Tecnología' : "About_kind",
 'Antiquaries' : "Institution",
 'Societies' : "Gathering",
 'Occupational' : "Gathering",
 'Latvian' : "Where_kind",
 'Branch' : "Institution",
 'Brasileira' : "Where_kind",
 'Geneva' : "Where_kind",
 'Future' : "Boost_kind",
 'Scottish' : "Where_kind",
 'Linguistics' : "About_kind",
 'Letters' : "About_kind",
 'Justice' : "About_kind",
 'Matemàtiques' : "About_kind",
 'Sciences,' : "About_kind",
 'State' : "Institution",
 'Professional' : "How_kind",
 'Numismatic' : "About_kind",
 'Eating' : "About_kind",
 'Ecological' : "About_kind",
 'Societat' : "About_kind",
 'Politik' : "About_kind",
 'Government' : "Institution",
 'Global' : "Where_kind",
 'Behavioural' : "How_kind",
 'France' : "Where_kind",
 'Tunisian' : "Where_kind",
 'Georgian' : "Where_kind",
 'Actuarial' : "Institution",
 'Hong' : "Where_kind",
 'Kong' : "Where_kind",
 'Clinical' : "About_kind",
 'Disorders' : "About_kind",
 'Great' : "Boost_kind",
 'Ergonomics' : "About_kind",
 'Transpersonal' : "About_kind",
 'Factors' : "About_kind",
 'Enterprise' : "Institution",
 'Ireland' : "Where_kind",
 'Comparative' : "About_kind",
 'Renaissance' : "About_kind",
 'Brain' : "About_kind",
 'Philological' : "About_kind",
 'Literary' : "About_kind",
 'Applications' : "How_kind",
 'Queensland' : "Where_kind",
 'Learned' : "Boost_kind",
 'Music' : "About_kind",
 'Copenhagen' : "Where_kind",
 'Fundación' : "Institution",
 'Board' : "Institution",
 'Italian' : "Where_kind",
 'Conference' : "Gathering",
 'Romanian' : "Where_kind",
 'Medieval' : "About_kind",
 'Adolescent' : "About_kind",
 'Marine' : "About_kind",
 'Wales' : "Where_kind",
 'Geographical' : "About_kind",
 'Psychiatric' : "About_kind",
 'Economy' : "About_kind",
 'East' : "Where_kind",
 'Taiwan' : "Where_kind",
 'Free' : "Boost_kind",
 'West' : "Where_kind",
 'Georgia' : "Where_kind",
 'Hungarian' : "Where_kind",
 'Netherlands' : "Where_kind",
 'Behaviour' : "About_kind",
}

for k, v in Counter(popular_words_map.values()).items():
    print(k,v)

# Append frequent word category as a last position of wtok word-info list
wtok_cat = wtok.copy()
for i, line in enumerate(wtok):
    word = line[0]
    if word in popular_words_map:
        wtok_cat[i].append(popular_words_map[word])

        else:
        wtok_cat[i].append("Untagged")

Gathering 17
Institution 24
About_kind 101
Where_kind 80
Boost_kind 13
How_kind 7


In [155]:
# Switch categorised words to category names
no_switch = dict()
part_switch = dict() # tag / word
full_switch = dict() # tag / Untagged token 
for i, line in enumerate(wtok[:]):
    
    sent_nr = wtok_cat[i][7]
    if sent_nr != wtok_cat[i-1][7]:
        no_switch[sent_nr] = []
        part_switch[sent_nr] = []
        full_switch[sent_nr] = []
        
    wrd = wtok_cat[i][0]
    hldr = wtok_cat[i][9]
    if hldr != 'Untagged':
        no_switch[sent_nr].append(wrd)
        part_switch[sent_nr].append(hldr)
        full_switch[sent_nr].append(hldr)
    else:
        no_switch[sent_nr].append(wrd)
        part_switch[sent_nr].append(wrd)
        full_switch[sent_nr].append(hldr)

sent_dicts = [no_switch, part_switch, full_switch]
for sent_dict in sent_dicts:
    for k,v in sent_dict.items():
        sent_dict[k] = " ".join(v)
    
print("1. Original words:")
for k,v in no_switch.items():
    print( no_switch[k])
    if int(k) > 10: print(" "); break

print("2. Switching words to predefined categories names:")
for k,v in part_switch.items():
    print( part_switch[k])
    if int(k) > 10: print(" "); break

print("3. Switching all words to category/untagged:")
for k,v in full_switch.items():
    print( full_switch[k])
    if int(k) > 10: break

1. Original words:
Academia Sinica
Académie de Marine
Académie des Beaux-Arts
Académie française
Accademia degli Arcadi
Accademia dei Lincei
Accademia della Crusca
Actuaries Institute
American Antiquarian Society
American Bar Association
American Chemical Society
American Classical League
 
2. Switching words to predefined categories names:
Institution Sinica
Institution de About_kind
Institution des Beaux-Arts
Institution française
Institution degli Arcadi
Institution dei Lincei
Institution della Crusca
Gathering Institution
Where_kind Antiquarian Gathering
Where_kind Bar Gathering
Where_kind About_kind Gathering
Where_kind About_kind League
 
3. Switching all words to category/untagged:
Institution Untagged
Institution Untagged About_kind
Institution Untagged Untagged
Institution Untagged
Institution Untagged Untagged
Institution Untagged Untagged
Institution Untagged Untagged
Gathering Institution
Where_kind Untagged Gathering
Where_kind Untagged Gathering
Where_kind About_kind Gath

In [156]:
df = pd.DataFrame(columns=['a','b','c'])
df['a'] = list(no_switch.values())
df['b'] = list(part_switch.values())
df['c'] = list(full_switch.values())
df

Unnamed: 0,a,b,c
0,Academia Sinica,Institution Sinica,Institution Untagged
1,Académie de Marine,Institution de About_kind,Institution Untagged About_kind
2,Académie des Beaux-Arts,Institution des Beaux-Arts,Institution Untagged Untagged
3,Académie française,Institution française,Institution Untagged
4,Accademia degli Arcadi,Institution degli Arcadi,Institution Untagged Untagged
5,Accademia dei Lincei,Institution dei Lincei,Institution Untagged Untagged
6,Accademia della Crusca,Institution della Crusca,Institution Untagged Untagged
7,Actuaries Institute,Gathering Institution,Gathering Institution
8,American Antiquarian Society,Where_kind Antiquarian Gathering,Where_kind Untagged Gathering
9,American Bar Association,Where_kind Bar Gathering,Where_kind Untagged Gathering


In [157]:
df.to_csv('topic_tagged_orgs.csv')

In [6]:
import pandas as pd
x = pd.read_csv('topic_tagged_orgs.csv')
x

Unnamed: 0.1,Unnamed: 0,a,b,c
0,0,Academia Sinica,Institution Sinica,Institution Untagged
1,1,Académie de Marine,Institution de About_kind,Institution Untagged About_kind
2,2,Académie des Beaux-Arts,Institution des Beaux-Arts,Institution Untagged Untagged
3,3,Académie française,Institution française,Institution Untagged
4,4,Accademia degli Arcadi,Institution degli Arcadi,Institution Untagged Untagged
5,5,Accademia dei Lincei,Institution dei Lincei,Institution Untagged Untagged
6,6,Accademia della Crusca,Institution della Crusca,Institution Untagged Untagged
7,7,Actuaries Institute,Gathering Institution,Gathering Institution
8,8,American Antiquarian Society,Where_kind Antiquarian Gathering,Where_kind Untagged Gathering
9,9,American Bar Association,Where_kind Bar Gathering,Where_kind Untagged Gathering
