In [1]:
# Auto-reload modules when accessing them:
%load_ext autoreload
%autoreload 2 

# Echo all output:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import own functions for image analysis:
import analysis_funs as va

In [2]:
data = va.load_data('region_descriptions.json', prefix = '1.2/VG/1.2/')
assert len(data) == 108077

Loading pickles/region_descriptions.json.marshal...


In [3]:
# POS tagger directory: stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger
from nltk.tag import StanfordPOSTagger
st = StanfordPOSTagger('stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',
                      path_to_jar='stanford-postagger-2017-06-09/stanford-postagger.jar')

In [4]:
st.tag('What is the airspeed of an unladen swallow ?'.split())

[('What', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('airspeed', 'NN'),
 ('of', 'IN'),
 ('an', 'DT'),
 ('unladen', 'JJ'),
 ('swallow', 'VB'),
 ('?', '.')]

In [18]:
# for each region look at 'phrase' - it contains the region description
# create function description to synsets
# run it through the whole file and create a new json with output


# Create a map between penn treebank and WordNet 
from nltk.corpus import wordnet as wn

# WordNet POS tags are: NOUN = 'n', ADJ = 's', VERB = 'v', ADV = 'r', ADJ_SAT = 'a'
tag_map = {
            'CC':None, # coordin. conjunction (and, but, or)  
            'CD':wn.NOUN, # cardinal number (one, two)             
            'DT':None, # determiner (a, the)                    
            'EX':wn.ADV, # existential ‘there’ (there)           
            'FW':None, # foreign word (mea culpa)             
            'IN':wn.ADV, # preposition/sub-conj (of, in, by)   
            'JJ':[wn.ADJ, wn.ADJ_SAT], # adjective (yellow)                  
            'JJR':[wn.ADJ, wn.ADJ_SAT], # adj., comparative (bigger)          
            'JJS':[wn.ADJ, wn.ADJ_SAT], # adj., superlative (wildest)           
            'LS':None, # list item marker (1, 2, One)          
            'MD':None, # modal (can, should)                    
            'NN':wn.NOUN, # noun, sing. or mass (llama)          
            'NNS':wn.NOUN, # noun, plural (llamas)                  
            'NNP':wn.NOUN, # proper noun, sing. (IBM)              
            'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
            'PDT':[wn.ADJ, wn.ADJ_SAT], # predeterminer (all, both)            
            'POS':None, # possessive ending (’s )               
            'PRP':None, # personal pronoun (I, you, he)     
            'PRP$':None, # possessive pronoun (your, one’s)    
            'RB':wn.ADV, # adverb (quickly, never)            
            'RBR':wn.ADV, # adverb, comparative (faster)        
            'RBS':wn.ADV, # adverb, superlative (fastest)     
            'RP':[wn.ADJ, wn.ADJ_SAT], # particle (up, off)
            'SYM':None, # symbol (+,%, &)
            'TO':None, # “to” (to)
            'UH':None, # interjection (ah, oops)
            'VB':wn.VERB, # verb base form (eat)
            'VBD':wn.VERB, # verb past tense (ate)
            'VBG':wn.VERB, # verb gerund (eating)
            'VBN':wn.VERB, # verb past participle (eaten)
            'VBP':wn.VERB, # verb non-3sg pres (eat)
            'VBZ':wn.VERB, # verb 3sg pres (eats)
            'WDT':None, # wh-determiner (which, that)
            'WP':None, # wh-pronoun (what, who)
            'WP$':None, # possessive (wh- whose)
            'WRB':None, # wh-adverb (how, where)
            '$':None, #  dollar sign ($)
            '#':None, # pound sign (#)
            '“':None, # left quote (‘ or “)
            '”':None, # right quote (’ or ”)
            '(':None, # left parenthesis ([, (, {, <)
            ')':None, # right parenthesis (], ), }, >)
            ',':None, # comma (,)
            '.':None, # sentence-final punc (. ! ?)
            ':':None # mid-sentence punc (: ; ... – -)
        }

In [57]:
phrase1 = 'A man wearing black clothes'
phrase2 = 'the side walk is  made of bricks'
phrase3 = 'part of the road is red marked with white stripes'
phrase4 = 'there are green trees'
phrase5 = 'What is the airspeed of an unladen swallow ?'
def phrase_to_synset(phrase, debug = False):
    phrase_tagged = st.tag(phrase.split())
    if debug: print(phrase_tagged)
    syns = [likeliest_synset(t[0], tag_map[t[1]]) for t in phrase_tagged if tag_map[t[1]]]
    syns = [s for s in syns if s]
    return syns
    
def max_dict(d):
    if d: return max(d, key=lambda key: d[key])
    else: return None

def likeliest_synset(word, tags, debug = False):
    """word: string
    tags: list of WordNet tags, can be one of:
        NOUN = 'n', ADJ = 's', VERB = 'v', ADV = 'r', ADJ_SAT = 'a'"""
    synsets = wn.synsets(word, tags)
    
    synset_freqs = {}
    
    for synset in synsets:
        freq = 0  
        for lemma in synset.lemmas():
            freq += lemma.count()
            synset_freqs[synset.name()] = freq
            
    if debug: print(synset_freqs)
    
    return max_dict(synset_freqs)
    
#likeliest_synset('yellow', 's')
phrase_to_synset(phrase1)
phrase_to_synset(phrase2)
phrase_to_synset(phrase3)
phrase_to_synset(phrase4)
phrase_to_synset(phrase5)

['man.n.01', 'wear.v.01', 'black.a.01', 'apparel.n.01']

['side.n.01', 'base_on_balls.n.01', 'be.v.01', 'make.v.01', 'brick.n.01']

['part.n.01',
 'road.n.01',
 'be.v.01',
 'red.s.01',
 'marked.s.01',
 'white.a.01',
 'band.n.04']

['there.r.01', 'be.v.01', 'green.s.01', 'tree.n.01']

['be.v.01', 'airspeed.n.01', 'swallow.v.01']

In [34]:
phrase_tagged = st.tag(phrase1.split())

In [36]:
[a for a in phrase_tagged]

[('A', 'DT'),
 ('man', 'NN'),
 ('wearing', 'VBG'),
 ('black', 'JJ'),
 ('clothes', 'NNS')]

In [44]:
wn.synsets('apparel', pos=[wn.NOUN])

[Synset('apparel.n.01')]

In [13]:
wn.synsets('green')

[Synset('green.n.01'),
 Synset('park.n.02'),
 Synset('green.n.03'),
 Synset('green.n.04'),
 Synset('green.n.05'),
 Synset('green.n.06'),
 Synset('greens.n.01'),
 Synset('k.n.07'),
 Synset('green.v.01'),
 Synset('green.s.01'),
 Synset('green.a.02'),
 Synset('green.a.03'),
 Synset('green.s.04'),
 Synset('fleeceable.s.01')]

In [None]:
# s  =  adjective satellite
# r  =  adverb

In [66]:
wn.synset('off.s.02').definition()

'below a satisfactory level'

In [25]:
a = wn.synset('off.s.02')
a.name()

'off.s.02'

In [48]:
wn.ADJ_SAT

's'

In [7]:
tag_map

{'CC': None,
 'CD': 'n',
 'DT': None,
 'EX': 'r',
 'FW': None,
 'IN': 'r',
 'JJ': ['a', 's'],
 'JJR': ['a', 's'],
 'JJS': ['a', 's'],
 'LS': None,
 'MD': None,
 'NN': 'n',
 'NNS': 'n',
 'NNP': 'n',
 'NNPS': 'n',
 'PDT': ['a', 's'],
 'POS': None,
 'PRP': None,
 'PRP$': None,
 'RB': 'r',
 'RBR': 'r',
 'RBS': 'r',
 'RP': ['a', 's'],
 'SYM': None,
 'TO': None,
 'UH': None,
 'VB': 'v',
 'VBD': 'v',
 'VBG': 'v',
 'VBN': 'v',
 'VBP': 'v',
 'VBZ': 'v',
 'WDT': None,
 'WP': None,
 'WP$': None,
 'WRB': None,
 '$': None,
 '#': None,
 '“': None,
 '”': None,
 '(': None,
 ')': None,
 ',': None,
 '.': None,
 ':': None}