In [1]:
import sys
sys.path.append('../')

import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 
import random
import imp
from modules import data_cleaning

%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2

In [2]:
import os
os.getcwd() 

'/Users/jmankewitz/Projects/sense-aoa/setup'

In [3]:
# Get CDI
raw_cdi = pd.read_csv("../data/WSWG_50percentproducing.csv")
cdi_items = raw_cdi
# Fix Uni_Lemmas
import re
cdi_items['fixed_lemma'] = cdi_items['uni_lemma'].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))
cdi_items['fixed_lemma'] = cdi_items['fixed_lemma'].apply(lambda x: x.strip())
cdi_items['fixed_lemma']=cdi_items['fixed_lemma'].apply(lambda x: x.strip())

cdi_items.head(3)

Unnamed: 0,id,word,language,form,type,category,lexical_category,lexical_class,uni_lemma,aoa_in_seconds,aoa,fixed_lemma
0,0,daddy (child's word),English (American),WG,word,people,other,other,daddy,29808000.0,345,daddy
1,1,mommy (child's word),English (American),WG,word,people,other,other,mommy,30240000.0,350,mommy
2,2,bye,English (American),WG,word,games_routines,other,other,bye,35164800.0,407,bye


In [4]:
raw_cdi[cdi_items['word'].str.contains("vanilla")]

Unnamed: 0,id,word,language,form,type,category,lexical_category,lexical_class,uni_lemma,aoa_in_seconds,aoa,fixed_lemma


In [5]:
set(cdi_items.lexical_class)

{'adjectives', 'function_words', 'nouns', 'other', 'verbs'}

In [6]:
# lemmatize the fixed_lemmas just in case (looking @ you, "beads")
# Default to no pos specified if lexical_category is not retreivable
cdi_wordnet_pos_map = {"nouns": wn.NOUN,
                       "verbs": wn.VERB,
                       "adjectives": wn.ADJ}

lemmatizer = WordNetLemmatizer()

def lemmatizeLemma(lemma, proposed_pos):
    if proposed_pos in list(cdi_wordnet_pos_map.keys()):
        return lemmatizer.lemmatize(lemma, cdi_wordnet_pos_map[proposed_pos])
    else:
        return lemmatizer.lemmatize(lemma)

# fix glasses -> glass -> glassses, check for any more like this

cdi_items['lemmatized_lemma'] = cdi_items.apply(lambda row: lemmatizeLemma(row.fixed_lemma, row.lexical_class), axis=1)

#actually, this seems to cause more problems than it solves, so we'll use the fixed_lemma to match words

In [7]:
cdi_items.groupby(by=['fixed_lemma'])['lexical_category'].agg('nunique').reset_index().sort_values(by='fixed_lemma')

Unnamed: 0,fixed_lemma,lexical_category
0,a,1
1,a lot,1
2,after,1
3,airplane,1
4,all,1
...,...,...
618,yucky,1
619,yum yum,1
620,zebra,1
621,zipper,1


In [8]:
cdi_items[cdi_items['fixed_lemma']=='water']

Unnamed: 0,id,word,language,form,type,category,lexical_category,lexical_class,uni_lemma,aoa_in_seconds,aoa,fixed_lemma,lemmatized_lemma
55,55,water (beverage),English (American),WS,word,food_drink,nouns,nouns,water,49161600.0,569,water,water
61,61,water (not beverage),English (American),WS,word,outside,nouns,nouns,water (not beverage),50544000.0,585,water,water


In [9]:
restricted_pos_cdi = cdi_items[cdi_items['lexical_class'].isin(list(cdi_wordnet_pos_map.keys()))]
print(len(set(restricted_pos_cdi.fixed_lemma)),"words")

455 words


In [10]:
lemmatizer.lemmatize("glasses")
[sense.name() for sense in wn.synsets("glass", 'n')]

['glass.n.01',
 'glass.n.02',
 'glass.n.03',
 'field_glass.n.01',
 'methamphetamine.n.01',
 'looking_glass.n.01',
 'glass.n.07']

In [11]:
#first get all the senses for each word in the cdi
def getSensesFromLemma(lemma, pos):
    return [sense.name() for sense in wn.synsets(lemma, pos)]

cdi_wordnet_df = pd.DataFrame({'lemma':[], 'sense_name':[]})
#restrict to just nouns, verbs, adjectives
for word in restricted_pos_cdi.fixed_lemma:
    #get senses
    pos = cdi_wordnet_pos_map[list(restricted_pos_cdi[restricted_pos_cdi['fixed_lemma']==word].lexical_class)[0]]
    senses = getSensesFromLemma(word,pos)
    word_wordnet_df = pd.DataFrame({'sense_name':senses})
    word_wordnet_df['lemma'] = word
    cdi_wordnet_df = pd.concat([cdi_wordnet_df, word_wordnet_df])
    
cdi_wordnet_df = cdi_wordnet_df.reset_index(drop=True)
#then get the definitions and examples

In [12]:
cdi_wordnet_df[cdi_wordnet_df['lemma']=='glasses']

Unnamed: 0,lemma,sense_name
855,glasses,spectacles.n.01
856,glasses,glass.n.01
857,glasses,glass.n.02
858,glasses,glass.n.03
859,glasses,field_glass.n.01
860,glasses,methamphetamine.n.01
861,glasses,looking_glass.n.01
862,glasses,glass.n.07


In [13]:
cdi_wordnet_df['definition'] = cdi_wordnet_df.sense_name.apply(lambda sense_name: wn.synset(sense_name).definition())

In [14]:
cdi_wordnet_df['num_wn_examples'] = cdi_wordnet_df.sense_name.apply(lambda sense_name: len(wn.synset(sense_name).examples()))

# Cant default to just the first one, should default to an example that contains the word
def getWNExample(lemma, sense_name):
    possible_examples = wn.synset(sense_name).examples()
    acceptable_examples = [example for example in possible_examples if lemma in example]
    if len(acceptable_examples) == 0:
        return "no_example"
    else:
        return acceptable_examples[0]
    
cdi_wordnet_df['wn_example'] = cdi_wordnet_df.apply(lambda row: getWNExample(row.lemma, row.sense_name), axis=1)

In [15]:
print(len(cdi_wordnet_df[cdi_wordnet_df['wn_example']!='no_example']), "senses with acceptable examples")
print(len(cdi_wordnet_df[cdi_wordnet_df['wn_example']!='no_example'])/len(cdi_wordnet_df), "%")
print(len(cdi_wordnet_df[cdi_wordnet_df['wn_example']=='no_example']), "senses without examples")
print(len(cdi_wordnet_df[cdi_wordnet_df['wn_example']=='no_example'])/len(cdi_wordnet_df), "%")

1567 senses with acceptable examples
0.5276094276094276 %
1403 senses without examples
0.4723905723905724 %


In [16]:
no_wn_example = cdi_wordnet_df[cdi_wordnet_df['wn_example']=='no_example']

cdi_wordnet_df[cdi_wordnet_df['wn_example']=='no_example']

#next pass is to pull from wordsense project or semcor for examples

Unnamed: 0,lemma,sense_name,definition,num_wn_examples,wn_example
6,ball,ball.n.07,United States comedienne best known as the sta...,0,no_example
8,ball,ball.n.09,a lavish dance requiring formal attire,0,no_example
10,ball,ball.n.11,the game of baseball,0,no_example
16,dog,frank.n.02,a smooth-textured sausage of minced beef or po...,0,no_example
17,dog,pawl.n.01,a hinged catch that fits into a notch of a rat...,0,no_example
...,...,...,...,...,...
2962,stone,stone.n.08,United States filmmaker (born in 1946),0,no_example
2963,stone,stone.n.09,United States feminist and suffragist (1818-1893),0,no_example
2964,stone,stone.n.10,United States journalist who advocated liberal...,0,no_example
2965,stone,stone.n.11,United States jurist who served on the United ...,0,no_example


## Semcor examples

In [17]:
import nltk
from nltk.corpus import semcor
nltk.download('semcor')

[nltk_data] Downloading package semcor to
[nltk_data]     /Users/jmankewitz/nltk_data...
[nltk_data]   Package semcor is already up-to-date!


True

In [19]:
def get_pos(lem, delim = '.'):
    """
    Inputs:
    lem- WordNet lemma
    delim- delimiting character (. or ,)
    Output:
    Part of speech for lem (n, v, s)
    """
    lem = str(lem)
    try:
        return lem.split(delim)[1]
    except:
        return 'No POS'

def get_sense_num(lem, delim = '.'):
    """
    DEPRECATED?
    Inputs: 
    lem- WordNet lemma
    delim- delimiting character (. or ,)
    Output:
    Number of sense in WordNet
    """
    lem = str(lem)
    try:
        return lem.split(delim)[2]
    except:
        return "No marked sense"

def get_name(lem, delim = '.'):
    """
    Inputs:
    lem- WordNet lemma
    delim- delimiting character (. or ,)
    Output:
    Name of sense in WordNet
    """
    if type(lem) != str:
        return lem.name()
    else:
        return lem.split(delim)[0]

In [28]:
tagged_sents = nltk.corpus.semcor.tagged_sents(tag = 'sem')[0:37176]
raw_sents = nltk.corpus.semcor.sents()[0:37176]

In [66]:
#restrict to sentences with the target lemmas
target_lemmas = set(no_wn_example.lemma)
target_tagged_sents = []
target_untagged_sents = []
for i in tqdm(np.arange(0, len(tagged_sents))):
    poss_sent = raw_sents[i]
    if not target_lemmas.isdisjoint(set(poss_sent)) and len(poss_sent)<15:
        target_tagged_sents.append(tagged_sents[i])
        target_untagged_sents.append(raw_sents[i])

HBox(children=(FloatProgress(value=0.0, max=37176.0), HTML(value='')))




In [87]:
# for each tagged sentence in the reduced sentence list, check for a sense from my sense list

semcor_wn_senses = []
semcor_wn_sentences = []
for i in tqdm(np.arange(0, len(target_tagged_sents))):
    s = target_tagged_sents[i]
    for tok in s:
        if type(tok) == nltk.tree.Tree:
            tag_lemma = get_name(tok.label())
            tag_pos = get_pos(tok.label())
            tag_num = get_sense_num(tok.label())
            wn_sense_tag = tag_lemma+'.'+tag_pos+'.'+tag_num
            semcor_wn_senses.append(wn_sense_tag)
            #print(i, wn_sense_tag)
            clean_sentence = ''
            for item in target_untagged_sents[i]:
                clean_sentence+= item+' '
            semcor_wn_sentences.append(clean_sentence.strip())
            
sense_sentence_df = pd.DataFrame(data={'sense_name':semcor_wn_senses, 'semcor_sentence':semcor_wn_sentences})
#restrict to senses in the missing sense list
target_senses = sense_sentence_df[sense_sentence_df['sense_name'].isin(set(no_wn_example.sense_name))]

HBox(children=(FloatProgress(value=0.0, max=3347.0), HTML(value='')))




In [157]:
cdi_wordnet_df = pd.merge(cdi_wordnet_df,
                        target_senses,
                        left_on='sense_name',
                        right_on='sense_name',
                        how='left')
semcor_example_counts = target_senses.groupby(by=['sense_name'])['semcor_sentence'].agg('nunique').reset_index()
semcor_example_counts = semcor_example_counts.rename(columns={'semcor_sentence':'semcor_example_count'})

cdi_wordnet_df = pd.merge(cdi_wordnet_df,
                         semcor_example_counts,
                         left_on='sense_name',
                         right_on='sense_name',
                         how='left')
cdi_wordnet_df = cdi_wordnet_df.fillna(value={'semcor_sentence':'no_example', 'semcor_example_count':0})

cdi_wordnet_df.groupby(['sense_name', 'lemma']).agg(np.random.choice).reset_index()

Unnamed: 0,sense_name,lemma,definition,num_wn_examples,wn_example,semcor_sentence,semcor_example_count
0,abridge.v.01,cut,reduce in scope while retaining essential elem...,1,no_example,no_example,0.0
1,absorb.v.04,draw,"take in, also metaphorically",2,no_example,no_example,0.0
2,accept.v.02,have,receive willingly something given or offered,3,The only girl who would have him was the mille...,no_example,0.0
3,accept.v.02,take,receive willingly something given or offered,3,no_example,no_example,0.0
4,accept.v.05,take,admit into a group or community,2,no_example,Few new writers have their first story accepte...,1.0
...,...,...,...,...,...,...,...
2882,yellow.s.03,yellow,changed to a yellowish color by age,1,yellowed parchment,no_example,0.0
2883,yellow.s.05,yellow,cowardly or treacherous; -M.W.Straight,2,the little yellow stain of treason,no_example,0.0
2884,yield.v.01,give,be the cause or source of,2,no_example,Space probes have yielded little information .,1.0
2885,yogurt.n.01,yogurt,a custard-like food made from curdled milk,0,no_example,no_example,0.0


In [165]:
cdi_wordnet_df = cdi_wordnet_df[['lemma', 'sense_name', 'definition', 'num_wn_examples', 'wn_example', 'semcor_example_count', 'semcor_sentence']]
cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & (cdi_wordnet_df['semcor_sentence']=='no_example')]

Unnamed: 0,lemma,sense_name,definition,num_wn_examples,wn_example,semcor_example_count,semcor_sentence
6,ball,ball.n.07,United States comedienne best known as the sta...,0,no_example,0.0,no_example
8,ball,ball.n.09,a lavish dance requiring formal attire,0,no_example,0.0,no_example
10,ball,ball.n.11,the game of baseball,0,no_example,0.0,no_example
16,dog,frank.n.02,a smooth-textured sausage of minced beef or po...,0,no_example,0.0,no_example
17,dog,pawl.n.01,a hinged catch that fits into a notch of a rat...,0,no_example,0.0,no_example
...,...,...,...,...,...,...,...
4040,stone,stone.n.08,United States filmmaker (born in 1946),0,no_example,0.0,no_example
4041,stone,stone.n.09,United States feminist and suffragist (1818-1893),0,no_example,0.0,no_example
4042,stone,stone.n.10,United States journalist who advocated liberal...,0,no_example,0.0,no_example
4043,stone,stone.n.11,United States jurist who served on the United ...,0,no_example,0.0,no_example


## Still missing 1151 senses - Try CHILDES

In [166]:
# from the wordsense project! 
#for each word without a wordnet example look for a parent utterance from childes for the target senses
# need to get childes tags
# look for utterances with just the single tag for the target sense
# prioritize utterances with tags from multiple participants
# if there are more than 3, show all 3 and I can pick the best 

In [167]:
from configs import postgres_config
from configs import mysql_config
from sqlalchemy import create_engine

wordsense_auth = postgres_config.Authenticator('WordSense')
ws_engine = create_engine(wordsense_auth.connectionString)

childesdb_auth  = mysql_config.Authenticator('ec2')
cdb_engine = create_engine(childesdb_auth.connectionString)

all_tags_df = pd.read_csv('../data/raw_tags.csv')
majority_token = pd.read_csv('../data/raw_majority_token_df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [168]:
derived_tokens_df = pd.read_sql_query('SELECT * from derived_tokens',ws_engine)
utterances = pd.read_sql_query('SELECT * from utterance where corpus_id=49', cdb_engine)
utterances_man = pd.read_sql_query('SELECT * from utterance where corpus_id=204', cdb_engine)
utterances = pd.concat([utterances, utterances_man])

In [169]:
tokens_w_one_tag = all_tags_df.groupby(by=['token_id'])['sense_name'].agg('nunique').reset_index()
tokens_w_one_tag = set(tokens_w_one_tag[tokens_w_one_tag['sense_name']==1].token_id.astype(int))

target_tags_df = majority_token[(majority_token['sense_name'].isin(set(no_wn_example.sense_name)))&
           majority_token['token_id'].isin(tokens_w_one_tag)]

target_tags_df = pd.merge(target_tags_df, derived_tokens_df[['id', 'utterance_id','speaker_role']], 
                         left_on='token_id',
                         right_on='id',
                         how='left')

target_tags_df = pd.merge(target_tags_df,
                         utterances[['id','gloss']],
                         left_on='utterance_id',
                         right_on='id',
                         how='left')

target_tags_df['lemma']= target_tags_df.type.apply(lambda word: word.split("+")[0])

In [170]:
parent_utterances = target_tags_df[target_tags_df['speaker_role']!='Target_Child']

In [182]:
def getChildesExamples(target_sense, utterances=parent_utterances):
    return parent_utterances[(parent_utterances['sense_name']==target_sense)][['utterance_id','gloss']]

In [184]:
cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & (cdi_wordnet_df['semcor_sentence']=='no_example')]

Unnamed: 0,lemma,sense_name,definition,num_wn_examples,wn_example,semcor_example_count,semcor_sentence
6,ball,ball.n.07,United States comedienne best known as the sta...,0,no_example,0.0,no_example
8,ball,ball.n.09,a lavish dance requiring formal attire,0,no_example,0.0,no_example
10,ball,ball.n.11,the game of baseball,0,no_example,0.0,no_example
16,dog,frank.n.02,a smooth-textured sausage of minced beef or po...,0,no_example,0.0,no_example
17,dog,pawl.n.01,a hinged catch that fits into a notch of a rat...,0,no_example,0.0,no_example
...,...,...,...,...,...,...,...
4040,stone,stone.n.08,United States filmmaker (born in 1946),0,no_example,0.0,no_example
4041,stone,stone.n.09,United States feminist and suffragist (1818-1893),0,no_example,0.0,no_example
4042,stone,stone.n.10,United States journalist who advocated liberal...,0,no_example,0.0,no_example
4043,stone,stone.n.11,United States jurist who served on the United ...,0,no_example,0.0,no_example


In [218]:
cdi_wordnet_df['num_childes_examples'] = cdi_wordnet_df.sense_name.apply(lambda sense_name: len(getChildesExamples(sense_name, target_tags_df)))
cdi_wordnet_df['childes_example'] = 'no_example'

In [224]:
def replaceChildesExample(string, sense_name):
    cdi_wordnet_df.loc[cdi_wordnet_df['sense_name'] == sense_name, 'childes_example'] = string
    return None

In [1339]:
len(cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & 
                   (cdi_wordnet_df['semcor_sentence']=='no_example') &
                   (cdi_wordnet_df['childes_example']=='no_example') &
                  (cdi_wordnet_df['num_childes_examples']>0)])

47

In [1471]:
#manually pick through these examples from childes
target_senses = cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & 
                   (cdi_wordnet_df['semcor_sentence']=='no_example') &
                   (cdi_wordnet_df['childes_example']=='no_example') &
                  (cdi_wordnet_df['num_childes_examples']>0)].head(1)

print(list(target_senses['lemma'])[0])
print(list(target_senses['sense_name'])[0])
print(list(target_senses['definition'])[0])
print(' ')
for gloss in getChildesExamples(list(target_senses['sense_name'])[0], target_tags_df)['gloss']:
    print(gloss)

IndexError: list index out of range

In [1469]:
bad_example_str = 'no_good_example'
select_example = "Here's a tool bench"

In [1470]:
replaceChildesExample(select_example, list(target_senses['sense_name'])[0])

In [1464]:
replaceChildesExample(bad_example_str, list(target_senses['sense_name'])[0])

In [1485]:
len(set(cdi_wordnet_df.sense_name))

2729

In [1483]:
#get senses with wordnet examples
cdi_wordnet_df = cdi_wordnet_df.groupby(['sense_name', 'lemma']).agg(np.random.choice).reset_index()

wn_examples = cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']!='no_example')]
wn_examples['example'] = wn_examples['wn_example']
wn_examples['example_source'] = 'wordnet'

#get senses without wordnet senses that do have semcor senses
semcor_examples = cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & 
                                    (cdi_wordnet_df['semcor_sentence']!='no_example') ]

semcor_examples['example'] = semcor_examples['semcor_sentence']
semcor_examples['example_source'] = 'semcor'

#get senses without semcor or childes senses that do have childes
childes_examples = cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & 
                                    (cdi_wordnet_df['semcor_sentence']=='no_example') &
                                    (~cdi_wordnet_df['childes_example'].isin(['no_example','no_good_example']))]

childes_examples['example'] = childes_examples['childes_example']
childes_examples['example_source'] = 'childes'

#get senses without any example
no_examples = cdi_wordnet_df.loc[(cdi_wordnet_df['wn_example']=='no_example') & 
                                    (cdi_wordnet_df['semcor_sentence']=='no_example') &
                                    (cdi_wordnet_df['childes_example'].isin(['no_example','no_good_example']))]
no_examples['example'] = 'no_example'
no_examples['example_source'] = 'no_example'

#merge back together
target_columns=['lemma', 'sense_name', 'definition', 
                'example', 'example_source']
examples_df = pd.concat([wn_examples[target_columns], semcor_examples[target_columns],
         childes_examples[target_columns], no_examples[target_columns]])

In [1488]:
examples_df.to_csv("sense_examples_from_corpora.csv")

In [27]:
examples_df = pd.read_csv("sense_examples_from_corpora.csv")
examples_df.groupby(by=['example_source'])['sense_name'].agg('nunique')

example_source
childes            264
hand_written       283
no_example          52
previous_study     446
semcor             219
wordnet           1544
Name: sense_name, dtype: int64

## Now use the examples from the previous qualtrics

In [1491]:
raw_word_sense_text_entry_df = pd.read_csv('../data/word_sense_text_entry_most.csv',dtype=str)
word_sense_text_entry_df = raw_word_sense_text_entry_df[raw_word_sense_text_entry_df['Q3']=='I consent to participate in this study.']
question_dict = raw_word_sense_text_entry_df.loc[0].to_dict()

sense_survey = pd.read_csv("final_sense_survey.csv")

In [1492]:
sense_survey[sense_survey['wordnet_sense']=='slide_fastener.n.01']
question_dict['zipper n_say_1']

'Please enter your best estimate of the first age (in numbers) by which you think a child would say the word [ zipper ] to express it in each of the following ways: - a fastener for locking together two toothed edges by means of a sliding tab ( EXAMPLE(s): The zipper on my jacket broke  )'

In [1513]:
sense_question_to_wn_map = pd.read_csv('sense_question_to_wn_map.csv')

In [1519]:
sense_question_to_wn_map['question_word'] = sense_question_to_wn_map.sense_question.apply(lambda ques: ques.split(" ")[0])

In [1548]:
import string

examples_df[examples_df['example_source']=='no_example']
def getExampleFromQuestion(lemma, sense_name):
    question_conversion = sense_question_to_wn_map[sense_question_to_wn_map['wordnet_sense']==sense_name]
    target_question = ''
    if len(question_conversion) == 0:
        return 'no_example'
    elif len(question_conversion) > 1:
        restricted_question_conversion = question_conversion[question_conversion['question_word']==lemma]
        if len(restricted_question_conversion) <1:
            return 'no_example'
        else:
            target_question = list(restricted_question_conversion.sense_question)[0]
    else:
        target_question = list(question_conversion.sense_question)[0]
    
    full_question = question_dict[target_question]
    match_example = re.search('(?<=\( EXAMPLE\(s\)\:).*', full_question)
    match_example = str(match_example.group()).strip()
    match_example = match_example.strip(string.punctuation).replace('"', '').replace("`","'").strip()
    return match_example

In [1551]:
missing_examples = examples_df[examples_df['example']=='no_example']
missing_examples['example'] = missing_examples.apply(lambda row: getExampleFromQuestion(row.lemma, row.sense_name), axis=1)
missing_examples.loc[missing_examples['example'] != 'no_example', 'example_source'] = 'previous_study'

#z['c'] = z.apply(lambda x: math.log(x.b) if x.b > 0 else 0, axis=1)

In [1553]:
examples_df = pd.concat([missing_examples, examples_df[examples_df['example']!='no_example']])
examples_df

Unnamed: 0,lemma,sense_name,definition,example,example_source
277,better,adept.s.01,having or showing knowledge and skill and apti...,She's better at computer science than the rest,previous_study
280,windy,airy.s.02,not practical or realizable; speculative,no_example,no_example
282,touch,allude.v.01,make a more or less disguised reference to,He touched on the problem but did not mention it,previous_study
283,jump,alternate.v.01,go back and forth; swing back and forth betwee...,He kept jumping in and out of focus,previous_study
284,dog,andiron.n.01,metal supports for logs in a fireplace,no_example,no_example
...,...,...,...,...,...
2882,cut,write_out.v.02,make out and issue,cut a ticket,wordnet
2883,yard,yard.n.02,the enclosed land around a house or other buil...,it was a small house with almost no yard,wordnet
2884,yard,yard.n.03,a tract of land enclosed for particular activi...,they opened a repair yard on the edge of town,wordnet
2885,yellow,yellow.s.03,changed to a yellowish color by age,yellowed parchment,wordnet


In [1554]:
examples_df.groupby(by=['example_source']).size().reset_index()

Unnamed: 0,example_source,0
0,childes,272
1,hand_written,8
2,no_example,209
3,previous_study,631
4,semcor,246
5,wordnet,1521


In [43]:
examples_df = pd.read_csv("sense_examples_from_corpora.csv")

In [44]:
examples_df.groupby(by=['example_source']).size().reset_index()

Unnamed: 0,example_source,0
0,childes,267
1,hand_written,291
2,no_example,53
3,previous_study,454
4,semcor,229
5,wordnet,1593


In [45]:
wn.synset('airplane.n.01').examples()

['the flight was delayed due to trouble with the airplane']

In [46]:
def checkExamples(sense_name, curr_example, curr_source):
    if curr_source == 'previous_study':
        if curr_example in wn.synset(sense_name).examples():
            return "wordnet"
        else:
            return "new_hand_written"
    else:
        return curr_source

In [47]:
examples_df.example_source = examples_df.apply(lambda row: checkExamples(row.sense_name, row.example, row.example_source),
                                               axis=1)

In [73]:
examples_df.groupby(by=['example_source']).size().reset_index()

Unnamed: 0,example_source,0
0,childes,267
1,hand_written,291
2,new_hand_written,367
3,no_example,54
4,semcor,229
5,wordnet,1679


In [58]:
def fixNoExample(example, example_source):
    if example == 'no_example':
        return "no_example"
    else:
        return example_source

In [64]:
examples_df = examples_df.fillna({"example":'no_example'})

In [65]:
examples_df.example_source = examples_df.apply(lambda row: fixNoExample(row.example, row.example_source), axis=1)

In [60]:
def containsWord(lemma, definition):
    print(lemma, definition)
    if lemma in definition:
        return True
    else:
        return False

In [69]:
examples_df['contains_word'] = examples_df.apply(lambda row: containsWord(row.lemma, row.example), axis=1)

airplane the flight was delayed due to trouble with the airplane
alligator she had an alligator purse
alligator He didn't want to be eaten by the alligator did he
animal We saw animals at the zoo
ankle be careful you're walking around with your pants down by your ankles
ant queen ants may live up to fifteen years
apple I don't have a pear, that's an apple
apple That's a nice apple tree
applesauce I made some homemade applesauce
applesauce The baby talks applesauce
arm There are three arms of the US government
arm Did you bash your head on the chair arm?
arm He did not lower his arm
arm She wiped it off with the sleeve of her coat
arm right to bear arms
arm the arm of the record player
asleep He is now asleep, our dear departed
asleep were all asleep when the phone rang
asleep my foot is asleep
awake awake to the dangers of her situation
awake lay awake thinking about his new job
bad a bad report card
bad a bad headache
bad my throat feels bad
bad bad meat
bad a bad (or uncollectible) d

cold the boxer was out cold
cold cold in his grave
comb The turkey's comb
comb There are special combs for animal grooming
comb no_example
comb Now get your comb and your razor and all your styling supplies
comb his hair needed a comb
cook I have to cook your toast
cook My husband doesn't cook
cook These potatoes have to cook for 20 minutes
cook The apothecary cooked the medicinal mixture in a big iron kettle
cook cook the books
cookie The cookie at camp is so talented
cookie please clear your cookies
cookie He gave you a chocolate chip cookie at ten o'clock in the morning
corn I have corn on my feet from running in bad shoes
corn Have some corn cereal for breakfast
corn You can only find the strongest corn in the midwest
corn Nebraska's got lots of corn
corn Improves utilization of low moisture corn ( less than 14 % ) .
corn They had already lost most of their corn , she thought .
corn that movie was pure corn
couch Freud always had people lay on his couch during analysis
couch no_exa

fall fall into a trap
fall fall into a category
fall We must stand or fall
fall Several deer have fallen to the same gun
fall Christmas falls on a Monday this year
fall The line of men fall forward
fall payments fall on the 1st of the month
fall a fallen woman
fall The hills around here fall towards the ocean
fall rain, snow and sleet were falling
fast He made a_rope fast to each corner
fast fast women
fast fast film
fast my watch is fast
fast the band played a fast fox trot
fast a fast road
fast fast colors
fast a fast lens
fast fast friends
fast a fast visit
feed His admiration fed her vanity
feed This dog doesn't feed certain kinds of meat
feed We should feed soil if we want to grow healthy plants
feed Feed the steward
feed Fed to dairy cattle to increase milk production and butterfat percentage .
feed What does he feed his SMU football mastodons at the training table ?
feed Her success feeds her vanity
feed feed carrots into a food processor
feed This dish feeds six
feed feed one's

hit he hit his fist on the table
hit hit the bottle
hit He tries to hit on women in bars
hit We hit Detroit by noon
hit The thermometer hit 100 degrees
hit He hit a home run
hold She holds the title of Duchess
hold She was hold incompetent
hold Hold that position behind the trees
hold The bridge held
hold Hold a grudge
hold They held on the road and kept marching
hold Hold the taxi
hold The dissatisfied students held the President's office for almost a week
hold Hold the fire extinguisher directly on the flames
hold I hold the right to disagree
hold hold everything
hold The auditorium can't hold more than 500 people
hold I hold with those who say life is sacred
hold This theory holds for all irrational numbers
hold He can hold his liquor
hold The flask holds one gallon
hold hold your tongue
hold hold these truths to be self-evident
hold She holds a Master's degree from Harvard
hold A crazy idea took hold of him
hold hold a reception
hold The beam holds up the roof
hold The canteen hold

plate How about Mommy helps you finish eating what you have on your plate?
plate The Anniston catcher was straddling home plate .
plate a vegetable plate
play Play about with a young girl's affection
play Gielgud played Hamlet
play She plays deaf when the news are bad
play She acts in this play
play She played all her money on the dark horse
play play a joke
play She plays with the thought of moving to Tasmania
play Princeton plays Yale this weekend
play We played hockey all afternoon
play This factor played only a minor part in his decision
play The band played all night long
play The kids were playing outside all day
play She played the third movement very beautifully
play He plays the flute
play The spotlights played on the politicians
play He played $20 on the new horse
play On weekends I play
play Let's play like I am mommy
play The tape was playing for hours
play The prodigy played Carnegie Hall at the age of 16
play He is playing his cards close to his chest
play They played gam

stone he must have a heart of stone
stop Stop the rebel movement
stop The fighter plane was ordered to stop an aircraft that had entered the country's airspace
stop Are you gonna stop eating it
stop Just stop a minute
stop Hans shook his head but neither of us tried to stop her .
stop stop the busy road
stop stop the project
stop Your rights stop where you infringe upon the rights of other
stop the car stopped
stop we stopped at Aunt Mary's house
stop stop a car
story how can I stop my child from telling stories?
story what story is the office on?
story it's a story book, what's the story about
story `` She says she has to finish a story '' .
story a history of France
story the story was on the 11 o'clock news
stove Boil some water on the stove
stove dinner was already on the stove
strawberry Pick some strawberries from the garden
strawberry There's a strawberry on her back
strawberry I'm getting you some strawberries do you want them in a bowl
street they walked the streets of the sma

tv What were they showing on tv?
tv the British call a tv set a telly
underpants okay let's find you some underpants
vacuum Black holes are vacuums
vacuum This town is like a vacuum
vacuum Going to have_to go round with the vacuum cleaner I think and pick up all these bits
vacuum without their support he'll be ruling in a vacuum
vitamins You have to take your vitamins to stay healthy
wait I can hardly wait '' .
wait he is waiting to be drafted
wait I had to wait on line for an hour to get the tickets
wait I'm waiting on tables at Maxim's
wake You awaken something in me
wake His words woke us to terrible facts of the situation
wake He told me to wake you '' .
wake `` Carla , wake up '' , he said shaking her .
wake Please wake me at 6 AM.
walk The batter walked to first base
walk How can you walk Maris to get to Mantle '' ?
walk We walked instead of driving
walk I'll walk you to your car
walk Paul walked the streets of Damascus
walk walk in sadness
walk We must walk with our dispossessed

In [71]:
examples_df.groupby(by=['contains_word']).size().reset_index()

Unnamed: 0,contains_word,0
0,False,450
1,True,2437


In [66]:
examples_df[examples_df['lemma']=='green']

Unnamed: 0,lemma,sense_name,definition,example,example_source
1308,green,fleeceable.s.01,naive and easily deceived or tricked,At that early age she had been gullible and in...,new_hand_written
1309,green,green.a.02,concerned with or supporting or in conformity ...,no_example,no_example
1310,green,green.a.03,not fully developed or mature; not ripe,fried green tomatoes,wordnet
1311,green,green.s.01,of the color between blue and yellow in the co...,a green tree,wordnet
1312,green,green.s.04,looking pale and unhealthy,you're looking green,wordnet


In [72]:
# Move to a google sheet to fill the rest
examples_df.to_csv("sense_examples_from_corpora.csv", index=False)

Estimates Counts

In [1504]:
num_senses = len(cdi_wordnet_df)
num_words = len(set(cdi_wordnet_df.lemma))
num_estimations = 15
num_estimations_from_part = 50
print(f"{num_senses} senses for {num_words} words")
print(f"    {num_estimations} estimations per senses")
print(f"{num_senses*num_estimations} estimations needed")
print(f"    {num_estimations_from_part} estimations per participant")
print(f"{num_senses*num_estimations/num_estimations_from_part} participants needed")


2887 senses for 439 words
    15 estimations per senses
43305 estimations needed
    50 estimations per participant
866.1 participants needed


In [None]:
cdi_wordnet_df.groupby(by=['lemma'])['sense_name'].agg('nunique').reset_index().hist('sense_name', bins=60)

In [None]:
len(set(sense_survey.word_sense_combo))

In [None]:
cdi_wordnet_df['sense_pos'] = cdi_wordnet_df.sense_name.apply(lambda sense_name: sense_name.split(".")[1])

In [None]:
cdi_wordnet_df = pd.merge(cdi_wordnet_df, cdi_items[['lexical_class', 'fixed_lemma']],
                         left_on='lemma', right_on='fixed_lemma', how='left')

In [None]:
cdi_wordnet_df.lexical_class = cdi_wordnet_df.lexical_class.replace('nouns', 'n')
cdi_wordnet_df.lexical_class = cdi_wordnet_df.lexical_class.replace('verbs', 'v')
cdi_wordnet_df.lexical_class = cdi_wordnet_df.lexical_class.replace('adjective', 'a')

In [None]:
cdi_wordnet_df[cdi_wordnet_df['sense_pos']==cdi_wordnet_df['lexical_class']]

In [None]:
cdi_wordnet_df[cdi_wordnet_df['lemma']=='water']

In [None]:
84840/60

In [None]:
cdi_wordnet_df.lemma.value_counts().hist(bins=30)

In [None]:
set(sense_survey.wordnet_sense)

In [None]:
cdi_wordnet_df.lemma.value_counts().hist()

In [None]:
cdi_items

In [None]:
set(cdi_items.fixed_lemma) - set(cdi_wordnet_df.lemma)

In [None]:
# match from wordsense project <3

# qualities of a good example
# 3 or more words in the utterance
# less than 7 words in the utterance
# descriptive <- hard to quantify

In [None]:
# What to do about mine and IT - technically polysemous, but across the pronoun vs other noun boundary.
# Do we want to show all senses for a single word, even across pos