In [15]:
import os
from typing import Tuple, Optional, List

import numpy as np
import spacy

from data.db import DbConnection
from generationary_paired_data_creator import get_closest_words_with_sentences
from nlp.embedding import EmbeddingExtractor

# Shared resources used by every helper below: the spaCy pipeline that parses
# sentences and the project's embedding extractor.
nlp = spacy.load('en_core_web_md')
extractor = EmbeddingExtractor()

# Directory holding the word and sentence stores. Set the CUWSDM_DB_DIR
# environment variable to run the notebook on another machine; when it is
# unset (or empty) the original location is used.
DB_DIR = os.environ.get('CUWSDM_DB_DIR') or '/home/josh/scrapbox/toudai'

word_db = DbConnection(os.path.join(DB_DIR, 'cuwsdm_words'))
sentence_db = DbConnection(os.path.join(DB_DIR, 'cuwsdm_sentences'))


def embed_word(sentence: str, word: str) -> Tuple[str, str, int, np.ndarray]:
    """Return the first token of ``sentence`` whose text equals ``word``.

    The sentence is parsed with the module-level ``nlp`` pipeline and the
    (token, embedding) pairs produced by ``extractor.get_word_embeddings`` are
    scanned in order. The comparison is an exact, case-sensitive match on
    ``token.text``.

    Args:
        sentence: Sentence containing the target word.
        word: Surface form to look for.

    Returns:
        ``(token.text, token.lemma_, token.pos, embedding)`` for the first match.

    Raises:
        StopIteration: If no token text equals ``word``. The exception type is
            kept as-is because callers may depend on it.
    """
    parsed = nlp(sentence)

    hits = (
        (tok.text, tok.lemma_, tok.pos, vector)
        for tok, vector in extractor.get_word_embeddings(parsed)
        if tok.text == word
    )
    # Only the first matching occurrence is of interest.
    for hit in hits:
        return hit

    raise StopIteration(f"Couldn't find target word {word}")



def get_closest_words(target_lemma, target_embedding, target_pos: Optional[int] = None):
    """Look up stored words matching ``target_lemma`` and rank them by embedding.

    Words are read from ``word_db`` where either ``form`` or ``lemma`` equals
    ``target_lemma`` (optionally restricted to ``target_pos``), then handed to
    ``get_closest_words_with_sentences`` together with ``target_embedding``
    and ``sentence_db``.

    Args:
        target_lemma: Lemma (or surface form) to search for.
        target_embedding: Embedding of the query word.
        target_pos: Optional part-of-speech id used as an extra filter. A
            falsy value (``None`` or ``0``) disables the filter.

    Returns:
        Whatever ``get_closest_words_with_sentences`` returns for the matches.

    Raises:
        RuntimeError: If no word in the database matches.
    """
    # The clause is assembled as text, so embedded single quotes must be
    # doubled (standard SQL escaping); otherwise a lemma such as "o'clock"
    # would terminate the string literal early and corrupt the query.
    quoted_lemma = "'" + str(target_lemma).replace("'", "''") + "'"
    lemma_filter = f'form={quoted_lemma} or lemma={quoted_lemma}'

    if target_pos:
        where_clause = f'where pos={target_pos} and ({lemma_filter})'
    else:
        where_clause = f'where {lemma_filter}'

    words = list(word_db.read_words(use_tqdm=False, where_clause=where_clause))

    if not words:
        raise RuntimeError(f'{target_lemma} not found in db')

    return get_closest_words_with_sentences(target_embedding, words, sentence_db)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
import tabulate
from scipy.spatial.distance import cdist


def table_for_word(sentence: str, word: str):
    """Render an HTML table of stored sentences closest to ``word`` in ``sentence``.

    The word is embedded in context via ``embed_word``; its lemma and
    embedding are passed to ``get_closest_words`` (no part-of-speech filter is
    applied). Each returned entry contributes one row: its ``sentence`` and
    the ``cdist`` distance (default metric) between the query embedding and
    the entry's ``embedding``.

    Args:
        sentence: Sentence providing the context for ``word``.
        word: Surface form of the target word inside ``sentence``.

    Returns:
        The table produced by ``tabulate.tabulate(..., tablefmt='html')``.
    """
    _, lemma, _, embedding = embed_word(sentence, word)
    neighbours = get_closest_words(lemma, embedding)

    neighbour_matrix = np.array([entry.embedding for entry in neighbours])
    query_row = np.expand_dims(embedding, axis=0)
    # cdist returns a (1, n) matrix; row 0 holds the query's distances.
    distances = cdist(query_row, neighbour_matrix)[0]

    rows = []
    for entry, distance in zip(neighbours, distances):
        rows.append([entry.sentence, distance])

    return tabulate.tabulate(rows, tablefmt='html')

# Stored sentences whose use of "hate" is closest to this context.
table_for_word(sentence='I hate dogs more than cats', word='hate')

0,1
He also hates pumpernickel bread.,12.4524
"He hates carrots and is allergic to horses, as shown in ""Free Tippy"".",12.5535
"Although Sebastian hates dogs, Pluto took a liking to him and only listened to Sebastian's orders.",12.6226
"As a running gag, she hates carrots and grimaces in disgust whenever she uses them to transform.",12.827
"John happened to hate tea, and not wanting to seem rude, he dumped it when she wasn't looking.",12.8335


In [21]:
def comparison_table(sentence: str, word: str, comparison_targets: List[str]):
    """Render an HTML table ranking sentences by how closely they use ``word``.

    ``word`` is embedded in ``sentence`` and in every entry of
    ``comparison_targets`` via ``embed_word``. Rows pair each comparison
    sentence with the ``cdist`` distance (default metric) between the two
    embeddings, sorted by ascending distance.

    Args:
        sentence: Reference sentence containing ``word``.
        word: Surface form that must appear in the reference sentence and in
            every comparison sentence.
        comparison_targets: Sentences to compare against the reference.

    Returns:
        The table produced by ``tabulate.tabulate(..., tablefmt='html')``.
    """
    _, _, _, embedding = embed_word(sentence, word)

    # Index 3 of embed_word's result is the contextual embedding.
    target_embeddings = [embed_word(target, word)[3] for target in comparison_targets]

    target_matrix = np.array(target_embeddings)
    # cdist returns a (1, n) matrix; row 0 holds the reference's distances.
    distances = cdist(np.expand_dims(embedding, axis=0), target_matrix)[0]

    rows = sorted(
        ([target, distance] for target, distance in zip(comparison_targets, distances)),
        key=lambda row: row[1],
    )

    return tabulate.tabulate(rows, tablefmt='html')



# Rank several usages of "lead" by distance to its use in "I lead him home".
comparison_table(
    sentence='I lead him home',
    word='lead',
    comparison_targets=[
        'It is made of lead',
        'I lead an army',
        'Who will lead development of the software?',
        'I am the lead developer',
        'Can you lead my dog to his food?',
    ],
)

0,1
Can you lead my dog to his food?,12.7813
I lead an army,15.1573
Who will lead development of the software?,15.4442
I am the lead developer,17.5069
It is made of lead,19.7876


In [22]:
# "wrangler" is a word used in the test data.
comparison_table(
    sentence='He is a dog wrangler',
    word='wrangler',
    comparison_targets=[
        'I am a legendary wrangler in the field of mathematics',
        'My father works as a wrangler on a farm',
        'Do you know a wrangler who can help solve business problems?',
    ],
)

0,1
I am a legendary wrangler in the field of mathematics,12.2327
My father works as a wrangler on a farm,12.9508
Do you know a wrangler who can help solve business problems?,13.7292


In [24]:
# Closest stored sentences for "wrangler" in a context mentioning biology.
table_for_word(sentence='My father is a wrangler in the field of biology', word='wrangler')

0,1
"Thierry Sabine (13 June 1949, Neuilly-sur-Seine – 14 January 1986, Mali) was a French wrangler, motorcycle racer and founder and main organizer of Paris Dakar.",13.9026
Mathematical insight is something higher than skill in solving problems; consequently the senior wrangler has not always turned out the most distinguished mathematician in after life.,14.2743
"As part of Cabaret Voltaire, Mallinder has had video work exhibited at MoMA in New York, and with Wrangler in the Turbines (Tate Modern, 2010).",14.2871
"1 EP Maps, May 2012 Vocals on Ashtar Command American Sunshine ""The Breakup Song"", ""Rosa"", ""All the Stars in Heaven"", 2011 Vocals on TV on the Radio Nine Types of Light - ""Will Do"", 2011 Vocals on Amos Lee's Mission Bell ""Stay With Me"" – 2011 Credits of Priscilla Ahn as a vocalist and/or songwriter Credits of Priscilla Ahn as a vocalist and/or songwriter 2009: Men in Trees – Leave the Light On (Warner Bros. Television) 2009: Grey's Anatomy – Dream (ABC Studios) 2009: Psych – A Good Day (Morning Song) (NBC Studios, Inc.) 2009: Knight Rider-I Don't Think So (NBC Studios, Inc) 2009: Ghost Whisperer – Dream (ABC Studios) 2009: Eli Stone – Dream (ABC Studios) 2009: ER Promos – Silent Night (NBC Studios, Inc) 2010: Jeep Wrangler ""We Build"" – In a Tree (commercials – BBDO) 2010: So You Think You Can Dance – Dream (DanceNation Productions, Inc.) 2010: Life – Dream (NBC Universal Television) 2010: The Biggest Loser – Red Cape (BL4 Productions, Inc) 2010: Brothers And Sisters – Dream (ABC Studios) 2010: Grey's Anatomy – Christmas Time Is Here (American Broadcasting Company) 2010: Kansai Electric Power – Leave The Light On (EMI Music Japan) 2011: So You Think You Can Dance Season 7 – Dream (Dick Clark Productions, Inc.) 2011: Psych – Christmas Time Is Here (GEP) 2011: Kansai Electric Power – Leave The Light On (EMI Japan) 2011: ""Married, Single, Other"" BBC – Find My Way Back Home (EMI UK) 2011: ""Married, Single, Other"" ITV – Find My Way Back Home (EMI UK) 2012: ABC Family: Make It or Break It – Dream (ABC Studios) 2012: United States: In Plain Sight – Season 4 – Wallflower (NBC Universal Television Music) 2012: WB: Men in Trees (ET Upgrade) – Leave The Light On (Warner Bros. Television) 2012: RIS – Dream (EMI Music France) 2012: NHK: Soko wo Nantoka – I'll Be Here (EMI Music Japan) 2014: NHK: Soko wo Nantoka 2 – Best I can ベスト・アイ・キャン (Universal Music Japan) 2008: The Tonight Show with Jay Leno – (Ep.",14.2873
"Wrangler may refer to: Wrangler (profession), a handler of animals, especially horses and cattle, or a professional who searches for and/or handles animals (or other products) for film productions Jeep Wrangler, a type of motor vehicle Goodyear Wrangler, a commercial line (family) of automotive tires for SUVs / 4x4s Wrangler (jeans), a brand of jeans Wrangler (TV series), a 1960 Western program starring Jason Evers Wrangler (University of Cambridge), a student who has completed the final year of the mathematical tripos with first-class honours Wrangler: Anatomy of an Icon, a documentary about Jack Wrangler Data wrangler, a professional in computing who transforms raw data to a clean format Wrangler (band), an electronica musical band founded by Stephen Mallinder Ruger Wrangler, a single-action rimfire revolver ""Wrangler Jane"" Thrift, a character on the TV series F Troop.",14.3688


In [25]:
# Closest stored sentences for "wrangler" in a context mentioning a farm.
table_for_word(sentence='My father is a wrangler on a farm in Texas', word='wrangler')

0,1
"Thierry Sabine (13 June 1949, Neuilly-sur-Seine – 14 January 1986, Mali) was a French wrangler, motorcycle racer and founder and main organizer of Paris Dakar.",14.0178
"3-5] (uncredited) The Utah Kid (1944) - Henchman Slim Gangsters of the Frontier (1944) - Townsman in Mayor's Office (uncredited) Black Arrow (1944)* - Paul Brent (uncredited) Mystery of the River Boat (1944, Serial) - Citizen (uncredited) The Great Mike (1944) - Sam McBride Harmony Trail (1944) - Bronco Crazy Knights (1944) - Electrician The Big Bonanza (1944)* - Rancher Willis (uncredited) Rough Ridin' Justice (1945)* - Henchman Mike (uncredited) Sudan (1945) - Man (uncredited) The Return of the Durango Kid (1945)* - Kirby Henchman (uncredited) Beyond the Pecos (1945)* - Sheriff (uncredited) Both Barrels Blazing (1945) - Henchman Poker Player (uncredited) Trail to Vengeance (1945)* - Henchman Clancy (uncredited) Flaming Bullets (1945) - Henchman (uncredited) Frontier Feud (1945) - Townsman (uncredited) San Antonio (1945)* - Joey Simms (uncredited) Gun Town (1946) - Joe - Henchman Gunman's Code (1946) - Townsman (uncredited) The Yearling (1946)* - Millwheel Forrester (uncredited) Duel in the Sun (1946)* - Ed, the Wrangler (uncredited) The Sea of Grass (1947)* - Wake - Brewton Ranch Hand (uncredited) Albuquerque (1948) - Henchman Jackson The Westward Trail (1948) - Henchman (uncredited) Silver River (1948) - Miner (uncredited) I Wouldn't Be in Your Shoes (1948) - Prisoner Shaggy (1948) - Joe Simms Four Faces West (1948) - Clint Waters The Walls of Jericho (1948) - Loafer (uncredited) Red River (1948)* - Laredo (uncredited) Station West (1948)* - Pete Sunset Carson Rides Again (1948) - Sheriff Norton Unknown Island (1948)** - Crewman Edwards Gunning for Justice (1948) - Sheriff Outlaw Country (1949) - Jim McCord Dynamite (1949)* - Skipper Brown (uncredited) Cover-Up (1949) - Gabe South of St. 
Louis (1949) - Sentry (uncredited) El Paso (1949) - Henchman (uncredited) She Wore a Yellow Ribbon (1949) - Trooper (uncredited) The Cowboy and the Indians (1949) - Farmer (uncredited) Intruder in the Dust (1949)* - Will Legate (uncredited) Roseanna McCoy (1949) - Abel Hatfield Return of the Frontiersman (1950) - Nicol The Gunfighter (1950) - Card Player in Barber Shop (uncredited) A Lady Without Passport (1950) - Airport Dispatcher (uncredited) Never a Dull Moment (1950) - Shivaree Partyer (uncredited) Vengeance Valley (1951)* - Cowhand at Campfire (uncredited) Sugarfoot (1951) - Rancher (uncredited) Oh!",14.0816
Wieghorst worked with the mounted patrol of the 7th Cavalry Regiment of the United States Cavalry (1920-1922) with occasional interludes as a wrangler on ranches in the western states.,14.4171
"Wrangler may refer to: Wrangler (profession), a handler of animals, especially horses and cattle, or a professional who searches for and/or handles animals (or other products) for film productions Jeep Wrangler, a type of motor vehicle Goodyear Wrangler, a commercial line (family) of automotive tires for SUVs / 4x4s Wrangler (jeans), a brand of jeans Wrangler (TV series), a 1960 Western program starring Jason Evers Wrangler (University of Cambridge), a student who has completed the final year of the mathematical tripos with first-class honours Wrangler: Anatomy of an Icon, a documentary about Jack Wrangler Data wrangler, a professional in computing who transforms raw data to a clean format Wrangler (band), an electronica musical band founded by Stephen Mallinder Ruger Wrangler, a single-action rimfire revolver ""Wrangler Jane"" Thrift, a character on the TV series F Troop.",14.4609
"Kate first entered the world of country music when her husband entered her into a country music talent search show, the Wrangler Country Showdown (which later became the Colgate Country Showdown), which she won.",14.59
