In [1]:
import torch
from transformers import BertTokenizer, BertModel

### Perform Word Embedding

In [2]:
sentence = ["He who controls the spice"]
# Load the pretrained uncased BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# NOTE(review): `sentence` above is never used in this cell; the literal below
# is what actually gets encoded. Confirm which text was intended.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Show the dimensions of the last hidden state computed above.
print(last_hidden_states.size())

torch.Size([1, 8, 768])


### Perform Sentence Embedding

In [10]:
from sentence_transformers import SentenceTransformer
# Load the pretrained 'paraphrase-mpnet-base-v2' sentence-embedding model.
# NOTE: this rebinds `model`, which held the BertModel loaded in an earlier cell.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

  0%|          | 0.00/405M [00:00<?, ?B/s]

In [12]:
# Example sentences to embed together in a single call.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
# Encode the whole list with the SentenceTransformer bound to `model`.
sentence_embeddings = model.encode(sentences)
print(sentence_embeddings.shape)

(3, 768)


In [74]:
#!/usr/bin/env python3
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

"""
    Generates contextual token embeddings for each sentence in a list
    using pretrained BERT (bert-base-uncased).

    - sentences: list of sentence strings; one tensor is returned per sentence
"""
def compute_bert_word_embedding(sentences):
    """Compute per-token embeddings for each sentence with pretrained BERT.

    Every sentence is converted to token ids with the ``bert-base-uncased``
    tokenizer and passed through ``BertModel`` as a batch of one. The first
    element of the model output (the last hidden state) is kept, with the
    batch dimension removed.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        list: One ``torch.Tensor`` per input sentence, in input order, with
        one row per token id produced by ``tokenizer.encode``. An empty
        input returns an empty list without loading the checkpoint.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to embed, so avoid the cost of loading the checkpoint.
        return []

    # Loading is loop-invariant: do it once instead of once per sentence.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    embeddings = []
    # Inference only: no autograd graph is needed for the returned tensors.
    with torch.no_grad():
        for sentence in sentences:
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)  # batch of 1
            outputs = model(input_ids)
            # outputs[0] is the last hidden state; drop only the batch axis.
            embeddings.append(outputs[0].squeeze(0))

    return embeddings

"""
    Generates sentence embeddings for a list of sentences using the
    pretrained 'paraphrase-mpnet-base-v2' SentenceTransformer model.
"""
def compute_bert_sentence_embedding(sentences, model_name='paraphrase-mpnet-base-v2'):
    """Encode sentences with a pretrained SentenceTransformer model.

    Args:
        sentences: Sentences to embed; passed unchanged to
            ``SentenceTransformer.encode``.
        model_name: Name of the pretrained checkpoint to load. Defaults to
            'paraphrase-mpnet-base-v2', the value that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The value returned by ``model.encode(sentences)``.
    """
    model = SentenceTransformer(model_name)
    return model.encode(sentences)


In [75]:
# Embed one sentence and report the size of its token-embedding tensor.
print(
    compute_bert_word_embedding(["hello it me you want what are you"])[0].size()
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([10, 768])


In [71]:
# Print only the shape: the recorded output for this cell is a shape tuple,
# and dumping the full embedding array would flood the notebook output.
print(compute_bert_sentence_embedding(["hello its me", "youre looking for"]).shape)

(2, 768)


### Test GraphConstructor

In [2]:
# test data
label = {"text": ["jorge pereira won the open savoury amateur prize with his empanada chilena , a traditional chilean pasty made with beef , onion , hard-boiled egg , olives and sultanas", "a chilean miner stole the show at the world pasty championships by beating his cornish competition .", "jorge pereira won the open savoury amateur prize with his empanada chilena , a traditional chilean pasty made with beef , onion , hard-boiled egg , olives and sultanas .", "mr pereira decided to take part in the contest while on a two-month visit to the uk to see his wife 's family .", "wife gail , who spoke on the non-english-speaking cook 's behalf , said : ` jorge feels very excited and happy to be so far from my country to win such a prize .", "` it 's all about getting recognition for his country rather than winning . '", "there were also pasty makers from the us and canada at the championships , held on saturday at the eden project in bodelva , cornwall , to celebrate the british delicacy .", "betty lethbridge , 88 , from st kew , cornwall , who took the cornish pasty amateur title , has been baking for eight decades .", "but she only decided to enter this year at the behest of her son , fisherman 's friends singer john lethbridge .", "she said : ` i 'm shocked really , i did n't think for one minute i would win anything . '", "i did it to please john .", "he said to me ` you make a good pasty , mother , why do n't you enter ? '", "` i 've been making them for years and years .", "i started when i was eight years old .", "` mother used to make pasties so i used to roll the pastry out on a bench .", "you need to get really good meat to make a pasty and the seasoning is important . 
'", "eden project spokesman david rowe said : ` the fourth world pasty championships was a very memorable one for sure and mrs lethbridge 's win was such a heart-warming story .", "creative entries to the world pasty championships , held on saturday at the eden project in bodelva , cornwall", "pasty makers came from across the globe , including the us , canada and chile to compete in the annual championships to celebrate the british delicacy", "the pasty winners including mr pereira ( centre ) pose with their pastry trophies", "world pasty championships , eden project , cornwall", "` we commend the entrants who took so much pride in their pasties .", "` to have pasty makers from the usa , canada and chile gave it a great international feel and there was a nice spread of entrants from different parts of the uk . '", "in the professional ranks , west cornwall pasty company were came tops in the prestigious cornish pasty company category .", "open savoury professional winner luisa ead from padstow , cornwall , scored 97 with a smoked haddock , white wine and mustard pasty while daniel beddoes ,13 , from bristol , defended his title when he won the open savoury junior category with a barbeque effort .", "marion symonds , of portreath bakery in redruth , cornwalll , was handed the pasty ambassador award for her contribution to the cornish pasty association and the industry .", "marion has been dubbed as ` mama pasty ' in mexico after she travelled the globe to promote the cornish food .", "each pasty went under close scrutiny to pick the tastiest winner in each category at the competition", "a huge globe pasty was made for the world pasty championships , in cornwall which are held to celebrate the british delicacy"], "summary": ["jorge pereira won the amateur prize at the world pasty championships .", "the chilean miner beat international entrants at the cornwall competition .", "his winning 'em panada chilena ' had beef , onion , egg , olives and sultanas ."], "label": 
[0, 1]}
tfidf = {"0": {"jorge": 0.1970754899950844, "won": 0.1970754899950844, "amateur": 0.1970754899950844, "boiled": 0.21588043496510168, "chilena": 0.21588043496510168, "pereira": 0.18248924147735213, "traditional": 0.21588043496510168, "prize": 0.1970754899950844, "made": 0.1970754899950844, "hard": 0.21588043496510168, "chilean": 0.1970754899950844, "beef": 0.21588043496510168, "the": 0.08273535496020604, "savoury": 0.1970754899950844, "his": 0.16049502459871406, "with": 0.36497848295470425, "pasty": 0.0918711888787694, "open": 0.1970754899950844, "onion": 0.21588043496510168, "olives": 0.21588043496510168, "empanada": 0.21588043496510168, "egg": 0.21588043496510168, "and": 0.12526238236651893, "sultanas": 0.21588043496510168}, "1": {"at": 0.2297707086462822, "competition": 0.2908048946349681, "stole": 0.3265076047003899, "cornish": 0.2297707086462822, "by": 0.3265076047003899, "pasty": 0.12375642751599365, "miner": 0.3265076047003899, "championships": 0.20443923272301817, "chilean": 0.26547341871170405, "the": 0.22289973786349668, "world": 0.2297707086462822, "beating": 0.3265076047003899, "show": 0.3265076047003899, "his": 0.21619716823995921}, "2": {"jorge": 0.1970754899950844, "won": 0.1970754899950844, "amateur": 0.1970754899950844, "boiled": 0.21588043496510168, "chilena": 0.21588043496510168, "pereira": 0.18248924147735213, "traditional": 0.21588043496510168, "prize": 0.1970754899950844, "made": 0.1970754899950844, "hard": 0.21588043496510168, "chilean": 0.1970754899950844, "beef": 0.21588043496510168, "the": 0.08273535496020604, "savoury": 0.1970754899950844, "his": 0.16049502459871406, "with": 0.36497848295470425, "pasty": 0.0918711888787694, "open": 0.1970754899950844, "onion": 0.21588043496510168, "olives": 0.21588043496510168, "empanada": 0.21588043496510168, "egg": 0.21588043496510168, "and": 0.12526238236651893, "sultanas": 0.21588043496510168}, "3": {"see": 0.25030607692104806, "while": 0.22293579468788, "visit": 0.25030607692104806, "uk": 
0.22293579468788, "family": 0.25030607692104806, "pereira": 0.18845331712120475, "two": 0.25030607692104806, "decided": 0.22293579468788, "month": 0.25030607692104806, "take": 0.25030607692104806, "contest": 0.25030607692104806, "mr": 0.22293579468788, "the": 0.17087858943603718, "on": 0.17614598816280297, "to": 0.3298099720374512, "in": 0.13522974732335197, "his": 0.16574028979582717, "part": 0.25030607692104806, "wife": 0.22293579468788}, "4": {"gail": 0.21255351592449967, "jorge": 0.17282080943703718, "behalf": 0.21255351592449967, "such": 0.18931137257717393, "be": 0.21255351592449967, "so": 0.17282080943703718, "far": 0.21255351592449967, "and": 0.10984595960224892, "cook": 0.21255351592449967, "the": 0.07255286293409717, "to": 0.18671079292422438, "on": 0.1495786660897114, "speaking": 0.21255351592449967, "prize": 0.17282080943703718, "non": 0.21255351592449967, "my": 0.21255351592449967, "who": 0.17282080943703718, "from": 0.14074241328771422, "happy": 0.21255351592449967, "win": 0.17282080943703718, "said": 0.16002973493280964, "very": 0.18931137257717393, "wife": 0.18931137257717393, "spoke": 0.21255351592449967, "english": 0.21255351592449967, "feels": 0.21255351592449967, "excited": 0.21255351592449967, "country": 0.18931137257717393}, "5": {"about": 0.3281805437476925, "all": 0.3281805437476925, "it": 0.26683363465090476, "country": 0.29229490239092215, "recognition": 0.3281805437476925, "than": 0.3281805437476925, "for": 0.20548672555411704, "rather": 0.3281805437476925, "getting": 0.3281805437476925, "winning": 0.3281805437476925, "his": 0.21730490563859275}, "6": {"held": 0.20001312297433757, "bodelva": 0.21909837690879452, "celebrate": 0.20001312297433757, "there": 0.21909837690879452, "project": 0.18520944993217386, "british": 0.20001312297433757, "and": 0.12712955978928664, "us": 0.21909837690879452, "the": 0.33587447584464036, "to": 0.1080443058548297, "on": 0.17311396834904058, "also": 0.24599753153409148, "makers": 0.20001312297433757, 
"cornwall": 0.15402871441458366, "were": 0.21909837690879452, "at": 0.34622793669808116, "in": 0.1329020231578971, "from": 0.1628873843857124, "pasty": 0.093240632812666, "championships": 0.15402871441458366, "eden": 0.18520944993217386, "saturday": 0.21909837690879452, "canada": 0.20001312297433757, "delicacy": 0.20001312297433757}, "7": {"for": 0.16857958704671522, "amateur": 0.21890807699778617, "the": 0.0919010144514658, "cornish": 0.18946779765848812, "kew": 0.26923656694885717, "decades": 0.26923656694885717, "from": 0.17827512291606235, "pasty": 0.10204894220713819, "baking": 0.26923656694885717, "88": 0.26923656694885717, "cornwall": 0.16857958704671522, "st": 0.26923656694885717, "who": 0.21890807699778617, "has": 0.23979628760955912, "been": 0.21890807699778617, "eight": 0.23979628760955912, "title": 0.23979628760955912, "betty": 0.26923656694885717, "lethbridge": 0.21890807699778617, "took": 0.23979628760955912}, "8": {"friends": 0.2585357391597812, "but": 0.2585357391597812, "john": 0.23026556595741052, "enter": 0.23026556595741052, "fisherman": 0.2585357391597812, "only": 0.2585357391597812, "of": 0.2102075588618672, "this": 0.2585357391597812, "at": 0.18193738565949655, "son": 0.2585357391597812, "her": 0.23026556595741052, "the": 0.08824840165658744, "behest": 0.2585357391597812, "singer": 0.2585357391597812, "to": 0.11355119826603927, "year": 0.2585357391597812, "she": 0.2102075588618672, "lethbridge": 0.2102075588618672, "decided": 0.23026556595741052}, "9": {"did": 0.28655025562584724, "one": 0.28655025562584724, "shocked": 0.32173061498198563, "think": 0.32173061498198563, "minute": 0.32173061498198563, "for": 0.20144817187574485, "said": 0.2422282445500704, "she": 0.2615893934288652, "win": 0.2615893934288652, "would": 0.32173061498198563, "anything": 0.32173061498198563, "really": 0.28655025562584724}, "10": {"did": 0.480172092925497, "to": 0.2367879726128344, "please": 0.5391238001749402, "it": 0.4383451909875716, "john": 0.480172092925497}, 
"11": {"why": 0.30274416394098685, "you": 0.5392798417482314, "enter": 0.2696399208741157, "pasty": 0.1147493523620465, "he": 0.2696399208741157, "said": 0.22793350699096526, "mother": 0.2696399208741157, "good": 0.2696399208741157, "to": 0.13296793199760873, "me": 0.30274416394098685, "do": 0.30274416394098685, "make": 0.24615208662652746}, "12": {"years": 0.6507334415892818, "ve": 0.365312656753008, "and": 0.1887906636657796, "been": 0.29702462818857733, "them": 0.365312656753008, "making": 0.365312656753008, "for": 0.22873659962414672}, "13": {"old": 0.45291025025823084, "was": 0.3187230792081898, "started": 0.45291025025823084, "years": 0.40338575797124326, "eight": 0.40338575797124326, "when": 0.40338575797124326}, "14": {"bench": 0.2876544902472511, "used": 0.5753089804945022, "so": 0.23388313115642134, "pasties": 0.25620026156630404, "roll": 0.2876544902472511, "mother": 0.25620026156630404, "the": 0.09818777502932301, "on": 0.20242890247547432, "to": 0.25268082594952374, "out": 0.2876544902472511, "pastry": 0.25620026156630404, "make": 0.23388313115642134}, "15": {"the": 0.10615281070310745, "important": 0.3109891495351247, "good": 0.2769833399322116, "is": 0.3109891495351247, "seasoning": 0.3109891495351247, "pasty": 0.11787445556749142, "meat": 0.3109891495351247, "and": 0.1607167089567562, "you": 0.2769833399322116, "need": 0.3109891495351247, "to": 0.27317840614388317, "get": 0.3109891495351247, "really": 0.2769833399322116, "make": 0.252855834047397}, "16": {"sure": 0.22827739101122818, "one": 0.20331588509701362, "was": 0.3212878177750554, "project": 0.1718681067955436, "rowe": 0.22827739101122818, "mrs": 0.22827739101122818, "win": 0.18560541480174225, "and": 0.11797193267804176, "the": 0.07792003904971363, "warming": 0.22827739101122818, "world": 0.1606439088875277, "david": 0.22827739101122818, "spokesman": 0.22827739101122818, "for": 0.14293343859225635, "pasty": 0.08652415437657175, "heart": 0.22827739101122818, "said": 0.1718681067955436, 
"championships": 0.14293343859225635, "story": 0.22827739101122818, "very": 0.20331588509701362, "eden": 0.1718681067955436, "such": 0.20331588509701362, "memorable": 0.22827739101122818, "lethbridge": 0.18560541480174225, "fourth": 0.22827739101122818}, "17": {"held": 0.2749372894558824, "at": 0.23796181229040314, "bodelva": 0.3011718080078979, "to": 0.14851729802089295, "entries": 0.3381472851733771, "pasty": 0.12816822452172155, "project": 0.254588215956711, "cornwall": 0.2117272937383877, "championships": 0.2117272937383877, "world": 0.23796181229040314, "eden": 0.254588215956711, "the": 0.23084589804137204, "saturday": 0.3011718080078979, "on": 0.23796181229040314, "creative": 0.3381472851733771, "in": 0.18268662309184244}, "18": {"across": 0.26436465563193723, "in": 0.14282500058361802, "celebrate": 0.21494687384556618, "came": 0.23545710641808923, "annual": 0.26436465563193723, "from": 0.17504918448317094, "pasty": 0.10020233792877838, "globe": 0.21494687384556618, "including": 0.23545710641808923, "compete": 0.26436465563193723, "championships": 0.16552909205919522, "british": 0.21494687384556618, "and": 0.13662154284534722, "us": 0.23545710641808923, "the": 0.36095215910701267, "to": 0.23222262054564838, "chile": 0.23545710641808923, "makers": 0.21494687384556618, "canada": 0.21494687384556618, "delicacy": 0.21494687384556618}, "19": {"with": 0.2572291044346068, "pereira": 0.2572291044346068, "their": 0.3042959163042942, "pasty": 0.12949773612578308, "including": 0.3042959163042942, "winners": 0.34165494661752643, "trophies": 0.34165494661752643, "mr": 0.3042959163042942, "the": 0.11662025163348767, "centre": 0.34165494661752643, "pose": 0.34165494661752643, "pastry": 0.3042959163042942}, "20": {"eden": 0.4708624727733462, "world": 0.4401118368327703, "pasty": 0.23704792031508953, "championships": 0.39159093325913186, "project": 0.4708624727733462, "cornwall": 0.39159093325913186}, "21": {"commend": 0.3351329053047962, "who": 0.2724863887188468, 
"entrants": 0.29848704230120987, "their": 0.29848704230120987, "we": 0.3351329053047962, "pasties": 0.29848704230120987, "so": 0.2724863887188468, "the": 0.11439402278305023, "much": 0.3351329053047962, "in": 0.18105808161582637, "took": 0.29848704230120987, "pride": 0.3351329053047962}, "22": {"was": 0.15030580711173072, "spread": 0.21358679416426174, "different": 0.21358679416426174, "nice": 0.21358679416426174, "entrants": 0.19023166467854272, "feel": 0.21358679416426174, "it": 0.17366093659744974, "of": 0.3473218731948995, "and": 0.22075989908983742, "the": 0.145811122757784, "to": 0.09380922146382575, "makers": 0.17366093659744974, "parts": 0.21358679416426174, "international": 0.21358679416426174, "have": 0.21358679416426174, "uk": 0.19023166467854272, "usa": 0.21358679416426174, "great": 0.21358679416426174, "gave": 0.21358679416426174, "pasty": 0.08095596620059779, "from": 0.28285319794702646, "chile": 0.19023166467854272, "canada": 0.17366093659744974, "there": 0.19023166467854272}, "23": {"professional": 0.23688607797753597, "cornish": 0.18716838337146743, "came": 0.23688607797753597, "company": 0.5319381298887851, "pasty": 0.20162091683892722, "cornwall": 0.1665336757322555, "ranks": 0.26596906494439254, "tops": 0.26596906494439254, "west": 0.26596906494439254, "the": 0.18157137537517704, "were": 0.23688607797753597, "in": 0.28738358964894917, "category": 0.216251370338324, "prestigious": 0.26596906494439254}, "24": {"won": 0.13792872948430046, "scored": 0.16963950402956682, "title": 0.15108989005176304, "winner": 0.15108989005176304, "haddock": 0.16963950402956682, "padstow": 0.16963950402956682, "white": 0.16963950402956682, "he": 0.15108989005176304, "barbeque": 0.16963950402956682, "luisa": 0.16963950402956682, "savoury": 0.2758574589686009, "beddoes": 0.16963950402956682, "the": 0.0579046252447652, "ead": 0.16963950402956682, "mustard": 0.16963950402956682, "while": 0.15108989005176304, "when": 0.15108989005176304, "his": 0.1123268795729207, 
"daniel": 0.16963950402956682, "with": 0.25544028049508044, "professional": 0.15108989005176304, "junior": 0.16963950402956682, "smoked": 0.16963950402956682, "wine": 0.16963950402956682, "defended": 0.16963950402956682, "from": 0.2246537591458414, "pasty": 0.06429859115700752, "open": 0.2758574589686009, "cornwall": 0.10621795493903413, "bristol": 0.16963950402956682, "13": 0.16963950402956682, "and": 0.08766834096123034, "97": 0.16963950402956682, "category": 0.13792872948430046, "effort": 0.16963950402956682}, "25": {"contribution": 0.2424982969367547, "redruth": 0.2424982969367547, "was": 0.17065147865025562, "association": 0.2424982969367547, "of": 0.19716800158036055, "and": 0.12532118329386144, "the": 0.24832257828640913, "cornwalll": 0.2424982969367547, "to": 0.10650741086757222, "in": 0.13101153525507145, "symonds": 0.2424982969367547, "industry": 0.2424982969367547, "marion": 0.21598177400664978, "bakery": 0.2424982969367547, "her": 0.21598177400664978, "handed": 0.2424982969367547, "ambassador": 0.2424982969367547, "cornish": 0.17065147865025562, "award": 0.2424982969367547, "pasty": 0.18382863048561363, "portreath": 0.2424982969367547, "for": 0.15183770622396642}, "26": {"marion": 0.2455211016235744, "mama": 0.2756642280562043, "promote": 0.2756642280562043, "cornish": 0.19399108663035913, "globe": 0.224134213062989, "the": 0.18819005526212698, "pasty": 0.10448522352027546, "mexico": 0.2756642280562043, "she": 0.224134213062989, "as": 0.2756642280562043, "has": 0.2455211016235744, "been": 0.224134213062989, "to": 0.12107418307655847, "food": 0.2756642280562043, "after": 0.2756642280562043, "in": 0.14892967987303665, "dubbed": 0.2756642280562043, "travelled": 0.2756642280562043}, "27": {"under": 0.2688276067869392, "at": 0.18918000324004502, "competition": 0.2394320461184441, "each": 0.5376552135738784, "category": 0.2185755639085401, "pasty": 0.10189393372370094, "close": 0.2688276067869392, "scrutiny": 0.2688276067869392, "pick": 0.2688276067869392, 
"winner": 0.2394320461184441, "the": 0.18352284057293303, "to": 0.11807147815174199, "in": 0.1452361436306873, "tastiest": 0.2688276067869392, "went": 0.2688276067869392}, "28": {"held": 0.24904508078762472, "cornwall": 0.19178788398757068, "celebrate": 0.24904508078762472, "was": 0.21555176776318172, "globe": 0.24904508078762472, "for": 0.19178788398757068, "pasty": 0.23219597380616883, "delicacy": 0.24904508078762472, "championships": 0.19178788398757068, "made": 0.24904508078762472, "british": 0.24904508078762472, "the": 0.20910599446507794, "are": 0.30630227758767875, "world": 0.21555176776318172, "to": 0.1345306871875166, "in": 0.16548211738308755, "which": 0.30630227758767875, "huge": 0.30630227758767875}}

In [3]:
from dataset_management.graph_constructor import CNNDailyMailConstructor

ModuleNotFoundError: No module named 'torch_geometric'