# Compare two ways of computing sentence embeddings: do they give the same result?

In [35]:
from sentence_transformers import SentenceTransformer
# Load the pretrained 'all-MiniLM-L6-v2' checkpoint through the high-level
# SentenceTransformer wrapper; `model.encode(...)` is used on it below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [36]:
# Example inputs for the embedding model
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# A single encode() call embeds the whole list
embeddings = model.encode(sentences)

# Show every sentence next to its embedding vector, separated by a blank line.
# The loop names `sentence` / `embedding` are inspected by later cells, so they
# are kept as-is.
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print()

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173468e-02 -4.28515822e-02 -1.56286396e-02  1.40537601e-02
  3.95537689e-02  1.21796273e-01  2.94333566e-02 -3.17524076e-02
  3.54959667e-02 -7.93140009e-02  1.75878331e-02 -4.04369794e-02
  4.97259982e-02  2.54912656e-02 -7.18699992e-02  8.14968273e-02
  1.47072971e-03  4.79627140e-02 -4.50335816e-02 -9.92174894e-02
 -2.81769410e-02  6.45046160e-02  4.44670692e-02 -4.76217382e-02
 -3.52952257e-02  4.38671745e-02 -5.28565869e-02  4.33028938e-04
  1.01921476e-01  1.64071992e-02  3.26996557e-02 -3.45986784e-02
  1.21339448e-02  7.94871077e-02  4.58341092e-03  1.57778561e-02
 -9.68207605e-03  2.87626237e-02 -5.05806431e-02 -1.55793866e-02
 -2.87907384e-02 -9.62279178e-03  3.15556526e-02  2.27349345e-02
  8.71449709e-02 -3.85027751e-02 -8.84718895e-02 -8.75495560e-03
 -2.12343261e-02  2.08923966e-02 -9.02078152e-02 -5.25732562e-02
 -1.05638504e-02  2.88311224e-02 -1.61454976e-02  6.17839908e-03
 -1.23234

In [4]:
# `sentence` is the loop variable left over from the print loop, so it holds
# the last element of `sentences`.
sentence

'The quick brown fox jumps over the lazy dog.'

In [6]:
# `embedding` is the last value bound by the print loop; its shape gives the
# size of a single sentence embedding.
embedding.shape

(384,)

In [21]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load model from HuggingFace Hub
# The tokenizer and the bare transformer are loaded separately so that pooling
# and normalisation can be done by hand in the following cells.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# NOTE(review): this rebinds `model`, replacing the SentenceTransformer instance
# loaded at the top of the notebook. Any later cell that expects `model.encode`
# needs the SentenceTransformer instance instead — consider a distinct name here.
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


In [22]:
# Tokenize the whole batch at once and return PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    max_length=120,       # cap on tokens per sentence; applies together with truncation
    padding=True,
    truncation=True,
    return_tensors='pt',
)


In [23]:
# Inspect the tokenizer output: `input_ids`, `token_type_ids` and
# `attention_mask`, one row per sentence; shorter rows are filled with 0s.
encoded_input

{'input_ids': tensor([[  101,  2023,  7705, 19421,  7861,  8270,  4667,  2015,  2005,  2169,
          7953,  6251,   102],
        [  101, 11746,  2024,  2979,  2004,  1037,  2862,  1997,  5164,  1012,
           102,     0,     0],
        [  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [24]:
# Forward pass for inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Unpack the tokenizer output so each entry is passed as a keyword argument
    model_output = model(**encoded_input)


In [25]:
# Mean pooling: average token embeddings, counting only unmasked positions
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask tensor; after a trailing axis is added it is
            expanded to the size of the token embeddings. Positions with 0
            contribute nothing to the average.

    Returns:
        The token embeddings summed over dimension 1 and divided by the
        per-sequence mask sum (floored at 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension and switch to float
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts


In [26]:
# Per-token outputs of the transformer, accessed by key; these are the values
# that get pooled into one vector per sentence.
model_output['last_hidden_state']

tensor([[[ 0.2913, -0.2685, -0.2250,  ...,  0.4261,  0.0493, -0.2095],
         [-0.6272, -0.0421, -0.2452,  ...,  0.5336,  1.3115,  0.5999],
         [ 0.0023, -0.2805, -0.4198,  ..., -0.2900,  1.5808, -0.4912],
         ...,
         [ 0.1802, -0.5567,  0.0146,  ...,  0.9311,  0.5940, -0.3536],
         [ 0.0603, -0.2502,  0.5959,  ...,  0.9435,  0.9465, -1.0680],
         [-0.3356,  0.0650,  0.1109,  ...,  1.0801,  0.2653, -0.2762]],

        [[ 0.0856,  0.1876,  0.0488,  ...,  0.1204, -0.0907, -0.1662],
         [ 0.1291, -0.0266,  0.6318,  ...,  0.7958,  0.1555, -1.2737],
         [ 0.0062,  0.2263,  0.1851,  ...,  0.3981,  0.6461, -0.2192],
         ...,
         [ 0.3036,  0.3740,  0.2523,  ...,  0.6319,  0.5731, -0.2901],
         [-0.2124,  0.2626,  0.6867,  ...,  0.5504,  0.7065, -0.4728],
         [-0.2220,  0.2086,  0.6693,  ...,  0.5410,  0.5683, -0.3963]],

        [[ 0.0464,  0.3381,  0.2082,  ...,  0.2766, -0.0861, -0.0358],
         [ 0.1162,  0.2264,  0.1021,  ...,  0

In [27]:
# Collapse the token embeddings into one vector per sentence (mask-aware mean)
pooled = mean_pooling(model_output, encoded_input['attention_mask'])

# Scale every sentence vector to unit L2 norm
sentence_embeddings = F.normalize(pooled, p=2, dim=1)

print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
tensor([[-0.0137, -0.0429, -0.0156,  ...,  0.1002,  0.1237, -0.0423],
        [ 0.0565,  0.0550,  0.0314,  ...,  0.0665,  0.0849, -0.0333],
        [ 0.0439,  0.0589,  0.0482,  ...,  0.0522,  0.0561,  0.1021]])


In [33]:
# Manually pooled and normalised embedding of the first sentence, shown in full
# for comparison with the `encode()` output.
sentence_embeddings[0]

tensor([-1.3717e-02, -4.2852e-02, -1.5629e-02,  1.4054e-02,  3.9554e-02,
         1.2180e-01,  2.9433e-02, -3.1752e-02,  3.5496e-02, -7.9314e-02,
         1.7588e-02, -4.0437e-02,  4.9726e-02,  2.5491e-02, -7.1870e-02,
         8.1497e-02,  1.4707e-03,  4.7963e-02, -4.5034e-02, -9.9217e-02,
        -2.8177e-02,  6.4505e-02,  4.4467e-02, -4.7622e-02, -3.5295e-02,
         4.3867e-02, -5.2857e-02,  4.3306e-04,  1.0192e-01,  1.6407e-02,
         3.2700e-02, -3.4599e-02,  1.2134e-02,  7.9487e-02,  4.5835e-03,
         1.5778e-02, -9.6821e-03,  2.8763e-02, -5.0581e-02, -1.5579e-02,
        -2.8791e-02, -9.6228e-03,  3.1556e-02,  2.2735e-02,  8.7145e-02,
        -3.8503e-02, -8.8472e-02, -8.7550e-03, -2.1234e-02,  2.0892e-02,
        -9.0208e-02, -5.2573e-02, -1.0564e-02,  2.8831e-02, -1.6146e-02,
         6.1784e-03, -1.2323e-02, -1.0734e-02,  2.8335e-02, -5.2857e-02,
        -3.5862e-02, -5.9799e-02, -1.0906e-02,  2.9157e-02,  7.9798e-02,
        -3.2787e-04,  6.8350e-03,  1.3272e-02, -4.2

In [39]:
# `model` was rebound to the raw Hugging Face `AutoModel` when the manual
# pipeline was set up, and that object has no `encode()` method. Load the
# SentenceTransformer wrapper under its own name so this comparison also works
# when the notebook is executed top to bottom.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
st_model.encode(sentences[0])

array([-1.37173338e-02, -4.28515710e-02, -1.56286471e-02,  1.40537657e-02,
        3.95537540e-02,  1.21796280e-01,  2.94333957e-02, -3.17523666e-02,
        3.54959443e-02, -7.93139935e-02,  1.75878275e-02, -4.04369980e-02,
        4.97259684e-02,  2.54912954e-02, -7.18699768e-02,  8.14968571e-02,
        1.47071178e-03,  4.79627401e-02, -4.50335778e-02, -9.92174894e-02,
       -2.81769447e-02,  6.45045936e-02,  4.44670618e-02, -4.76217158e-02,
       -3.52952294e-02,  4.38671671e-02, -5.28566092e-02,  4.33034380e-04,
        1.01921476e-01,  1.64072346e-02,  3.26996446e-02, -3.45986709e-02,
        1.21339560e-02,  7.94871375e-02,  4.58342489e-03,  1.57778393e-02,
       -9.68209561e-03,  2.87626348e-02, -5.05806506e-02, -1.55794034e-02,
       -2.87907310e-02, -9.62279923e-03,  3.15556377e-02,  2.27349326e-02,
        8.71449634e-02, -3.85027602e-02, -8.84718969e-02, -8.75494629e-03,
       -2.12343261e-02,  2.08924171e-02, -9.02078077e-02, -5.25732487e-02,
       -1.05638541e-02,  

In [42]:
# Use a SentenceTransformer instance explicitly: in a top-to-bottom run `model`
# refers to the raw `AutoModel` from the manual pipeline, which has no
# `encode()` method.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
# First 20 components only, for a side-by-side check against the manual result
st_model.encode(sentences[0])[:20]

array([-0.01371733, -0.04285157, -0.01562865,  0.01405377,  0.03955375,
        0.12179628,  0.0294334 , -0.03175237,  0.03549594, -0.07931399,
        0.01758783, -0.040437  ,  0.04972597,  0.0254913 , -0.07186998,
        0.08149686,  0.00147071,  0.04796274, -0.04503358, -0.09921749],
      dtype=float32)

In [43]:
# The same 20 leading components from the manual pipeline; at the displayed
# precision they agree with the encode() values.
sentence_embeddings[0][:20]

tensor([-0.0137, -0.0429, -0.0156,  0.0141,  0.0396,  0.1218,  0.0294, -0.0318,
         0.0355, -0.0793,  0.0176, -0.0404,  0.0497,  0.0255, -0.0719,  0.0815,
         0.0015,  0.0480, -0.0450, -0.0992])

In [29]:
# Attention mask on its own: the 0 entries line up with the trailing 0 ids in
# `input_ids`, i.e. the padded positions.
encoded_input['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])

In [32]:
# Positional access: element 0 of the model output displays the same values as
# the 'last_hidden_state' key; this is what mean_pooling reads.
model_output[0]

tensor([[[ 0.2913, -0.2685, -0.2250,  ...,  0.4261,  0.0493, -0.2095],
         [-0.6272, -0.0421, -0.2452,  ...,  0.5336,  1.3115,  0.5999],
         [ 0.0023, -0.2805, -0.4198,  ..., -0.2900,  1.5808, -0.4912],
         ...,
         [ 0.1802, -0.5567,  0.0146,  ...,  0.9311,  0.5940, -0.3536],
         [ 0.0603, -0.2502,  0.5959,  ...,  0.9435,  0.9465, -1.0680],
         [-0.3356,  0.0650,  0.1109,  ...,  1.0801,  0.2653, -0.2762]],

        [[ 0.0856,  0.1876,  0.0488,  ...,  0.1204, -0.0907, -0.1662],
         [ 0.1291, -0.0266,  0.6318,  ...,  0.7958,  0.1555, -1.2737],
         [ 0.0062,  0.2263,  0.1851,  ...,  0.3981,  0.6461, -0.2192],
         ...,
         [ 0.3036,  0.3740,  0.2523,  ...,  0.6319,  0.5731, -0.2901],
         [-0.2124,  0.2626,  0.6867,  ...,  0.5504,  0.7065, -0.4728],
         [-0.2220,  0.2086,  0.6693,  ...,  0.5410,  0.5683, -0.3963]],

        [[ 0.0464,  0.3381,  0.2082,  ...,  0.2766, -0.0861, -0.0358],
         [ 0.1162,  0.2264,  0.1021,  ...,  0

# Use this Embedding

In [44]:
from sentence_transformers import SentenceTransformer
# Rebind `model` to the SentenceTransformer wrapper (the manual comparison had
# pointed it at the raw AutoModel); per the section heading this is the
# embedding approach used from here on.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [48]:
# Reference: https://www.sbert.net/index.html

# Example sentences to embed
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# One encode() call handles the whole list
embeddings = model.encode(sentences)


In [47]:
# Sanity check: one row per input sentence, one column per embedding dimension.
embeddings.shape

(3, 384)

# Load Data

In [50]:
import sys
import os


# Add the parent directory to the import path — presumably so the project-local
# `src` package imported below can be found; verify against the repo layout.
sys.path.append("..")


import pandas as pd
import numpy as np

import random
import itertools
from sklearn import metrics
from tqdm.auto import tqdm

from src.data_process import load_wls_adress_AddDomain

In [51]:
# Load the two datasets through the project helper, selected by the `dt` argument.
# NOTE(review): the helper's return type is not visible here — presumably
# pandas DataFrames; confirm in src/data_process.
df_wls_merge = load_wls_adress_AddDomain(dt='wls')
df_adress = load_wls_adress_AddDomain(dt='adress')

In [52]:
# NOTE(review): `df_all` is not assigned in any cell shown before this one (only
# `df_wls_merge` and `df_adress` are), so this fails on a fresh kernel unless it
# is defined elsewhere — TODO confirm and add the defining cell.
# The displayed table contains per-participant records; consider `.head()` and
# clearing the output before sharing.
df_all

Unnamed: 0,file,text,idtlkbnk,age 2011,education,"category fluency version, 2011","category fluency, scored words named, 2011","> 1 sd below mean for normals ages 60-79 (Tombaugh, Kozak, & Rees, 1999) -- normal cutoff = 12+ for 9-12 yrs eductation, 14+ for 13-21 yrs education",label,domain,domain_index
0,2000015784,Well i see a boy taking a cookie outof a cooki...,2000015784,70,12,2,9,Y,1,wls,0
1,2000015088,The mother's doing dishes. She's naughty. She'...,2000015088,70,12,1,10,Y,1,wls,0
2,2000015573,Okay. Somebody is into the cookie jar and he's...,2000015573,71,12,2,14,N,0,wls,0
3,2000015045,Okay. There's a young boy up on a chair that's...,2000015045,72,14,1,22,N,0,wls,0
4,2000015213,Kids are in the cookie jar. And the child is r...,2000015213,63,12,2,16,N,0,wls,0
...,...,...,...,...,...,...,...,...,...,...,...
1355,2000017316,Well the kid's gonna fall off the stool with t...,2000017316,70,18,2,24,N,0,wls,0
1356,2000017754,There is a kid going after a cookie jar standi...,2000017754,72,12,2,20,N,0,wls,0
1357,2000017794,I see mom doing dishes and spilling water on t...,2000017794,78,12,2,12,N,0,wls,0
1358,2000017790,Two children. One child on a stool that's tipp...,2000017790,64,18,1,14,N,0,wls,0
