In [2]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import transformers
import torch
import numpy as np
import pandas as pd
from nltk.corpus import stopwords 

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
df = pd.read_csv('data.csv')

def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    word_tokens = sentence.split(" ")
    filtered_sentence = [w.lower() for w in word_tokens if not w in stop_words and w.isalpha()]
    return ' '.join(filtered_sentence)

df['Sentence'] = df['Sentence'].apply(remove_stopwords)
data = df['Sentence'].tolist()
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,the cardiff roller collective roller sports le...,11
1,1,pack fight song green bay first,11
2,2,the journal founded jesuit chaldean,14
3,3,ajman international airport مطار عجمان upcomin...,0
4,4,kapla construction set children the sets consi...,4


In [4]:
import random
random_seed = 42
random.seed(random_seed)

In [5]:
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [7]:
tokenizer.tokenize(data[0])

['the',
 'cardiff',
 'roller',
 'collective',
 'roller',
 'sports',
 'league',
 'based',
 'founded']

In [14]:
encoding = tokenizer.batch_encode_plus(
            data,
            padding=True,
            truncation=True,
            return_tensors='pt',
            add_special_tokens=True
)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state

print(word_embeddings)


tensor([[[-0.4290,  0.1327,  0.0481,  ..., -0.4838,  0.2110,  0.2386],
         [-0.0974, -0.2360, -0.7596,  ..., -0.1143,  0.4335, -0.1886],
         [ 0.2341,  0.1203,  0.6591,  ..., -0.3528, -0.5009, -1.0290],
         ...,
         [ 0.0519, -0.2148,  0.4049,  ...,  0.0147, -0.1591,  0.0069],
         [-0.0186, -0.1655,  0.4328,  ..., -0.0134, -0.1543, -0.0319],
         [-0.0638, -0.1615,  0.4873,  ..., -0.0411, -0.1506,  0.0063]],

        [[-0.5071, -0.0117, -0.0223,  ..., -0.2034,  0.2611,  0.3132],
         [ 0.1913, -0.1086,  0.1273,  ...,  0.0740, -0.0250,  0.0762],
         [ 0.0119, -0.1807,  0.0075,  ...,  0.1420, -0.6898,  0.2468],
         ...,
         [-0.1719, -0.0975,  0.0186,  ...,  0.1558, -0.0718,  0.1889],
         [-0.0336, -0.3679, -0.6167,  ...,  0.4977, -0.0801, -0.1862],
         [-0.0523, -0.1249, -0.2135,  ...,  0.2645, -0.1796,  0.0445]],

        [[-0.5199,  0.1083, -0.2523,  ..., -0.4810,  0.7192,  0.1198],
         [-0.9432, -0.2891, -1.0047,  ...,  0

In [16]:
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
tokenized_text = tokenizer.tokenize(decoded_text)
encoded_text = tokenizer.encode(data, return_tensors='pt')

In [10]:
for token, embedding in zip(tokenized_text, word_embeddings):
    print(token, embedding)

the tensor([[-0.4290,  0.1327,  0.0481,  ..., -0.4838,  0.2110,  0.2386],
        [-0.0974, -0.2360, -0.7596,  ..., -0.1143,  0.4335, -0.1886],
        [ 0.2341,  0.1203,  0.6591,  ..., -0.3528, -0.5009, -1.0290],
        ...,
        [ 0.0519, -0.2148,  0.4049,  ...,  0.0147, -0.1591,  0.0069],
        [-0.0186, -0.1655,  0.4328,  ..., -0.0134, -0.1543, -0.0319],
        [-0.0638, -0.1615,  0.4873,  ..., -0.0411, -0.1506,  0.0063]])
cardiff tensor([[-0.5071, -0.0117, -0.0223,  ..., -0.2034,  0.2611,  0.3132],
        [ 0.1913, -0.1086,  0.1273,  ...,  0.0740, -0.0250,  0.0762],
        [ 0.0119, -0.1807,  0.0075,  ...,  0.1420, -0.6898,  0.2468],
        ...,
        [-0.1719, -0.0975,  0.0186,  ...,  0.1558, -0.0718,  0.1889],
        [-0.0336, -0.3679, -0.6167,  ...,  0.4977, -0.0801, -0.1862],
        [-0.0523, -0.1249, -0.2135,  ...,  0.2645, -0.1796,  0.0445]])
roller tensor([[-0.5199,  0.1083, -0.2523,  ..., -0.4810,  0.7192,  0.1198],
        [-0.9432, -0.2891, -1.0047,  ...,  

In [11]:
for i, token_str in enumerate(tokenized_text):
    print(i, token_str)

0 the
1 cardiff
2 roller
3 collective
4 roller
5 sports
6 league
7 based
8 founded


In [26]:
# get word embeddings (did this for the cosine similarity)
def get_word_embeddings(sentences, tokenizer, model):
    word_embeddings = {}
    for sentence in sentences:
        words = sentence.split()
        for word in words:
            if word not in word_embeddings:
                inputs = tokenizer(word, 
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    add_special_tokens=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                word_embeddings[word] = outputs.last_hidden_state[0][0].numpy()
    return word_embeddings

# Get word embeddings for all words in the dataset
word_embeddings = get_word_embeddings(data, tokenizer, model)
words = list(word_embeddings.keys())
embeddings = np.array([word_embeddings[word] for word in words]).astype('float32')



In [23]:
# compute cosine similarity 
def cosine_similarity(embedding1, embedding2):
    similarity = cosine_similarity([embedding1], [embedding2])
    return similarity[0][0]

word1 = 'born'
word2 = 'Arabic'

# Retrieve embeddings 
embedding1 = word_embeddings.get(word1)
embedding2 = word_embeddings.get(word2)

if embedding1 is not None and embedding2 is not None:
    similarity = cosine_similarity(embedding1, embedding2)
    print(f"Similarity between '{word1}' and '{word2}': {similarity}")
else:
    missing_words = [word for word in [word1, word2] if word not in word_embeddings]
    print(f"Embeddings not found: {', '.join(missing_words)}")

Similarity between 'born' and 'Arabic': 0.8113988041877747


In [30]:
# Faiss
import faiss

d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

def faiss_search(word, word_embeddings, index, words, k=5):
    if word not in word_embeddings:
        return f"Embedding for '{word}' not found."
    embedding = np.array([word_embeddings[word]]).astype('float32')
    d, I = index.search(embedding, k)
    similar_words = [words[i] for i in I[0]]
    return similar_words

print(faiss_search('born', word_embeddings, index, words, k=10))

['born', 'received', 'represented', 'served', 'known', 'created', 'accepted', 'named', 'person', 'announced']


In [29]:
print(faiss_search('arabic', word_embeddings, index, words, k=10))

['arabic', 'english', 'egyptian', 'kurdish', 'spanish', 'albanian', 'french', 'urdu', 'greek', 'danish']
