In [14]:
from math import sqrt

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoConfig
# Checkpoint name and its configuration object. The classes below read their
# sizes from `config` (hidden_size, num_attention_heads, intermediate_size,
# num_hidden_layers, vocab_size, max_position_embeddings, hidden_dropout_prob).
model_ckpt = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_ckpt)


def sdp_attention(query, key, value):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Works on any number of leading batch dimensions (the original
    ``torch.bmm`` version accepted exactly 3-D tensors only); 3-D inputs
    give the same result as before.

    Args:
        query: tensor of shape (..., seq_q, d_k).
        key: tensor of shape (..., seq_k, d_k).
        value: tensor of shape (..., seq_k, d_v).

    Returns:
        Tensor of shape (..., seq_q, d_v): for every query position, the
        softmax-weighted sum of the value vectors.
    """
    dim_k = query.size(-1)  # size of the feature axis used for scaling
    # Transpose the last two axes so the product contracts over d_k.
    scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt(dim_k)
    # Normalise over the key axis so each query's weights sum to 1.
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

'''

Attention Class

# nn.Linear : applies a linear transformation to the incoming data
#             y = x * A^T + b
# where x is the input, A is the weight matrix, b is the bias and y is the output

# calculate scaled dot product attention matrix
# Requires embedding dimension 
# Each attention head is made of different q,k,v vectors

'''

class Attention(nn.Module):
    """A single self-attention head.

    The hidden state is projected three times (query, key, value) by
    independent linear layers, and the projections are combined with
    ``sdp_attention``.
    """

    def __init__(self, embed_dim, head_dim):
        """Build the q/k/v projections, each mapping embed_dim -> head_dim."""
        super().__init__()
        self.q = nn.Linear(in_features=embed_dim, out_features=head_dim)
        self.k = nn.Linear(in_features=embed_dim, out_features=head_dim)
        self.v = nn.Linear(in_features=embed_dim, out_features=head_dim)

    def forward(self, hidden_state):
        """Return the scaled dot-product attention of ``hidden_state`` with itself."""
        # All three projections come from the same input (self-attention).
        query = self.q(hidden_state)
        key = self.k(hidden_state)
        value = self.v(hidden_state)
        return sdp_attention(query, key, value)
    

    
'''

Multihead attention class

'''


class multiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent ``Attention`` heads.

    Each head projects the hidden state down to ``hidden_size //
    num_attention_heads`` features; the head outputs are concatenated on the
    last axis and passed through a final linear layer.

    Args:
        config: object exposing ``hidden_size`` and ``num_attention_heads``.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of
            ``num_attention_heads``. Without this check the floor division
            below silently drops features and the mismatch only surfaces
            later, as a shape error inside ``out_linear``.
    """

    def __init__(self, config):
        super().__init__()

        # Model parameters, read from the config object.
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                "hidden_size (%d) must be a multiple of num_attention_heads (%d)"
                % (embed_dim, num_heads)
            )
        head_dim = embed_dim // num_heads

        # One attention head per slice of the embedding dimension.
        self.heads = nn.ModuleList(
            [Attention(embed_dim, head_dim) for _ in range(num_heads)])

        # Output projection over the concatenated heads (embed_dim wide).
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        """Apply every head to ``hidden_state``, concatenate, then project."""
        head_outputs = [head(hidden_state) for head in self.heads]
        x = torch.cat(head_outputs, dim=-1)  # merge head outputs on the feature axis
        return self.out_linear(x)
    
    


class feedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear -> Dropout.

    Sizes come from ``config.hidden_size`` and ``config.intermediate_size``;
    the dropout probability from ``config.hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.linear1 = nn.Linear(hidden, inner)   # expand
        self.linear2 = nn.Linear(inner, hidden)   # project back
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Run ``x`` through the expand / activate / project / dropout pipeline."""
        expanded = self.linear1(x)
        activated = self.gelu(expanded)
        projected = self.linear2(activated)
        return self.dropout(projected)
    
    
class encoderLayer(nn.Module):
    """One encoder block: pre-LayerNorm attention and feed-forward sub-layers,
    each wrapped in a residual (skip) connection.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.norm1 = nn.LayerNorm(width)
        self.norm2 = nn.LayerNorm(width)
        self.attention = multiHeadAttention(config)   # multi-head attention sub-layer
        self.feed_forward = feedForward(config)       # feed-forward sub-layer

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Normalise first, attend, then add the result back onto the input.
        attended = self.attention(self.norm1(x))
        x = x + attended

        # Same pattern for the feed-forward sub-layer.
        transformed = self.feed_forward(self.norm2(x))
        return x + transformed
    
    
'''

Token + Position Embedding 


'''

class tpEmbedding(nn.Module):
    """Token + position embedding layer.

    Looks up one embedding per token id and one per position index, adds
    them, then applies LayerNorm and dropout.

    Args:
        config: object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()

        # Token embedding table: one row per vocabulary entry.
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)

        # Position embedding table: one row per position, up to
        # config.max_position_embeddings.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)

        self.norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        # Use the configured dropout rate, like feedForward and
        # TransformerEncoder do; a bare nn.Dropout() would default to p=0.5.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        """Embed ``input_ids``; the sequence length is read from axis 1.

        Returns a tensor with one ``hidden_size`` vector per input token.
        """
        # Position ids 0..seq_length-1 with a leading axis of size 1 so they
        # broadcast over the batch. They are created on the same device as
        # the input, so the lookup also works when the model is not on CPU.
        seq_length = input_ids.size(1)
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)

        # e.g. input_ids    tensor([[ 1996, 11286,  1997,  1037,  5340,  3392,  2003,  2200,  5931]])
        #      position_ids tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]])

        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        # Combine, then normalise and regularise.
        embeddings = token_embeddings + position_embeddings
        embeddings = self.norm(embeddings)
        return self.dropout(embeddings)
    
    
    
# Full transformer encoder: combines the embedding layer (`tpEmbedding`) with the stack of encoder layers (`encoderLayer`)

class TransformerEncoder(nn.Module):
    """Embedding layer followed by ``config.num_hidden_layers`` encoder layers.

    ``forward`` returns the (dropout-regularised) hidden state at sequence
    position 0 for every item of the batch.
    """

    def __init__(self, config):
        super().__init__()

        # Token and position embeddings.
        self.embeddings = tpEmbedding(config)

        # Stack of attention + feed-forward encoder layers.
        depth = config.num_hidden_layers
        self.layers = nn.ModuleList([encoderLayer(config) for _ in range(depth)])

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Encode token ids ``x`` and return the position-0 hidden state."""
        hidden = self.embeddings(x)

        # Pass the hidden state through every encoder layer in order.
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)

        # Keep only position 0 (the [CLS] slot when the tokenizer adds
        # special tokens).
        first_token = hidden[:, 0, :]
        return self.dropout(first_token)

In [163]:
from transformers import AutoTokenizer  
import torch
import pandas as pd
from math import sqrt
import torch.nn.functional as F

class TrainEmbeddings():
    """Embeds text with a freshly constructed ``TransformerEncoder``.

    The encoder is built from ``config`` with newly initialised weights; no
    pretrained weights are loaded in this class. Only the tokenizer comes
    from the ``model_ckpt`` checkpoint.
    """

    def __init__(self):
        self.model = TransformerEncoder(config)
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def getTensor(self, text):
        """Tokenise ``text`` (a string or list of strings) into an id tensor.

        NOTE(review): special tokens are disabled here, yet
        ``TransformerEncoder.forward`` takes position 0 as the "[CLS]" state;
        with this setting position 0 is the first ordinary token. Padding is
        also enabled but no attention mask is passed on. Confirm both are
        intended.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",        # PyTorch tensors
            add_special_tokens=False,   # no special tokens such as [CLS]/[SEP]
            padding=True,               # pad so the batch is rectangular
        )
        return inputs.input_ids

    def train(self, data, epochs=100):
        """Run ``epochs`` forward passes over ``data`` in training mode.

        NOTE: no loss is computed and no optimizer step is taken, so this
        loop does not change the model's weights; it only prints progress
        every 10 epochs.
        """
        self.model.train()
        tensorData = self.getTensor(data)
        for epoch in range(epochs):
            if epoch % 10 == 0:
                print("epoch %s" % epoch)
            # Call the module itself rather than .forward() so that
            # nn.Module hooks are honoured.
            self.model(tensorData)

    def getEmbeddings(self, q):
        """Return the encoder output for ``q`` as a NumPy array (eval mode, no grad)."""
        self.model.eval()
        inputs = self.getTensor(q)
        with torch.no_grad():
            return self.model(inputs).cpu().detach().numpy()
        

# Load the first 100 company names from the ticker CSV and run the encoder's
# train() loop over them. `data` and `train` are reused by later cells.
data = list(pd.read_csv("SEC-CompanyTicker.csv",index_col=0).companyName[:100])
train = TrainEmbeddings()
train.train(data)


epoch 0
epoch 10
epoch 20
epoch 30
epoch 40
epoch 50
epoch 60
epoch 70
epoch 80
epoch 90


In [162]:
from gensim.models import Word2Vec
import numpy as np
import random

class Word2VecEmbeddings():
    """Word2Vec-backed embeddings where every input string is one vocabulary key.

    Vectors are looked up by key, so every string that will later be passed
    to ``getEmbeddings`` (corpus entries and queries) must be part of the
    data given to ``train``.
    """

    def train(self, data, epochs=100):
        """Build the vocabulary from ``data`` and train for ``epochs`` passes.

        Each item of ``data`` becomes its own one-token sentence. The loop
        trains on those same sentences, one epoch per iteration; the previous
        version trained on the hard-coded ``[["shell"]]`` with 100 inner
        epochs per iteration, so the corpus was never revisited.

        NOTE(review): with one-token sentences skip-gram has no context
        pairs to learn from, so vectors presumably stay close to their
        initial values. Confirm this is the intended setup.
        """
        sentences = [[x] for x in data]
        self.model = Word2Vec(sentences,
                              min_count=1, vector_size=768,
                              window=5, sg=1)
        for epoch in range(epochs):
            if epoch % 10 == 0:
                print("epoch %s" % epoch)
            self.model.train(sentences,
                             total_examples=self.model.corpus_count,
                             epochs=1)

    def getEmbeddings(self, q):
        """Return the vectors of the keys in ``q`` as one array, one row per key."""
        return np.array([self.model.wv.get_vector(val) for val in q])

    def getKeys(self):
        """Return the model's vocabulary keys as a list."""
        return list(self.model.wv.index_to_key)

# Reload the first 100 company names and train Word2Vec on them.
# NOTE: this rebinds `data` and `train`, so later cells see the Word2Vec object.
data = list(pd.read_csv("SEC-CompanyTicker.csv",index_col=0).companyName[:100])
train = Word2VecEmbeddings()
# The query string is added to the training data because getEmbeddings looks
# vectors up by key.
FullKeys = data + ["shell"]
train.train(FullKeys)
xq = train.getEmbeddings(["shell"])   # query vector(s)
xb = train.getEmbeddings(data)        # corpus vectors, one row per company name
xq

epoch 0
epoch 10
epoch 20
epoch 30
epoch 40
epoch 50
epoch 60
epoch 70
epoch 80
epoch 90


array([[-6.98212534e-05,  3.07853334e-05,  6.64498657e-04,
         1.17308239e-03, -1.21132156e-03, -9.26667824e-04,
         8.40999011e-04,  1.16835779e-03, -6.53050549e-04,
        -4.90022358e-04,  9.61003185e-04, -1.99670743e-04,
        -5.90704847e-04,  8.53392121e-04, -6.32833398e-04,
        -2.36460633e-04,  3.74554656e-04,  1.29150227e-04,
        -1.07880402e-03, -1.23031484e-03,  9.52052884e-04,
         6.60190359e-04,  8.79907981e-04,  9.93314534e-05,
         8.26938835e-04, -4.43407014e-04, -1.23229343e-04,
         7.51116313e-04, -9.79379867e-04, -5.12513507e-04,
        -9.78070660e-04, -1.21099256e-04,  1.24194252e-03,
        -9.53016512e-04, -3.03876121e-04, -2.52310041e-04,
         1.05174957e-03, -7.72252039e-04,  5.88052580e-06,
        -6.18975784e-04, -1.25046226e-03,  6.51991286e-04,
        -1.14057108e-03, -5.71852259e-04, -4.57031047e-06,
        -3.85652929e-05, -9.97557305e-04,  1.25191966e-03,
         6.48705463e-04,  1.20223220e-03, -1.06222881e-0

In [169]:
from openai import OpenAI
from openai import OpenAI
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
# Read the API key from the environment (load_dotenv() is called first,
# presumably to pick up a local .env file) instead of hard-coding it.
load_dotenv()
key = os.environ.get("OPENAI_KEY")

# Module-level OpenAI client built from that key.
# NOTE(review): OpenAIEmbeddings below creates its own client, and this one is
# not used anywhere else in the visible cells.
client = OpenAI(api_key=key)


class OpenAIEmbeddings:
    """Fetches text embeddings from the OpenAI embeddings endpoint.

    Args:
        api_key: key used to construct the OpenAI client.
        model: embedding model name. Defaults to the previously hard-coded
            'text-embedding-ada-002', so existing callers are unaffected.
    """

    def __init__(self, api_key, model='text-embedding-ada-002'):
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def getEmbeddings(self, text_list):
        """Embed every string in ``text_list``.

        Returns:
            NumPy array with one row per item returned by the API.
        """
        response = self.client.embeddings.create(input=text_list, model=self.model)
        return np.array([item.embedding for item in response.data])
    
    
# Smoke test: embed a single short string and display the resulting array.
e = OpenAIEmbeddings(key)
e.getEmbeddings(["hi"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


array([[-0.03503197, -0.02060164, -0.01537573, ..., -0.01162699,
        -0.00087646,  0.00465802]])

In [154]:
import faiss
    
class Faiss:
    """Small helper for building and querying a FAISS HNSW index."""

    def __init__(self):
        pass

    def faiss(self, xb, M=32, ef_construction=40, ef_search=16):
        """Build an ``IndexHNSWFlat`` over the row vectors in ``xb``.

        Args:
            xb: 2-D array-like of shape (n_vectors, dim). It is converted to
                a contiguous float32 array, so float64 input (e.g. arrays
                built from Python floats) is accepted.
            M: passed as the second argument of ``IndexHNSWFlat``.
            ef_construction: value assigned to ``index.hnsw.efConstruction``.
            ef_search: value assigned to ``index.hnsw.efSearch``.

        Returns:
            The populated index.

        Raises:
            ValueError: if ``xb`` is empty or not 2-D.
        """
        xb = np.ascontiguousarray(xb, dtype=np.float32)
        if xb.ndim != 2 or xb.shape[0] == 0:
            raise ValueError(
                "xb must be a non-empty 2-D array of shape (n_vectors, dim), "
                "got shape %s" % (xb.shape,)
            )
        d = xb.shape[1]  # vector dimensionality
        index = faiss.IndexHNSWFlat(d, M)
        index.hnsw.efConstruction = ef_construction
        index.hnsw.efSearch = ef_search
        index.add(xb)
        return index

    def query(self, index, xq, k=3):
        """Search ``index`` for the ``k`` nearest neighbours of each row of ``xq``.

        ``xq`` is converted to contiguous float32 and a single 1-D vector is
        promoted to one row. Returns the ``(D, I)`` pair produced by
        ``index.search``.
        """
        xq = np.atleast_2d(np.ascontiguousarray(xq, dtype=np.float32))
        D, I = index.search(xq, k)
        return D, I

# Embed the query and the corpus with whichever embeddings object `train`
# is currently bound to, index the corpus and look up the nearest names.
xq = train.getEmbeddings(["shell"])
xb = train.getEmbeddings(data)
index = Faiss().faiss(xb)
D, I = Faiss().query(index, xq)
I = I[0]  # neighbour indices for the single query row
# One name per line. The previous print mixed "+" with "," and so inserted a
# stray space after the second result; it also assumed exactly three results.
print("\n".join(data[i] for i in I))

Tesla, Inc.
Salesforce, Inc. 
Qualcomm Inc/De


In [126]:
# Grab the underlying model from whichever embeddings object `train` is
# currently bound to (depends on which earlier cell ran last).
model = train.model

In [128]:
# Wrap every entry in its own single-item list (one-token "sentences").
# NOTE: this rebinds `data`, so re-running the cell nests the lists one level deeper.
data = [[x] for x in data]

In [130]:
# Manually continue training `model` on the wrapped `data`, one epoch per
# iteration, reusing the corpus size recorded on the model.
num_epochs = 5  # Adjust as needed
for epoch in range(num_epochs):
    model.train(data, total_examples=model.corpus_count, epochs=1)

In [None]:
        # NOTE(review): unexecuted scratch fragment of a method body (it uses
        # `self` and `epochs`, which are not defined in this cell); it cannot
        # run on its own. Presumably a draft of the Word2VecEmbeddings.train loop.
        for epoch in range(epochs):
            self.model.train(data, total_examples=self.model.corpus_count, epochs=1)