In [19]:
secDataPath = "../../inputs/SEC-CompanyTicker.csv"

To Do <br>
-Make GloVe Embeddings

In [20]:
from torch import nn
from transformers import AutoConfig
model_ckpt = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_ckpt)


def sdp_attention(query, key, value):
    dim_k = query.size(-1) # dimension component
    sfact = sqrt(dim_k)     
    scores = torch.bmm(query, key.transpose(1,2)) / sfact
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

'''

Attention Class

# nn.linear : apply linear transformation to incoming data
#             y = x * A^T + b
# Ax = b where x is input, b is output, A is weight

# calculate scaled dot product attention matrix
# Requires embedding dimension 
# Each attention head is made of different q,k,v vectors

'''

class Attention(nn.Module):
    
    # initalisation 
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        
        # Define the three vectors
        # input - embed_dim, output - head_dim
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    # main class operation
    def forward(self, hidden_state):
        
        # calculate scaled dot product given a 
        attn_outputs = sdp_attention(
            self.q(hidden_state), 
            self.k(hidden_state), 
            self.v(hidden_state))
        
        return attn_outputs
    

    
'''

Multihead attention class

'''


class multiHeadAttention(nn.Module):
    
    # Config during initalisation
    def __init__(self, config):
        super().__init__()
        
        # model params, read from config file
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        
        # attention head (define only w/o hidden state)
        # each attention head is initialised with embedd/heads head dimension
        self.heads = nn.ModuleList(
            [Attention(embed_dim, head_dim) for _ in range(num_heads)])
        
        # output uses whole embedding dimension for output
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    # Given a hidden state (embeddings)
    # Apply operation for multihead attention
        
    def forward(self, hidden_state):
        
        # for each head embed_size/heads, calculate attention
        heads = [head(hidden_state) for head in self.heads] 
        x = torch.cat(heads, dim=-1) # merge/concat head data together
    
        # apply linear transformation to multihead attension scalar product
        x = self.out_linear(x)
        return x
    
    


class feedForward(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # define layer operations input x
        
    def forward(self, x):    # note must be forward
        x = self.gelu(self.linear1(x))
        x = self.linear2(x)
        x = self.dropout(x)
        return x
    
    
class encoderLayer(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.norm1 = nn.LayerNorm(config.hidden_size)
        self.norm2 = nn.LayerNorm(config.hidden_size)
        self.attention = multiHeadAttention(config)    # multihead attention layer 
        self.feed_forward = feedForward(config)        # feed forward layer

    def forward(self, x):
        
        # Apply layer norm. to hidden state, copy input into query, key, value
        # Apply attention with a skip connection
        x = x + self.attention(self.norm1(x))
        
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.norm2(x))
        
        return x
    
    
'''

Token + Position Embedding 


'''

class tpEmbedding(nn.Module):
    
    def __init__(self, config):        
        super().__init__()
        
        # token embedding layer
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)
        
        # positional embedding layer
        # config.max_position_embeddings -> max number of positions in text 512 (tokens)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        
        self.norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        
        # Create position IDs for input sequence
        seq_length = input_ids.size(1) # number of tokens
        position_ids = torch.arange(seq_length, dtype=torch.long)[None,:] # range(0,9)
        
        # tensor([[ 1996, 11286,  1997,  1037,  5340,  3392,  2003,  2200,  5931]])
        # tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
        
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        
        # Add normalisation & dropout layers
        embeddings = self.norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
    
    
    
# full transformer encoder combining the `Embedding` with the ``Embedding` ` layers

class TransformerEncoder(nn.Module):
    
    def __init__(self, config):       
        super().__init__()
        
        # token & positional embedding layer
        self.embeddings = tpEmbedding(config)
        
        # attention & forward feed layer 
        self.layers = nn.ModuleList([encoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])
        
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    def forward(self, x):
        
        # embeddings layer output
        x = self.embeddings(x)
        
        # cycle through all heads
        for layer in self.layers:
            x = layer(x)
            
        x = x[:, 0, :] # select hidden state of [CLS] token
        x = self.dropout(x)
        return x

In [45]:
from transformers import AutoTokenizer  
import torch
import pandas as pd
from math import sqrt
import torch.nn.functional as F

class TransformerEmbeddings():
    def __init__(self):
        self.model = TransformerEncoder(config)
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        
    def getTensor(self,text):
        inputs = self.tokenizer(text, 
                   return_tensors="pt",      # pytorc tensor
                   add_special_tokens=False,
                          padding=True) # don't use pad, sep tokens
        return inputs.input_ids
    
    def train(self,data,epochs=1):
        self.model.train()
        tensorData = self.getTensor(data)
        for epoch in range(epochs):
            if epoch % 10 == 0:
                print("epoch %s" % epoch)
            self.model.forward(tensorData)
    
    def getEmbeddings(self,q):
        self.model.eval()
        inputs = self.getTensor(q)
        with torch.no_grad():
            x = self.model(inputs).cpu().detach().numpy()
            return x
        

data = list(pd.read_csv(secDataPath,index_col=0).companyName[:100])
train = TransformerEmbeddings()
train.train(data)


epoch 0


In [None]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
#word2vec is bad for datasets with only one or two words because it doesnt not look for co occurences between different inputs
#Since it only looks at the words its surrounded by in its input, it lacks the same self attention features as other models
#Indeed, the model looks at the surrounding words to predict the target word (Skip-gram) or predicts surrounding words given the target word (CBOW).
# so if the input is one word, it doesn't really work, much better for longer sentences

import time
from gensim.models import Word2Vec
from tqdm import tqdm

data = list(pd.read_csv(secDataPath, index_col=0).companyName[:100]) + ["shell"]

tqdm.pandas()
def preprocessing(titles_array):
    processed_array = []
    for title in tqdm(titles_array):
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', title)
        words = processed.split()
        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append(' '.join([word for word in words if len(word) > 1]))
    return processed_array



# Assuming 'processed' is a list of strings
data = list(pd.read_csv(secDataPath, index_col=0).companyName[:100]) + ["shell"]
processed = preprocessing(data)

# Use tqdm's progress_apply with lambda function
data = [sublist[0] for sublist in tqdm(pd.DataFrame({"companyName": processed}).apply(lambda x: x.str.split()).values)]



class Word2VecEmbeddings():
    # Required to train on all data and queries because use keys to find embeddings
    def train(self, data, epochs=100):
#         # Convert data to a list of lists
#         data = [[x] for x in data]
        
        # Initialize Word2Vec model
        self.model = Word2Vec(data, 
                              min_count=1, 
                              vector_size=768,
                              window=5, 
                              sg=1)
        
        # Train the model
        for epoch in range(epochs):
            if epoch % 10 == 0:
                print("epoch %s" % epoch)
            self.model.train(data, total_examples=self.model.corpus_count, epochs=100)
            
    def getEmbeddings(self, q):
        values = []
        maxlength = 0

        # Find the maximum length of sequences
        for sentence in q:
            embedding = []
            for val in sentence:
                embedding += list(self.model.wv.get_vector(val))
            values.append(embedding)
            maxlength = max(maxlength, len(embedding))

        # Pad each sequence individually
        padded_values = pad_sequences(values, maxlen=3840, padding='post', dtype='float32')

        return np.array(padded_values)

    
    def getKeys(self):
        words = list(self.model.wv.index_to_key)
        return words



# Create an instance of Word2VecEmbeddings
train = Word2VecEmbeddings()

# Train the model with the data
train.train(data)

# Get embeddings for the query and the full data
xq = train.getEmbeddings([["shell"]])
xb = train.getEmbeddings(data)

# Now xq should work as expected


100%|██████████| 101/101 [00:00<00:00, 190136.76it/s]
100%|██████████| 101/101 [00:00<00:00, 1486402.47it/s]


epoch 0
epoch 10
epoch 20
epoch 30
epoch 40
epoch 50
epoch 60
epoch 70
epoch 80
epoch 90


In [51]:
from openai import OpenAI
from openai import OpenAI
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
load_dotenv()
key = os.environ.get("OPENAI_KEY")



class OpenAIEmbeddings:
    def __init__(self,api_key):
        self.client = OpenAI(api_key=api_key)
        
    
    def getEmbeddings(self, text_list):
        data = self.client.embeddings.create(input=text_list, model='text-embedding-ada-002').data
        embeddings = [embedding.embedding for embedding in data]
        return np.array(embeddings)
    
    
train = OpenAIEmbeddings(key)


In [50]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class BERTEmbeddings:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.model = AutoModel.from_pretrained(model_ckpt)
        
    #CLS is a special classification token and the last hidden state of BERT Embedding
    def cls_pooling(self, model_output):
        return model_output.last_hidden_state[:, 0]

    #BERT tokenizer of input text
    def getEmbeddings(self, text_list):
        encoded_input = self.tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        model_output = self.model(**encoded_input)
        return self.cls_pooling(model_output).cpu().detach().numpy()
    
train = BERTEmbeddings()
train.getEmbeddings(["julia"])

array([[ 4.67052944e-02, -2.26962656e-01, -3.52288604e-01,
         1.39271274e-01,  3.63313109e-02, -2.48244584e-01,
        -7.24242255e-03,  2.84364730e-01, -2.24267125e-01,
         2.74040073e-01,  6.68052286e-02, -2.88262486e-01,
        -2.35641703e-01,  3.63490671e-01, -6.73132315e-02,
        -1.02233335e-01,  1.31339893e-01,  2.46826887e-01,
         2.06331387e-01, -1.72131449e-01, -1.58596531e-01,
         2.81898826e-01, -7.99801722e-02,  2.85342634e-01,
        -2.63511576e-02, -2.79757380e-01,  2.15131685e-01,
         4.59036194e-02, -9.49969664e-02,  9.71716344e-02,
        -4.19612303e-02, -1.43848658e-02, -8.02190006e-02,
        -7.30765983e-02, -1.07166685e-04, -1.66405842e-01,
         1.28892303e-01,  5.45171127e-02, -1.84124723e-01,
        -1.19634345e-01, -7.17815757e-02, -2.55356371e-01,
        -3.24021101e-01, -3.18603784e-01, -2.07002670e-01,
        -2.33787283e-01,  1.15620144e-01, -2.81645685e-01,
        -2.65673101e-02, -1.69645041e-01,  2.72051960e-0

In [46]:
import faiss
    
class Faiss:
    def __init__(self):
        pass

    def faiss(self,xb):
        d = xb[0].size
        M = 32
        index = faiss.IndexHNSWFlat(d, M)            
        index.hnsw.efConstruction = 40         # Setting the value for efConstruction.
        index.hnsw.efSearch = 16               # Setting the value for efSearch.
        index.add(xb)
        return index
    
    def query(self,index,xq,k=10):
        D, I = index.search(xq, k)   
        return D, I

model = "Transformer"
#if using Word2Vec
if model=="Word2Vec":
    q = [['shell']]
    #Word2Vec needs prepocessing of data to transform sentences into individual tokens
else:
    q = ["shell"]
    data = list(pd.read_csv(secDataPath,index_col=0).companyName[:100])
xq = train.getEmbeddings(q)
xb = train.getEmbeddings(data)
index = Faiss().faiss(xb)
D,I = Faiss().query(index,xq)
I = I[0]

for guess in I:
    print(data[guess])
    
if model != "Word2Vec":
    print(data[I[0]] + "\n" + data[I[1]], "\n" + data[I[2]])

Shell Plc
Lvmh Moet Hennessy Louis Vuitton
Danaher Corp /De/
Anheuser-Busch Inbev Sa/Nv
Spdr S&P 500 Etf Trust
Qualcomm Inc/De
Invesco Qqq Trust, Series 1
T-Mobile Us, Inc.
Wells Fargo & Company/Mn
Merck & Co., Inc.
Shell Plc
Lvmh Moet Hennessy Louis Vuitton 
Danaher Corp /De/


In [59]:
load_dotenv()
key = os.environ.get("OPENAI_KEY")

class runDataset():
    def __init__(self,model):
        models = ["Transformer", "BERT", "OpenAI"]
        self.model = model

        if ("data" not in model.keys()):
            raise Exception("No data inputted.")
        #not using word2vec because it performed terribly on datasets with only one or two words per code-word
        if (model["name"] == "Transformer"):
            self.Embeddings=TransformerEmbeddings()
            self.Embeddings.train(model["data"])
        elif (model["name"] == "BERT"):
            self.Embeddings=BERTEmbeddings()
        elif (model["name"] == "OpenAI"):
            if ("api_key" not in model.keys()):
                raise Exception("No API Key inputted.")
            self.Embeddings = OpenAIEmbeddings(model["api_key"])
        else:
            raise Exception("Invalid Model")
        
        xb = self.Embeddings.getEmbeddings(model["data"])
        self.index = Faiss().faiss(xb)

    def getEmbeddings(self,x):
        return self.Embeddings.getEmbeddings(x)
    
    def similaritySearch(self,q):
        xq = self.getEmbeddings(q)
        D,I = Faiss().query(self.index,xq)
        I = I[0]
        guesses = [data[guess] for guess in I]
        return guesses

data = list(pd.read_csv(secDataPath,index_col=0).companyName[:100])

model = {
    "name" : "OpenAI",
    "data" : data,
    "api_key":key
}
similaritySearch = runDataset(model)
similaritySearch.similaritySearch(["Shell Plc"])
        




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['Shell Plc',
 'Unilever Plc',
 'Linde Plc',
 'Accenture Plc',
 'Hsbc Holdings Plc',
 'Astrazeneca Plc',
 'Bhp Group Ltd',
 'Oracle Corp',
 'Alibaba Group Holding Ltd',
 'Mcdonalds Corp']