# Example of UniXcoder implementation

In [36]:
import torch
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base")
model.to(device)

UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

This experiment will try to explore the Encoder-only mode for finding similarity with the Markdown. 

In [37]:
def tokenize(content):
    return model.tokenize([content], max_length=512, mode="<encoder-only>")

In [38]:
def generate_embedding(token_ids):
    source_ids = torch.tensor(token_ids).to(device)
    tokens_embeddings, content_embedding = model(source_ids)
    # print(tokens_embeddings)
    # print("========================================================")
    # print(content_embedding)
    return content_embedding

In [39]:
content = "return the maximum value"
content_token_ids = tokenize(content)
content_embedding = generate_embedding(content_token_ids)

# content2 = "get the highest value"
# content2 = "Just go to hell"
# content2 = "def f(a,b): if a<b: return a else return b"
content2 = "def f(a,b): if a>b: return a else return b"
content2_token_ids = tokenize(content)
content2_embedding = generate_embedding(content2_token_ids)

In [40]:
# Normalize Embedding
norm_content_embedding = torch.nn.functional.normalize(content_embedding, p=2, dim=1)
norm_content2_embedding = torch.nn.functional.normalize(content2_embedding, p=2, dim=1)

cosine_sim = torch.einsum("ac,bc->ab", norm_content2_embedding, norm_content_embedding)
print(cosine_sim)

tensor([[1.0000]], grad_fn=<ViewBackward0>)


In [57]:
# Encode maximum function
func = "palindrome function"
tokens_ids = tokenize(func)
max_func_embedding = generate_embedding(tokens_ids)

# Encode minimum function
func = "def f(a,b): if a<b: return a else return b"
tokens_ids = tokenize(func)
min_func_embedding = generate_embedding(tokens_ids)

# Encode NL
nl = "return a function that calculate palindrome from input"
tokens_ids = tokenize(nl)
nl_embedding = generate_embedding(tokens_ids)

# print(max_func_embedding.shape)
# print(max_func_embedding)

In [58]:
norm_max_func_embedding = torch.nn.functional.normalize(max_func_embedding, p=2, dim=1)
# norm_min_func_embedding = torch.nn.functional.normalize(min_func_embedding, p=2, dim=1)
norm_nl_embedding = torch.nn.functional.normalize(nl_embedding, p=2, dim=1)

max_func_nl_similarity = torch.einsum("ac,bc->ab",norm_max_func_embedding,norm_nl_embedding)
# min_func_nl_similarity = torch.einsum("ac,bc->ab",norm_min_func_embedding,norm_nl_embedding)

print(max_func_nl_similarity)
# print(min_func_nl_similarity)

tensor([[0.7158]], grad_fn=<ViewBackward0>)


# Pre-processing functions

In [94]:
import pandas as pd
import string
import re
# import NLP lib and its stopwords module
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# stopwords[10:20]
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [100]:
def tokenization(text):
  text = text.lower()
  text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text).strip() # replace punctuation with ' '(space)
  text = " ".join(text.split()) # remove the excess spaces and newlines
  return nltk.word_tokenize(text)

def remove_hyperlink(text):
  text = re.sub(r'https?://\S+', "", text)
  text = " ".join(re.sub(r'https?://\S+', "", text).split())
  return text

def remove_tags(text):
  return re.sub(r"<.*?>", " ", text)

def remove_stopwords(tokens):
  # output = "".join([i for i in text if i not in stopwords])
  return [token for token in tokens if token not in stopwords]
  

def stemming(tokens):
  # stemmed_text = " ".join([porter_stemmer.stem(word) for word in text.split()])
  return [porter_stemmer.stem(token) for token in tokens]

def lemmatizer(tokens):
  # lemmatized_text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in text.split()])
  return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

def full_preprocess(text):
  preprocessed_text = tokenization(text)
  # preprocessed_text = preprocess_text(preprocessed_text)
  preprocessed_text = remove_hyperlink(preprocessed_text)
  preprocessed_text = remove_tags(preprocessed_text)
  preprocessed_text = remove_stopwords(preprocessed_text)
  # preprocessed_text = stemming(preprocessed_text)
  # preprocessed_text = lemmatizer(preprocessed_text)
  return preprocessed_text


In [101]:
test = "['The max voting method is generally used for classification problems. In this technique, multiple models are used to make predictions for each data point. The predictions by each model are considered as a â€˜voteâ€™. The predictions which we get from the majority of the models are used as the final prediction.\n', '\n', 'For example, when you asked 5 of your colleagues to rate your movie (out of 5); weâ€™ll assume three of them rated it as 4 while two of them gave it a 5. Since the majority gave a rating of 4, the final rating will be taken as 4. You can consider this as taking the mode of all the predictions.\n', '\n', 'The result of max voting would be something like this:\n', '\n', 'Colleague 1-5\n', '\n', 'Colleague 2-4\n', '\n', 'Colleague 3-5\n', '\n', 'Colleague 4-4\n', '\n', 'Colleague 5-4\n', '\n', 'Finalrating-4']"
# tokenization(test)
tokens = tokenization(test)
tokens = remove_stopwords(tokens)
tokens = stemming(tokens)
tokens = lemmatizer(tokens)
print(tokens)

['max', 'vote', 'method', 'gener', 'use', 'classif', 'problem', 'techniqu', 'multipl', 'model', 'use', 'make', 'predict', 'data', 'point', 'predict', 'model', 'consid', 'â€˜voteâ€™', 'predict', 'get', 'major', 'model', 'use', 'final', 'predict', 'exampl', 'ask', '5', 'colleagu', 'rate', 'movi', '5', 'weâ€™ll', 'assum', 'three', 'rate', '4', 'two', 'gave', '5', 'sinc', 'major', 'gave', 'rate', '4', 'final', 'rate', 'taken', '4', 'consid', 'take', 'mode', 'predict', 'result', 'max', 'vote', 'would', 'someth', 'like', 'colleagu', '1', '5', 'colleagu', '2', '4', 'colleagu', '3', '5', 'colleagu', '4', '4', 'colleagu', '5', '4', 'finalr', '4']
