In [1]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util, models
from sklearn.feature_extraction import _stop_words as stop_words
from tqdm.notebook import tqdm
from rank_bm25 import BM25Okapi

import torch
import string
import json
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmedalsayed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Experiment configuration: everything a reader might want to tune lives here.
_models_root = '../models'
_experiment_tag = 'exp7'

# Fine-tuned checkpoints, loaded when `use_base` is False.
bi_enc_weights = f'{_models_root}/bienc-{_experiment_tag}/'
cr_enc_weights = f'{_models_root}/crenc-{_experiment_tag}/'

# False -> load the fine-tuned checkpoints above; True -> build the base models instead.
use_base = False
# Number of hits requested from forward_pass_rerank.
top_k = 50
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
data_folder = 'generated5'

In [3]:
# Load the question spreadsheet; its first column is used as the DataFrame index.
dataset_path = '../data/20231004_data.xlsx'
df = pd.read_excel(dataset_path, index_col=0)
# Preview the first two rows as the cell output.
df.head(2)

Unnamed: 0,Question Link,Question Title,Question Body,Accepted Answer Body,link,readme,docker,readme_short
0,390150,Authenticating against Active Directory with J...,<p>I have a simple task of authenticating agai...,<p>Here's the code I put together based on exa...,https://github.com/jenkinsci/active-directory-...,Active Directory plugin for Jenkins\n=========...,Dockerfile from src/test/resources/fixture/Doc...,Active Directory plugin for Jenkins ==========...
1,1197678,Using Thrift with Delphi Win32,<p>I'm interested in connecting to the Evernot...,<p><strong>Old Answer Replaced thanks to Leo:<...,https://github.com/apache/thrift,Apache Thrift\n=============\nIntroduction\n==...,Dockerfile from build/docker/msvc2017/Dockerfi...,Apache Thrift ============= Introduction =====...


In [4]:
# NLTK's English stop-word list as a set; bm25_tokenizer tests membership against it.
english_stopwords = set(stopwords.words('english'))

def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25.

    The text is lower-cased and split on whitespace, and every piece has its
    leading and trailing ASCII punctuation stripped. Pieces that end up empty
    or that appear in ``english_stopwords`` are discarded.

    Returns:
        list[str]: the surviving tokens in their original order.
    """
    stripped_pieces = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [tok for tok in stripped_pieces if tok and tok not in english_stopwords]

In [5]:
# Build the bi-encoder (used to embed queries) and the cross-encoder (used to score pairs).
if not use_base:
    # Fine-tuned checkpoints from the configured weight directories.
    bi_encoder = SentenceTransformer(bi_enc_weights)
    cr_encoder = CrossEncoder(cr_enc_weights)
else:
    # Baseline: a distilroberta-base transformer followed by a pooling layer,
    # paired with a pre-trained MS MARCO cross-encoder.
    word_embedding_model = models.Transformer('distilroberta-base', max_seq_length=350)
    embedding_dim = word_embedding_model.get_word_embedding_dimension()
    pooling_model = models.Pooling(embedding_dim)
    bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    cr_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

In [6]:
# Directory that holds the evaluation split files.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
experiments_dir = '/Users/Documents/final-code-microservice-paper/experiments'

# Mapping: query id -> ids of the related corpus entries.
with open(f'{experiments_dir}/test_passage_100.json', 'r') as passage_file:
    val_passage = json.load(passage_file)

# Mapping: corpus id -> text (the "_100" corpus file, matching the passage file above).
with open(f'{experiments_dir}/test_corpus_100.json', 'r') as corpus_file:
    val_corpus = json.load(corpus_file)

# Resolve ids to text: query text -> list of related texts.
# Queries missing from the corpus, or left with no resolvable answers, are skipped.
val_query_answer = {}
for query_key, related_ids in val_passage.items():
    query_text = val_corpus.get(query_key)
    if not query_text:
        continue

    related_texts = [val_corpus[key] for key in map(str, related_ids) if key in val_corpus]
    if related_texts:
        val_query_answer[query_text] = related_texts



In [7]:
import torch
import json
import pandas as pd


def shorten(text):
    """Return ``text`` cut to its first 512 whitespace-separated words, joined by single spaces."""
    return ' '.join(text.split()[:512])

# Precomputed embeddings, looked up below by the stringified DataFrame index.
with open("/Users/Documents/final-code-microservice-paper/experiments/data/embeddings_GPT.json", "r") as jsonfile:
    embeddings_dict = json.load(jsonfile)

# Question title -> DataFrame index. If a title occurs more than once the last row wins.
title_to_id = dict(zip(df['Question Title'], df.index))

# Build the corpus texts and their embeddings together so that row i of
# `val_emb` always belongs to `val_text[i]`. Entries without an embedding are
# left out of BOTH lists; dropping them only from the embedding list would
# shift every later row and make `val_text[corpus_id]` name the wrong text.
val_text = []
val_emb_tensors = []
for text in val_corpus.values():
    text_id = title_to_id[text]
    embedding = embeddings_dict.get(str(text_id))
    if not embedding:
        continue
    val_text.append(text)
    val_emb_tensors.append(torch.tensor(embedding))

if not val_emb_tensors:
    raise ValueError("No precomputed embedding found for any validation corpus entry.")

# One row per retained corpus text.
val_emb = torch.stack(val_emb_tensors)


In [8]:
from tqdm import tqdm
# Tokenise every validation passage for BM25; tqdm reports progress over the corpus.
tokenized_corpus = [bm25_tokenizer(passage) for _, passage in tqdm(val_corpus.items())]

100%|██████████| 100/100 [00:00<00:00, 262307.94it/s]


In [9]:
#Total Questions Answered Correctly
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, ndcg_score
import warnings
warnings.filterwarnings("ignore", message="No positive class found in y_true, recall is set to one for all thresholds.")

def compute_cosine_similarity(query_embedding, corpus_embeddings):
    """Cosine similarity between query embedding(s) and every corpus embedding.

    Args:
        query_embedding: 1-D tensor (a single query) or 2-D tensor with one
            query per row.
        corpus_embeddings: 2-D tensor with one corpus embedding per row.

    Returns:
        2-D tensor whose entry ``[i, j]`` is the cosine similarity between
        query ``i`` and corpus row ``j``. A 1-D query produces a single row.

    Zero-norm vectors are normalised with a small epsilon instead of being
    divided by zero, so they score 0 against everything rather than NaN,
    which would corrupt the ranking built from these scores.
    """
    # Promote a single query vector to a one-row batch.
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    normalize = torch.nn.functional.normalize
    query_unit = normalize(query_embedding, p=2, dim=1)
    corpus_unit = normalize(corpus_embeddings, p=2, dim=1)

    # Dot products of unit vectors are the cosine similarities.
    return torch.mm(query_unit, corpus_unit.transpose(0, 1))



def forward_pass_rerank(query, precomputed_embedding=None, val_embeddings=None, top_k=50):
    """Retrieve the nearest corpus entries for ``query`` and attach cross-encoder scores.

    Args:
        query: query text; also compared against the retrieved texts so the
            query's own corpus entry can be dropped from the result.
        precomputed_embedding: optional query embedding tensor. When None, the
            query is encoded with the module-level ``bi_encoder``.
        val_embeddings: 2-D tensor of corpus embeddings; row ``i`` is looked up
            as ``val_text[i]``.
        top_k: maximum number of hits to return.

    Returns:
        list[dict]: at most ``top_k`` hits in descending cosine-similarity
        order. Each hit has ``corpus_id``, ``score`` (cosine similarity) and
        ``cross_score`` (the ``cr_encoder.predict`` output for the pair).

    Raises:
        ValueError: if ``val_embeddings`` is None.
    """
    # Validate first so no encoding work is wasted on a call that cannot succeed.
    if val_embeddings is None:
        raise ValueError("No embeddings provided for the validation set.")

    if precomputed_embedding is None:
        q_emb = bi_encoder.encode(query, convert_to_tensor=True)
    else:
        q_emb = precomputed_embedding

    # Ensure the query embedding is 2-dimensional (one row per query).
    if q_emb.dim() == 1:
        q_emb = q_emb.unsqueeze(0)

    # If the second dimension does not match the query's embedding size, assume
    # the matrix was passed transposed and flip it.
    if val_embeddings.shape[1] != q_emb.shape[1]:
        val_embeddings = val_embeddings.transpose(0, 1)

    cosine_similarities = compute_cosine_similarity(q_emb, val_embeddings)

    # Ask for one extra candidate so that dropping the query's own entry still
    # leaves top_k hits; clamp to the corpus size because torch.topk raises
    # when k exceeds the number of columns.
    k = min(top_k + 1, cosine_similarities.shape[1])
    top_scores, top_indices = torch.topk(cosine_similarities, k=k, dim=1)
    hits = [
        {'corpus_id': index, 'score': score}
        for index, score in zip(top_indices[0].tolist(), top_scores[0].tolist())
    ]
    if not hits:
        return []

    # Pair the query with every candidate text, remembering where the query itself sits.
    self_position = -1
    cross_inputs = []
    for position, hit in enumerate(hits):
        text = val_text[hit['corpus_id']]
        if query == text:
            self_position = position
        cross_inputs.append([query, text])

    cross_scores = cr_encoder.predict(cross_inputs)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross_score'] = cross_score

    # NOTE(review): cross_score is attached, but the hits keep their cosine
    # order - confirm whether re-sorting by cross_score was intended.
    if self_position != -1:
        del hits[self_position]
    return hits[:top_k]

# Count the queries for which at least one of the first `eval_top_n` hits is a relevant answer.
eval_top_n = 5
total_questions_answered_correctly = 0

for query, answers_text in tqdm(val_query_answer.items(), total=len(val_query_answer)):
    relevant_texts = set(answers_text)

    # Look up the query's precomputed embedding via its DataFrame index.
    query_id = title_to_id[query]
    query_embedding = embeddings_dict.get(str(query_id))
    if query_embedding is None:
        # torch.tensor(None) would fail with an opaque dtype error; name the missing id instead.
        raise KeyError(f"No precomputed embedding for query id {query_id!r}")

    hits = forward_pass_rerank(
        query,
        precomputed_embedding=torch.tensor(query_embedding),
        val_embeddings=val_emb,
        top_k=top_k,
    )

    hit_info_list = []
    for hit in hits[:eval_top_n]:
        h_text = val_text[hit['corpus_id']]
        hit_info_list.append({
            'h_text': h_text,
            # forward_pass_rerank already scored this (query, text) pair;
            # reuse that score instead of running the cross-encoder again.
            'cscore': hit['cross_score'],
            'is_correct': "Yes" if h_text in relevant_texts else "No",
        })

    if any(hit_info['is_correct'] == "Yes" for hit_info in hit_info_list):
        total_questions_answered_correctly += 1

print(f"Total Questions Answered Correctly: {total_questions_answered_correctly}")

100%|██████████| 85/85 [00:31<00:00,  2.73it/s]

Total Questions Answered Correctly: 16





In [10]:
# Total correct answers among the top-5 hits, summed over all queries
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, ndcg_score
import warnings
warnings.filterwarnings("ignore", message="No positive class found in y_true, recall is set to one for all thresholds.")

def compute_cosine_similarity(query_embedding, corpus_embeddings):
    """Cosine similarity between query embedding(s) and every corpus embedding.

    This cell re-declares a helper that an earlier cell already defines; the
    definition here shadows the earlier one once the cell runs.

    Args:
        query_embedding: 1-D tensor (a single query) or 2-D tensor with one
            query per row.
        corpus_embeddings: 2-D tensor with one corpus embedding per row.

    Returns:
        2-D tensor whose entry ``[i, j]`` is the cosine similarity between
        query ``i`` and corpus row ``j``. A 1-D query produces a single row.

    Zero-norm vectors are normalised with a small epsilon instead of being
    divided by zero, so they score 0 against everything rather than NaN,
    which would corrupt the ranking built from these scores.
    """
    # Promote a single query vector to a one-row batch.
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    normalize = torch.nn.functional.normalize
    query_unit = normalize(query_embedding, p=2, dim=1)
    corpus_unit = normalize(corpus_embeddings, p=2, dim=1)

    # Dot products of unit vectors are the cosine similarities.
    return torch.mm(query_unit, corpus_unit.transpose(0, 1))



def forward_pass_rerank(query, precomputed_embedding=None, val_embeddings=None, top_k=50):
    """Retrieve the nearest corpus entries for ``query`` and attach cross-encoder scores.

    This cell re-declares a function that an earlier cell already defines; the
    definition here shadows the earlier one once the cell runs.

    Args:
        query: query text; also compared against the retrieved texts so the
            query's own corpus entry can be dropped from the result.
        precomputed_embedding: optional query embedding tensor. When None, the
            query is encoded with the module-level ``bi_encoder``.
        val_embeddings: 2-D tensor of corpus embeddings; row ``i`` is looked up
            as ``val_text[i]``.
        top_k: maximum number of hits to return.

    Returns:
        list[dict]: at most ``top_k`` hits in descending cosine-similarity
        order. Each hit has ``corpus_id``, ``score`` (cosine similarity) and
        ``cross_score`` (the ``cr_encoder.predict`` output for the pair).

    Raises:
        ValueError: if ``val_embeddings`` is None.
    """
    # Validate first so no encoding work is wasted on a call that cannot succeed.
    if val_embeddings is None:
        raise ValueError("No embeddings provided for the validation set.")

    if precomputed_embedding is None:
        q_emb = bi_encoder.encode(query, convert_to_tensor=True)
    else:
        q_emb = precomputed_embedding

    # Ensure the query embedding is 2-dimensional (one row per query).
    if q_emb.dim() == 1:
        q_emb = q_emb.unsqueeze(0)

    # If the second dimension does not match the query's embedding size, assume
    # the matrix was passed transposed and flip it.
    if val_embeddings.shape[1] != q_emb.shape[1]:
        val_embeddings = val_embeddings.transpose(0, 1)

    cosine_similarities = compute_cosine_similarity(q_emb, val_embeddings)

    # Ask for one extra candidate so that dropping the query's own entry still
    # leaves top_k hits; clamp to the corpus size because torch.topk raises
    # when k exceeds the number of columns.
    k = min(top_k + 1, cosine_similarities.shape[1])
    top_scores, top_indices = torch.topk(cosine_similarities, k=k, dim=1)
    hits = [
        {'corpus_id': index, 'score': score}
        for index, score in zip(top_indices[0].tolist(), top_scores[0].tolist())
    ]
    if not hits:
        return []

    # Pair the query with every candidate text, remembering where the query itself sits.
    self_position = -1
    cross_inputs = []
    for position, hit in enumerate(hits):
        text = val_text[hit['corpus_id']]
        if query == text:
            self_position = position
        cross_inputs.append([query, text])

    cross_scores = cr_encoder.predict(cross_inputs)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross_score'] = cross_score

    # NOTE(review): cross_score is attached, but the hits keep their cosine
    # order - confirm whether re-sorting by cross_score was intended.
    if self_position != -1:
        del hits[self_position]
    return hits[:top_k]

# Sum, over all queries, how many of the first `eval_top_n` hits are relevant answers.
eval_top_n = 5
total_correct = 0

for query, answers_text in tqdm(val_query_answer.items(), total=len(val_query_answer)):
    relevant_texts = set(answers_text)

    # Look up the query's precomputed embedding via its DataFrame index.
    query_id = title_to_id[query]
    query_embedding = embeddings_dict.get(str(query_id))
    if query_embedding is None:
        # torch.tensor(None) would fail with an opaque dtype error; name the missing id instead.
        raise KeyError(f"No precomputed embedding for query id {query_id!r}")

    hits = forward_pass_rerank(
        query,
        precomputed_embedding=torch.tensor(query_embedding),
        val_embeddings=val_emb,
        top_k=top_k,
    )

    hit_info_list = []
    for hit in hits[:eval_top_n]:
        h_text = val_text[hit['corpus_id']]
        hit_info_list.append({
            'h_text': h_text,
            # forward_pass_rerank already scored this (query, text) pair;
            # reuse that score instead of running the cross-encoder again.
            'cscore': hit['cross_score'],
            'is_correct': "Yes" if h_text in relevant_texts else "No",
        })

    correct_count = sum(1 for hit_info in hit_info_list if hit_info['is_correct'] == "Yes")
    total_correct += correct_count

print(f"Total Correct Answers: {total_correct}")

100%|██████████| 85/85 [00:31<00:00,  2.71it/s]

Total Correct Answers: 19





In [13]:
# Print the questions for which at least 2 of the top-5 hits are correct:
import warnings
warnings.filterwarnings("ignore", message="No positive class found in y_true, recall is set to one for all thresholds.")

import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, ndcg_score
import warnings
warnings.filterwarnings("ignore", message="No positive class found in y_true, recall is set to one for all thresholds.")

def compute_cosine_similarity(query_embedding, corpus_embeddings):
    """Cosine similarity between query embedding(s) and every corpus embedding.

    This cell re-declares a helper that earlier cells already define; the
    definition here shadows the earlier ones once the cell runs.

    Args:
        query_embedding: 1-D tensor (a single query) or 2-D tensor with one
            query per row.
        corpus_embeddings: 2-D tensor with one corpus embedding per row.

    Returns:
        2-D tensor whose entry ``[i, j]`` is the cosine similarity between
        query ``i`` and corpus row ``j``. A 1-D query produces a single row.

    Zero-norm vectors are normalised with a small epsilon instead of being
    divided by zero, so they score 0 against everything rather than NaN,
    which would corrupt the ranking built from these scores.
    """
    # Promote a single query vector to a one-row batch.
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    normalize = torch.nn.functional.normalize
    query_unit = normalize(query_embedding, p=2, dim=1)
    corpus_unit = normalize(corpus_embeddings, p=2, dim=1)

    # Dot products of unit vectors are the cosine similarities.
    return torch.mm(query_unit, corpus_unit.transpose(0, 1))



def forward_pass_rerank(query, precomputed_embedding=None, val_embeddings=None, top_k=50):
    """Retrieve the nearest corpus entries for ``query`` and attach cross-encoder scores.

    This cell re-declares a function that earlier cells already define; the
    definition here shadows the earlier ones once the cell runs.

    Args:
        query: query text; also compared against the retrieved texts so the
            query's own corpus entry can be dropped from the result.
        precomputed_embedding: optional query embedding tensor. When None, the
            query is encoded with the module-level ``bi_encoder``.
        val_embeddings: 2-D tensor of corpus embeddings; row ``i`` is looked up
            as ``val_text[i]``.
        top_k: maximum number of hits to return.

    Returns:
        list[dict]: at most ``top_k`` hits in descending cosine-similarity
        order. Each hit has ``corpus_id``, ``score`` (cosine similarity) and
        ``cross_score`` (the ``cr_encoder.predict`` output for the pair).

    Raises:
        ValueError: if ``val_embeddings`` is None.
    """
    # Validate first so no encoding work is wasted on a call that cannot succeed.
    if val_embeddings is None:
        raise ValueError("No embeddings provided for the validation set.")

    if precomputed_embedding is None:
        q_emb = bi_encoder.encode(query, convert_to_tensor=True)
    else:
        q_emb = precomputed_embedding

    # Ensure the query embedding is 2-dimensional (one row per query).
    if q_emb.dim() == 1:
        q_emb = q_emb.unsqueeze(0)

    # If the second dimension does not match the query's embedding size, assume
    # the matrix was passed transposed and flip it.
    if val_embeddings.shape[1] != q_emb.shape[1]:
        val_embeddings = val_embeddings.transpose(0, 1)

    cosine_similarities = compute_cosine_similarity(q_emb, val_embeddings)

    # Ask for one extra candidate so that dropping the query's own entry still
    # leaves top_k hits; clamp to the corpus size because torch.topk raises
    # when k exceeds the number of columns.
    k = min(top_k + 1, cosine_similarities.shape[1])
    top_scores, top_indices = torch.topk(cosine_similarities, k=k, dim=1)
    hits = [
        {'corpus_id': index, 'score': score}
        for index, score in zip(top_indices[0].tolist(), top_scores[0].tolist())
    ]
    if not hits:
        return []

    # Pair the query with every candidate text, remembering where the query itself sits.
    self_position = -1
    cross_inputs = []
    for position, hit in enumerate(hits):
        text = val_text[hit['corpus_id']]
        if query == text:
            self_position = position
        cross_inputs.append([query, text])

    cross_scores = cr_encoder.predict(cross_inputs)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross_score'] = cross_score

    # NOTE(review): cross_score is attached, but the hits keep their cosine
    # order - confirm whether re-sorting by cross_score was intended.
    if self_position != -1:
        del hits[self_position]
    return hits[:top_k]

# Sum the relevant answers among the first `eval_top_n` hits of every query,
# and print each query that reaches at least `min_correct_to_report` of them.
eval_top_n = 5
min_correct_to_report = 2
total_correct = 0

for query, answers_text in tqdm(val_query_answer.items(), total=len(val_query_answer)):
    relevant_texts = set(answers_text)

    # Look up the query's precomputed embedding via its DataFrame index.
    query_id = title_to_id[query]
    query_embedding = embeddings_dict.get(str(query_id))
    if query_embedding is None:
        # torch.tensor(None) would fail with an opaque dtype error; name the missing id instead.
        raise KeyError(f"No precomputed embedding for query id {query_id!r}")

    hits = forward_pass_rerank(
        query,
        precomputed_embedding=torch.tensor(query_embedding),
        val_embeddings=val_emb,
        top_k=top_k,
    )

    hit_info_list = []
    for hit in hits[:eval_top_n]:
        h_text = val_text[hit['corpus_id']]
        hit_info_list.append({
            'h_text': h_text,
            # forward_pass_rerank already scored this (query, text) pair;
            # reuse that score instead of running the cross-encoder again.
            'cscore': hit['cross_score'],
            'is_correct': "Yes" if h_text in relevant_texts else "No",
        })

    correct_count = sum(1 for hit_info in hit_info_list if hit_info['is_correct'] == "Yes")

    # Report the question and its hits when at least two of them are correct.
    if correct_count >= min_correct_to_report:
        print(f"Question: {query}")
        for hit_info in hit_info_list:
            print(f"\tHit: {hit_info['h_text']} - Correct: {hit_info['is_correct']}")
        print("\n")

    total_correct += correct_count

print(f"Total Correct Answers: {total_correct}")


 11%|█         | 9/85 [00:03<00:29,  2.59it/s]

Question: Alternative to Kubernetes rolling update in rest api
	Hit: Are Kubernetes API calls secret update and configmap update atomic calls? - Correct: No
	Hit: Kubernetes Watch Pod Events with api - Correct: Yes
	Hit: Restricted Kubernetes dashboard? - Correct: No
	Hit: Running dashboard inside play-with-kubernetes - Correct: No
	Hit: Kubernetes Storage on bare-metal/private cloud - Correct: Yes




 14%|█▍        | 12/85 [00:04<00:26,  2.79it/s]

Question: Authentication views for Laravel 5.1
	Hit: Force logout of specific user by user id in Laravel - Correct: Yes
	Hit: Best way to make restfull API in Laravel - Correct: Yes
	Hit: Kubernetes simple authentication - Correct: No
	Hit: Create user with LDAP authentification in airflow 2.1.4 - Correct: No
	Hit: React Native Phone-based login - Correct: No




 81%|████████  | 69/85 [00:26<00:06,  2.65it/s]

Question: Giving a docker container a routable ip address
	Hit: Assigning IP address to docker containers? - Correct: Yes
	Hit: Bind to multiple ip addresses in a single docker container - Correct: Yes
	Hit: Run Jupyter Notebook in the Background on Docker - Correct: No
	Hit: Start kubernetes container with specific command - Correct: No
	Hit: Project layout with vagrant, docker and git - Correct: No




100%|██████████| 85/85 [00:33<00:00,  2.54it/s]

Total Correct Answers: 19



