In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [3]:
data = pd.read_csv('./exec.csv')

In [4]:
def encode_data(row):
    question = row["question"]
    context = row["context"]
    inputs = tokenizer(question, context, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
    return inputs

data["encoded_data"] = data.apply(encode_data, axis=1)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [5]:
data["encoded_data"]

0      [input_ids, token_type_ids, attention_mask]
1      [input_ids, token_type_ids, attention_mask]
2      [input_ids, token_type_ids, attention_mask]
3      [input_ids, token_type_ids, attention_mask]
4      [input_ids, token_type_ids, attention_mask]
                          ...                     
819    [input_ids, token_type_ids, attention_mask]
820    [input_ids, token_type_ids, attention_mask]
821    [input_ids, token_type_ids, attention_mask]
822    [input_ids, token_type_ids, attention_mask]
823    [input_ids, token_type_ids, attention_mask]
Name: encoded_data, Length: 824, dtype: object

In [6]:
data["input_ids"] = data["encoded_data"].apply(lambda x: x["input_ids"])
data["attention_mask"] = data["encoded_data"].apply(lambda x: x["attention_mask"])
data["token_type_ids"] = data["encoded_data"].apply(lambda x: x.get("token_type_ids"))

In [7]:
from torch.utils.data import DataLoader, TensorDataset

input_ids = torch.stack(data["input_ids"].tolist())
attention_mask = torch.stack(data["attention_mask"].tolist())
token_type_ids = torch.stack(data["token_type_ids"].tolist())

dataset = TensorDataset(input_ids, attention_mask, token_type_ids)

dataloader = DataLoader(dataset, batch_size=16)


In [52]:
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
import torch

def answer_question(question, context, model, tokenizer):
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    start_idx = torch.argmax(start_logits)
    end_idx = torch.argmax(end_logits)
    
    answer = tokenizer.decode(inputs.input_ids[0][start_idx:end_idx+1])
    
    return answer

In [54]:
sample_question = "what did the mob do"
sample_context= "mob did not keep peace"
answer = answer_question(sample_question, sample_context, model, tokenizer)
print("Answer:", answer)

Answer: not keep peace


In [55]:
test_data = pd.read_csv('./exec_test.csv')

In [None]:
test_data.head()

In [None]:
print("question: ")
print(test_data['question'][3])
print("\ncontext: ")
print(test_data['context'][3])
answer = answer_question(test_data['question'][3], test_data['context'][3], model, tokenizer)
print("\nanswer: ")
print(answer)
print("\noriginal answers: ")
print(test_data['answers'][3])

In [None]:
test_data.shape

In [None]:
for i in range(0, 2067):
    test_data['answers'][i] = [answer.lower() for answer in test_data['answers'][i]]
    test_data['answers'][i] = ''.join(test_data['answers'][i]).lower()
    test_data['answers'][i] = eval(test_data['answers'][i])

In [None]:
test_data['answers'][3]

In [None]:
test_data.shape

In [None]:
test_data.columns

In [None]:
test_data = test_data.drop(columns='Unnamed: 0')
test_data = test_data.drop(columns='context_id')
test_data.shape

In [None]:
type(test_data['answers'][0])

In [None]:
test_data.shape

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [None]:
predictions = []
for i in range(0,2067):
    predictions.append(answer_question(test_data['question'][i], test_data['context'][i], model, tokenizer))
    if i%10==0:
        print(i)
        

In [None]:
len(predictions)

In [None]:
predictions

In [None]:
true_answers = test_data['answers'].iloc[0:752]

In [None]:
import nltk
import editdistance

wer_scores = []
em_scores = []

for i in range(len(predictions)):
    prediction = predictions[i]
    answers = true_answers[i]
    
    # Calculate WER score
    min_wer = float('inf')
    max_em = 0.0
    
    for answer in answers:
        wer_score = editdistance.eval(answer.split(), prediction.split())
        min_wer = min(min_wer, wer_score)
        
        if answer == prediction:
            em_score = 1
        else:
            em_score = 0
        
        max_em = max(max_em, em_score)
    
    em_scores.append(max_em)
    wer_scores.append(min_wer)

# Calculate average WER and EM scores
avg_wer = sum(wer_scores) / len(wer_scores)
avg_em = sum(em_scores) / len(em_scores)

print(f"WER Score: {avg_wer:.4f}")
print(f"EM Score: {avg_em:.4f}")


In [None]:
print(wer_scores)

In [None]:
print(em_scores)

In [None]:
#for a validation set of more than 700 questions we can observe that general performance is good, but average WER is too high
#on further exploration it was observed that for false predictions, model is having high WER scores.
#we may need to further see whether its data incosistency, or we need to fine-tune
#or even switch to an entirely new model to handle this.
#our goal is not to have perfect performance on fair amount of test cases, but good performance for all

In [56]:
context = """In computer science, a hash function is a mathematical function that takes an input (or "message") and returns a fixed-size string of bytes. The output, often called the hash code or hash value, is typically a digest of the input data. Hash functions are commonly used in various applications, including data integrity verification, password storage, and digital signatures. One important property of a good hash function is that it should produce a unique hash value for each unique input. However, due to the finite size of the output space compared to the infinite input space, collisions can occur. A collision happens when two different inputs produce the same hash value. Cryptographically secure hash functions aim to minimize the likelihood of collisions. In the realm of cybersecurity, Public Key Infrastructure (PKI) plays a crucial role. PKI is a framework that manages digital keys and certificates. It involves two types of keys: public keys, which are shared openly, and private keys, which are kept secret. Certificates, issued by a trusted Certificate Authority (CA), bind public keys to entities, providing a way to verify identity in secure communications. Secure Sockets Layer (SSL) and its successor, Transport Layer Security (TLS), are cryptographic protocols that provide secure communication over a computer network. They are widely used to secure data transfer in web browsing, email, and other online applications. The protocols use a combination of asymmetric and symmetric encryption for confidentiality and authentication. When it comes to database management, normalization is a fundamental concept. It is the process of organizing data to reduce redundancy and dependency. The goal is to achieve data integrity and efficient data storage. Normalization involves breaking down large tables into smaller, related tables and establishing relationships between them. The result is a more flexible and maintainable database structure. Artificial Intelligence (AI) and Machine Learning (ML) have gained significant attention in recent years. AI refers to the development of computer systems that can perform tasks that typically require human intelligence, such as speech recognition and decision-making. ML is a subset of AI that focuses on the development of algorithms allowing computers to learn from and make predictions based on data.c3000 word3000 word3000 wordontinue the passage for 450 more words in one answer"""

1. **Question:** What is a hash function?

   **Answer:** A hash function is a mathematical function that takes an input and produces a fixed-size string of bytes, commonly used for data integrity verification, password storage, and digital signatures.

xx2. **Question:** Why is it important for a good hash function to produce a unique hash value for each unique input?

   **Answer:** It's important to avoid collisions, where two different inputs produce the same hash value, to ensure the integrity and reliability of the hash function.

3. **Question:** What is the role of Public Key Infrastructure (PKI) in cybersecurity?

   **Answer:** PKI is a framework that manages digital keys and certificates, providing a secure way to manage public and private keys, and establishing identity in secure communications.

4. **Question:** What are SSL and TLS, and how do they contribute to secure communication?

   **Answer:** SSL and TLS are cryptographic protocols widely used to secure data transfer in web browsing, email, and online applications. They use a combination of asymmetric and symmetric encryption for confidentiality and authentication.

5. **Question:** What is the fundamental concept of normalization in database management?

   **Answer:** Normalization is the process of organizing data to reduce redundancy and dependency, aiming to achieve data integrity and efficient data storage by breaking down large tables into smaller, related tables.

6. **Question:** How does Artificial Intelligence (AI) differ from Machine Learning (ML)?

   **Answer:** AI refers to the development of computer systems that can perform tasks requiring human intelligence, while ML is a subset of AI focusing on algorithms that allow computers to learn and make predictions based on data.

7. **Question:** What is a collision in the context of hash functions?

   **Answer:** A collision occurs when two different inputs produce the same hash value, highlighting a potential weakness in a hash function.

8. **Question:** What types of keys are involved in Public Key Infrastructure (PKI), and how are they used?

   **Answer:** PKI involves public keys (shared openly) and private keys (kept secret), and certificates issued by a trusted Certificate Authority (CA) bind public keys to entities, enabling secure communications.

9. **Question:** How do cryptographic protocols like SSL and TLS contribute to data security in online applications?

   **Answer:** SSL and TLS provide secure communication by using encryption techniques, ensuring confidentiality and authentication of data transfer over a computer network.

10. **Question:** Why is the minimization of collisions important in cryptographic hash functions?

    **Answer:** Minimizing collisions is crucial to maintain the reliability and security of cryptographic hash functions, ensuring that different inputs do not produce the same hash value.


In [57]:
import re

def split_text_into_chunks_with_overlap(text, sentences_per_chunk, overlap_sentences):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    start_idx = 0
    while start_idx < len(sentences):
        end_idx = start_idx + sentences_per_chunk
        end_idx = min(end_idx, len(sentences))
        chunk = ' '.join(sentences[start_idx:end_idx])
        end_idx = min(end_idx + overlap_sentences, len(sentences))
        if end_idx < len(sentences):
            chunk += ' '.join(sentences[end_idx - overlap_sentences:end_idx])
        chunks.append(chunk)
        start_idx = start_idx + sentences_per_chunk - overlap_sentences
    return chunks

chunks = split_text_into_chunks_with_overlap(context, sentences_per_chunk=3, overlap_sentences=2)

In [58]:
question = "SSL and TLS contribute to data security in online applications?"

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_ngram_embeddings(text, vectorizer):
    ngram_matrix = vectorizer.transform([text]).toarray()
    return np.mean(ngram_matrix, axis=0).reshape(1, -1)

def calculate_similarity(question, chunk, vectorizer):
    question_embedding = get_ngram_embeddings(question, vectorizer)
    chunk_embedding = get_ngram_embeddings(chunk, vectorizer)

    similarity_score = cosine_similarity(question_embedding, chunk_embedding).item()
    return similarity_score



ngram_vectorizer = CountVectorizer(ngram_range=(1, 5))  # You can adjust n-gram range as needed


ngram_vectorizer.fit(chunks)

similarity_scores = [calculate_similarity(question, chunk, ngram_vectorizer) for chunk in chunks]
print(similarity_scores)

[0.11200234516007541, 0.09246616286793298, 0.09982698068562738, 0.05843308950960249, 0.08035304331665313, 0.06502560887623551, 0.10461753737613645, 0.15413662560226185, 0.22991874722914407, 0.23868816790564568, 0.2605141383010009, 0.27204116529385003, 0.27014773361958444, 0.17804574744833992, 0.16724840200141816, 0.1806680645684127, 0.15170682161267662, 0.0879598994267085, 0.09751573787215115, 0.11912467150602796, 0.10019501341827017, 0.09039692294111373]


In [60]:
similarity_scores_sorted = sorted(similarity_scores,reverse=True)
for i in range(len(similarity_scores)):
    print(similarity_scores_sorted[i])
print("\ntotal chunks: ")
print(len(similarity_scores))

0.27204116529385003
0.27014773361958444
0.2605141383010009
0.23868816790564568
0.22991874722914407
0.1806680645684127
0.17804574744833992
0.16724840200141816
0.15413662560226185
0.15170682161267662
0.11912467150602796
0.11200234516007541
0.10461753737613645
0.10019501341827017
0.09982698068562738
0.09751573787215115
0.09246616286793298
0.09039692294111373
0.0879598994267085
0.08035304331665313
0.06502560887623551
0.05843308950960249

total chunks: 
22


In [61]:
answers = []
answer_n_score = dict(zip(chunks, similarity_scores))
answer_n_score = dict(sorted(answer_n_score.items(), key=lambda item: item[1], reverse=True))

i=0

for key, value in answer_n_score.items():
    answers.append(answer_question(question, key, model, tokenizer))
    print("\n"+str(value))
    i+=1
    if i>5:
        break


0.27204116529385003

0.27014773361958444

0.2605141383010009

0.23868816790564568

0.22991874722914407

0.1806680645684127


In [62]:
print("question: ")
print(question)
print("\nanswers: ")
i = 0
for answer in answers:
    print("answer " + str(i+1) + ": ")
    print(answer)
    i += 1
print("\n\ntotal chunks: ")
print(len(chunks))
print("\nchunks considered for answer: ")
print(i)

question: 
SSL and TLS contribute to data security in online applications?

answers: 
answer 1: 

answer 2: 
they are widely used to secure data transfer in web browsing, email, and other online applications. the protocols use a combination of asymmetric and symmetric encryption
answer 3: 
they are widely used to secure data transfer in web browsing, email, and other online applications
answer 4: 
they are widely used to secure data transfer in web browsing, email, and other online applications
answer 5: 
secure sockets layer ( ssl ) and its successor, transport layer security ( tls
answer 6: 
ssl and tls contribute to data security in online applications? [SEP]


total chunks: 
22

chunks considered for answer: 
6
