In [None]:
!pip install --quiet transformers tqdm tensorboard datasets sklearn accelerate sentence-transformers 

[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
[K     |████████████████████████████████| 451 kB 41.6 MB/s 
[K     |████████████████████████████████| 175 kB 69.3 MB/s 
[K     |████████████████████████████████| 85 kB 2.7 MB/s 
[K     |████████████████████████████████| 182 kB 55.7 MB/s 
[K     |████████████████████████████████| 7.6 MB 43.9 MB/s 
[K     |████████████████████████████████| 115 kB 70.8 MB/s 
[K     |████████████████████████████████| 212 kB 72.5 MB/s 
[K     |████████████████████████████████| 127 kB 61.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 48.9 MB/s 
[?25h  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/Master-Thesis-SLPL/multidoc-conv-qa
%cd multidoc-conv-qa
!git checkout ali
%cd src/retriever/notebooks/

Cloning into 'multidoc-conv-qa'...
remote: Enumerating objects: 781, done.[K
remote: Counting objects: 100% (247/247), done.[K
remote: Compressing objects: 100% (175/175), done.[K
remote: Total 781 (delta 114), reused 182 (delta 62), pack-reused 534[K
Receiving objects: 100% (781/781), 60.27 MiB | 22.42 MiB/s, done.
Resolving deltas: 100% (442/442), done.
/content/multidoc-conv-qa
Branch 'ali' set up to track remote branch 'ali' from 'origin'.
Switched to a new branch 'ali'
/content/multidoc-conv-qa/src/retriever/notebooks


# 1

In [3]:
import json
import gzip
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import tqdm
import os
from sentence_transformers import util


In [12]:
docs_path = '../../../dataset/multidoc2dial/v1.0/multidoc2dial_doc.json'
train_path = '../../../dataset/multidoc2dial/v1.0/multidoc2dial_dial_train.json'
eval_path = '../../../dataset/multidoc2dial/v1.0/multidoc2dial_dial_validation.json'

In [23]:

def construct_questions_dataset_list(dial_data_path):
    query_dataset = []

    with open(dial_data_path, 'r') as f:
        questions_dataset = json.load(f)['dial_data']
    
    for domain, domain_dials in questions_dataset.items():
        for dial in domain_dials:
            for i, turn in enumerate(dial['turns'][:-1]):
                if turn['role'] == 'user':
                    if dial['turns'][i+1]['role'] == 'agent':
                        agent_turn = dial['turns'][i+1]
                        query = dial['turns'][i]['utterance']
                        doc_id = agent_turn['references'][0]['doc_id']
                        span_id = agent_turn['references'][0]['id_sp']
                        section_text = docs_data[domain][doc_id]['spans'][span_id]['text_sec']

                        query_dataset.append({
                            'query': query,
                            'para': section_text
                        })
                    else:
                        continue
    return query_dataset


train_questions_dataset = construct_questions_dataset_list(train_path)
val_questions_dataset = construct_questions_dataset_list(eval_path)


In [31]:

def construct_validation_dataset(dial_data_path):
    query_dataset = []

    with open(dial_data_path, 'r') as f:
        questions_dataset = json.load(f)['dial_data']
    
    for domain, domain_dials in questions_dataset.items():
        for dial in domain_dials:
            for i, turn in enumerate(dial['turns'][:-1]):
                if turn['role'] == 'user':
                    if dial['turns'][i+1]['role'] == 'agent':
                        agent_turn = dial['turns'][i+1]
                        query = dial['turns'][i]['utterance']
                        doc_id = agent_turn['references'][0]['doc_id']
                        span_id = agent_turn['references'][0]['id_sp']
                        section_text = docs_data[domain][doc_id]['spans'][span_id]['text_sec']

                        query_dataset.append({
                            'query': query,
                            'para': section_text,
                            'doc_id': doc_id
                        })
                    else:
                        continue
    return query_dataset


val_questions_dataset = construct_validation_dataset(eval_path)

In [34]:

for vv in val_questions_dataset:
    print(vv)
    break

{'query': 'My insurance ended so what should i do', 'para': 'Because we all pay indirectly for crashes involving uninsured motorists , New York State requires every motorist to maintain auto insurance every single day a vehicle is registered. DMV works with insurance companies to electronically monitor your insurance coverage , and we know when coverage is dropped for any reason. When that happens , we mail you an insurance inquiry letter to allow you to clear up the problem. ', 'doc_id': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}


In [4]:
import nltk
import nltk.data
nltk.download('punkt')
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def break_section(section_text, max_length=200):
    paragraphs = []
    current_paragraph = []
    sentences = nltk_tokenizer.tokenize(section_text)
    prev_sentence = ""
    for sentence in sentences:
        # current_paragraph = [prev_sentence]
        new_paragraph = current_paragraph + [sentence]
        if len(" ".join(new_paragraph).split()) > max_length:
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = [prev_sentence, sentence]
            if len(" ".join(current_paragraph).split()) > max_length:
                print("FUCK"*100)
        else:
            current_paragraph = new_paragraph
        prev_sentence = sentence
    paragraphs.append(" ".join(current_paragraph))
    return paragraphs


def merge_sections(sections, max_length=180):
    paragraphs = []
    current_paragraph = []
    sentences = sections
    current_sentence = ""
    for sentence in sentences:
        current_sentence += "\n" + sentence
        if len(current_sentence.split()) > max_length:
            paragraphs.append(current_sentence)
            current_sentence = ""
    return paragraphs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
words = 0

for domain, domain_data in docs_data.items():
    for doc_id, doc_data in tqdm.tqdm(domain_data.items()):
        processed_section_ids = []
        for span_id, span in doc_data['spans'].items():
            if span['id_sec'] not in processed_section_ids:
                processed_section_ids.append(span['id_sec'])
                words += 1
print(words)

NameError: ignored

In [None]:
print(400000/180)

2222.222222222222


In [30]:
with open(docs_path, 'r') as f:
    docs_data = json.load(f)['doc_data']

doc_ids = []
paragraphs = []


for domain, domain_data in docs_data.items():
    for doc_id, doc_data in tqdm.tqdm(domain_data.items()):
        processed_section_ids = []
        doc_sections = []
        doc_paragraphs = []
        for span_id, span in doc_data['spans'].items():
            if span['id_sec'] not in processed_section_ids:
                processed_section_ids.append(span['id_sec'])
                section_text = span['text_sec']
                doc_sections.append(section_text)
                if len(section_text.split()) < 300:
                    doc_paragraphs.append(span['text_sec'])
                else:
                    new_paragraphs = break_section(section_text)
                    doc_paragraphs += new_paragraphs
        new_paragraphs = merge_sections(doc_paragraphs)
        paragraphs += new_paragraphs
        doc_ids += [doc_id]*len(new_paragraphs)

print()
print(len(paragraphs))
print(len(doc_ids))

100%|██████████| 109/109 [00:00<00:00, 5169.25it/s]
100%|██████████| 138/138 [00:00<00:00, 4038.44it/s]
100%|██████████| 149/149 [00:00<00:00, 3722.98it/s]
100%|██████████| 92/92 [00:00<00:00, 3802.82it/s]


1712
1712





merging small paragraphs with \n

In [None]:
lens = [len(x.split()) for x in paragraphs]
print(max(lens))
for _len in lens[:100]:
    print(_len)

In [None]:
tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model.eval()

#Select the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Parameters for generation
batch_size = 8 #Batch size
num_queries = 8 #Number of queries to generate for every paragraph
max_length_paragraph = 300 #Max length for paragraph
max_length_query = 64   #Max length for output query

gen_data = []
# Now for every paragraph in our corpus, we generate the queries
for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
    sub_paragraphs = paragraphs[start_idx:start_idx+batch_size]
    inputs = tokenizer.prepare_seq2seq_batch(sub_paragraphs, max_length=max_length_paragraph, truncation=True, return_tensors='pt').to(device)
    outputs = model.generate(
        **inputs,
        max_length=max_length_query,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=num_queries)

    for idx, out in enumerate(outputs):
        query = tokenizer.decode(out, skip_special_tokens=True)
        para = sub_paragraphs[int(idx/num_queries)]
        gen_data.append({
            "query": query,
            "para": para
        })


  0%|          | 0/214 [00:02<?, ?it/s]


RuntimeError: ignored

# 2

In [5]:
import json
with open('gen_data.json') as f:
    gen_data = json.load(f)['gen_data']

In [24]:
train_data = gen_data + train_questions_dataset

print(len(train_data))

36502


In [25]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
import os

train_examples = []

print(gen_data[9]['query'])
print(gen_data[9]['para'])

for gen in train_data:
    query, paragraph = gen['query'], gen['para']
    train_examples.append(InputExample(texts=[query, paragraph]))

# For the MultipleNegativesRankingLoss, it is important
# that the batch does not contain duplicate entries, i.e.
# no two equal queries and no two equal paragraphs.
# To ensure this, we use a special data loader
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=8)

# Now we create a SentenceTransformer model from scratch
word_emb = models.Transformer('distilbert-base-uncased')
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_emb, pooling])

# MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
# and trains the model so that is is suitable for semantic search
train_loss = losses.MultipleNegativesRankingLoss(model)

#Tune the model
num_epochs = 10
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, show_progress_bar=True)

os.makedirs('output', exist_ok=True)
model.save('output/programming-model')

how do you qualify for widows social security benefits



For Your Widow Or Widower 

There are about five million widows and widowers receiving monthly Social Security benefits based on their deceased spouse's earnings record. And , for many of those survivors, particularly aged women, those benefits are keeping them out of poverty. 
Widows and widowers can receive : reduced benefits as early as age 60 or full benefits at full retirement age or older. benefits as early as age 50 if they're disabled AND their disability started before or within seven years of your death. benefits at any age , if they have not remarried , and if they take care of your child who is under age 16 or disabled and receives benefits on your record. 
If applying for disability benefits on a deceased worker s record , they can speed up the application process if they complete an Adult Disability Report and have it available at the time of their appointment. 
We use the same definition of disability for widows a

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4562 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4562 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4562 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4562 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4562 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [27]:

os.makedirs('output', exist_ok=True)
model.save('output/genq-model')

# 3

In [28]:

from sentence_transformers import SentenceTransformer, util
import gzip
import json
import os

# Load the model we trained in 2_programming_train_bi-encoder.py
model = SentenceTransformer('output/genq-model')

# Load the corpus
docs = []
corpus_filepath = 'wiki-programmming-20210101.jsonl.gz'
if not os.path.exists(corpus_filepath):
    util.http_get('https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz', corpus_filepath)

with gzip.open(corpus_filepath, 'rt') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        title = data['title']
        for p in data['paragraphs']:
            if len(p) > 100:    #Only take paragraphs with at least 100 chars
                docs.append((title, p))


  0%|          | 0.00/173k [00:00<?, ?B/s]

In [29]:
print(docs[0])

('C (programming language)', 'C (, as in the letter "c") is a general-purpose, procedural computer programming language supporting structured programming, lexical variable scope, and recursion, with a static type system. By design, C provides constructs that map efficiently to typical machine instructions. It has found lasting use in applications previously coded in assembly language. Such applications include operating systems and various application software for computer architectures that range from supercomputers to PLCs and embedded systems.')


In [None]:
for doc in docs:
    print(doc)

('C (programming language)', 'C (, as in the letter "c") is a general-purpose, procedural computer programming language supporting structured programming, lexical variable scope, and recursion, with a static type system. By design, C provides constructs that map efficiently to typical machine instructions. It has found lasting use in applications previously coded in assembly language. Such applications include operating systems and various application software for computer architectures that range from supercomputers to PLCs and embedded systems.')
('C (programming language)', 'A successor to the programming language "B", C was originally developed at Bell Labs by Dennis Ritchie between 1972 and 1973 to construct utilities running on Unix. It was applied to re-implementing the kernel of the Unix operating system. During the 1980s, C gradually gained popularity. It has become one of the most widely used programming languages, with C compilers from various vendors available for the major

In [None]:

paragraph_emb = model.encode([d[1] for d in docs], convert_to_tensor=True)

In [None]:
paragraph_emb.shape

torch.Size([10, 768])

In [38]:
docs = list(zip(doc_ids, paragraphs))

In [36]:
paragraph_emb = model.encode([d[1] for d in docs], convert_to_tensor=True)

In [46]:

print("Available Docs:")
# print("\n".join(sorted(list(set([d[0] for d in docs])))))
print(len(docs))

# Example for semantic search

ats = {
    0: 0,
    1: 0,
    3: 0,
    5: 0,
    10: 0
}

for val_q in tqdm.notebook.tqdm(val_questions_dataset):
    real_doc_id = val_q['doc_id']
    query = val_q['query']
    query_emb = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, paragraph_emb, top_k=10)[0]
    hit_doc_ids = [docs[hit['corpus_id']][0] for hit in hits]
    ats[0] += 1
    for at_k in ats.keys():
        if real_doc_id in hit_doc_ids[:at_k]:
            ats[at_k] += 1

    # for hit in hits:
    #     print(hit)
    #     doc = docs[hit['corpus_id']]
    #     print("{:.2f}\t{}\t\t{}".format(hit['score'], doc[0], doc[1]))

    # print("\n=================\n")

Available Docs:
1712


  0%|          | 0/4427 [00:00<?, ?it/s]

In [47]:
ats

{0: 4427, 1: 963, 3: 1529, 5: 1802, 10: 2170}

In [None]:
!zip -r hub.zip /root/.cache/huggingface/hub/

  adding: root/.cache/huggingface/hub/ (stored 0%)
  adding: root/.cache/huggingface/hub/version.txt (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/ (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/refs/ (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/refs/main (deflated 5%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/.no_exist/ (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/.no_exist/5dd8dd401d24332c17e40015e9792ee31f3ced91/ (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/.no_exist/5dd8dd401d24332c17e40015e9792ee31f3ced91/added_tokens.json (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t5-large-v1/blobs/ (stored 0%)
  adding: root/.cache/huggingface/hub/models--BeIR--query-gen-msmarco-t

In [None]:
from google.colab import drive
drive.mount('/content/drive')