#### Import Libraries

In [1]:
import gc
import json

import torch
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http, eval_data_from_json
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.document_store import InMemoryDocumentStore
from haystack.retriever.sparse import TfidfRetriever
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.utils import print_answers
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline

06/28/2021 14:42:06 - INFO - faiss.loader -   Loading faiss with AVX2 support.
06/28/2021 14:42:06 - INFO - faiss.loader -   Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
06/28/2021 14:42:06 - INFO - faiss.loader -   Loading faiss.
06/28/2021 14:42:07 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
06/28/2021 14:42:08 - INFO - faiss.loader -   Loading faiss with AVX2 support.
06/28/2021 14:42:08 - INFO - faiss.loader -   Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
06/28/2021 14:42:08 - INFO - faiss.loader -   Loading faiss.


#### Check Cuda

In [2]:
# Pick the torch device for this session: CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU. The choice is reported on stdout.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # Index 0 is the first visible CUDA device.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050


#### Connecting to the ElasticsearchDocumentStore

In [3]:
# Open the document store on the Elasticsearch node at localhost:9200.
# Documents live in the "document" index; their body is read from the "text"
# field and their name from the "name" field.
# NOTE(review): username/password are empty strings - presumably an
# unauthenticated local development node; confirm before pointing elsewhere.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    username="",
    password="",
    index="document",
    text_field="text",
    name_field="name",
)

06/28/2021 14:42:08 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.011s]
06/28/2021 14:42:08 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.003s]
06/28/2021 14:42:08 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.002s]
06/28/2021 14:42:08 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.008s]
06/28/2021 14:42:08 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.002s]


#### Store our documents in the document store

In [4]:
# Empty the index before (re-)indexing the corpus.
# delete_all_documents() is deprecated: the library prints a warning pointing to
# https://github.com/deepset-ai/haystack/issues/1045 and asks callers to use
# delete_documents() instead.
document_store.delete_documents()

# Convert every text file found under gem_explorer/ into a document dict.
all_docs = convert_files_to_dicts(dir_path="gem_explorer")

# Splitter/cleaner configuration: passages of about 100 words that are not cut
# in the middle of a sentence; empty lines and extra whitespace are cleaned,
# header/footer removal is turned off.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True
)

def clean_unicode(text):
    """Return ``text`` with every zero-width space (U+200B) removed."""
    # Mapping a code point to None in a translate table deletes that character.
    drop_zero_width_space = {ord("\u200b"): None}
    return text.translate(drop_zero_width_space)

# Run each source document through the preprocessor; every call yields a list
# of passages, so the result is a list of lists.
nested_docs = []
for source_doc in all_docs:
    nested_docs.append(preprocessor.process(source_doc))

# Flatten the per-file passage lists into one list, preserving order.
docs = []
for passages in nested_docs:
    docs.extend(passages)

# Strip zero-width spaces from the text of every passage (in place).
for passage in docs:
    passage['text'] = clean_unicode(passage['text'])

# Index the cleaned passages.
document_store.write_documents(docs)

                1. delete_all_documents() method is deprecated, please use delete_documents method
                For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/1045
                
06/28/2021 14:42:08 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:0.081s]
06/28/2021 14:42:10 - INFO - haystack.preprocessor.utils -   Converting gem_explorer\after_applying\acceptance_letter.txt
06/28/2021 14:42:10 - INFO - haystack.preprocessor.utils -   Converting gem_explorer\after_applying\participation_letter.txt
06/28/2021 14:42:10 - INFO - haystack.preprocessor.utils -   Converting gem_explorer\back_ntu\credit_transfer.txt
06/28/2021 14:42:10 - INFO - haystack.preprocessor.utils -   Converting gem_explorer\back_ntu\post_exchange_review.txt
06/28/2021 14:42:10 - INFO - haystack.preprocessor.utils -   Converting gem_explorer\back_ntu\reenroll_courses.txt
06/28/2021 14:42:10 - INFO - haystack.preprocesso

#### Document Store insights

In [5]:
# Ask the store how many documents it currently holds and report the figure.
document_count = document_store.get_document_count()
print("Number of documents in the document store: ", document_count)

06/28/2021 14:42:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_count [status:200 request:0.008s]


Number of documents in the document store:  59


In [6]:
# Fetch the stored documents and display the first one as a sample
# (the bare expression on the last line is rendered as the cell output).
stored_documents = document_store.get_all_documents()
stored_documents[0]

06/28/2021 14:42:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=1d&size=10000 [status:200 request:0.006s]
06/28/2021 14:42:11 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.004s]
06/28/2021 14:42:11 - INFO - elasticsearch -   DELETE http://localhost:9200/_search/scroll [status:200 request:0.004s]


{'text': "Acceptance Letter\nAfter successful application, students would be notified by their host universities regarding their Acceptance Letters. OGEM will notify students to collect the hard copies of their Acceptance Letters accordingly (if available). Type of Acceptance Letter\nSoft copy: By Email from host university/OGEM\n\nHard copy: Collection at OGEM's office\n\nApplication Outcome (including Acceptance Letter) from Host University:\nSemester 1: March to July\n\nSemester 2: August to December", 'score': None, 'probability': None, 'question': None, 'meta': {'_split_id': 0, 'name': 'acceptance_letter.txt'}, 'embedding': None, 'id': 'bfa38af674bcbf9b2c71176f5b17af8'}

#### Initialising the ElasticsearchRetriever

In [7]:
# Retriever that queries the Elasticsearch-backed document store.
# ElasticsearchRetriever is imported in the imports cell at the top of the
# notebook, so every dependency is declared in one place.
retriever = ElasticsearchRetriever(document_store=document_store)

#### Let's test a few queries against the retriever

In [8]:
# Retriever-only smoke test: fetch the two best-matching passages for a query.
question = "How is my application priority considered?"
documents = retriever.retrieve(query=question, top_k=2)

separator = "-" * 100
print("Top retrieved document: \n")
for document in documents:
    print("\nDocument Text:\n", document.text)
    print("\nScore: ", document.score)
    # The probability is scaled by 100 so it is shown as a percentage.
    probability_percent = document.probability * 100
    print("\nProbability: {:.2f}%".format(probability_percent))
    print(separator)


06/28/2021 14:42:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]


Top retrieved document: 


Document Text:
 Allocation Criteria
Before you apply, you should understand how our selection process works so that you can maximise your chances of securing a host university. OGEM's selection process is guided by

the ranking of top three universities in your application

the no. of available spots per semester and no. of applicants to each partner university. the priority of applicants as stated below. The system shall place students on their 1st university choice first, based on set allocation criteria (priority), before moving on to allocate 2nd university choices to students who did not get their 1st university choice.

Score:  8.688457

Probability: 74.76%
----------------------------------------------------------------------------------------------------

Document Text:
 Below are the terms of your priority and how to exercise it:

This guarantee for CN Yang SP, NTU-USP and TSP Scholars refers to any overseas programme managed by OGEM (e.g. 2-week to 

In [9]:
# Retriever-only smoke test: fetch the two best-matching passages for a query.
question = "what is the estimated cost for asian universities"
documents = retriever.retrieve(query=question, top_k=2)

separator = "-" * 100
print("Top retrieved document: \n")
for document in documents:
    print("\nDocument Text:\n", document.text)
    print("\nScore: ", document.score)
    # The probability is scaled by 100 so it is shown as a percentage.
    probability_percent = document.probability * 100
    print("\nProbability: {:.2f}%".format(probability_percent))
    print(separator)


06/28/2021 14:42:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]


Top retrieved document: 


Document Text:
 Do note that these costs vary as they are based on an individual's spending habits and lifestyle. Region	Estimated Cost
Asia	S$12,000 onwards
Oceania	S$18,000 onwards
Americas & Europe	S$20,000 onwards
There are several financial assistance schemes available to students who wish to seek financial support while on exchange. Please see Financial Aid to find out more about awards, scholarships, loans, and grants available for NTU students.

Score:  7.0461354

Probability: 70.70%
----------------------------------------------------------------------------------------------------

Document Text:
 Consumption of Controlled Drugs Overseas
The possession and consumption of illegal drugs outside of Singapore by Singapore citizens and Permanent Residents are still considered offences punishable by Singaporean law. Specifically, Section 8A of the Act criminalises drug consumption overseas by Singapore citizens or permanent residents. This is crucial for 

In [10]:
# Clear CUDA memory before loading the reader model.
# (gc is imported in the imports cell at the top of the notebook.)

# Collect unreachable Python objects first so anything they still hold can be
# released.
gc.collect()

# Then ask PyTorch to release the cached GPU memory it is no longer using.
torch.cuda.empty_cache()

#### Using a pretrained model trained on SQuAD dataset

In [11]:
# Load a reader from the pretrained "distilbert-base-uncased-distilled-squad"
# checkpoint. FARMReader is already imported in the imports cell at the top of
# the notebook, so it is not imported again here.
# Alternative checkpoint noted by the author: deepset/roberta-base-squad2
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)

06/28/2021 14:42:11 - INFO - farm.utils -   Using device: CUDA 
06/28/2021 14:42:11 - INFO - farm.utils -   Number of GPUs: 1
06/28/2021 14:42:11 - INFO - farm.utils -   Distributed Training: False
06/28/2021 14:42:11 - INFO - farm.utils -   Automatic Mixed Precision: None
06/28/2021 14:42:24 - INFO - farm.utils -   Using device: CUDA 
06/28/2021 14:42:24 - INFO - farm.utils -   Number of GPUs: 1
06/28/2021 14:42:24 - INFO - farm.utils -   Distributed Training: False
06/28/2021 14:42:24 - INFO - farm.utils -   Automatic Mixed Precision: None
06/28/2021 14:42:24 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
06/28/2021 14:42:24 - INFO - farm.infer -    0    0    0    0    0    0    0 
06/28/2021 14:42:24 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
06/28/2021 14:42:24 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
06/28/2021 14:42:24 - INFO - farm.infer -               


#### Fine-tuning the model with our own domain-specific data

In [12]:
# Location of the annotated training file: <train_data>/<train_filename>.
# Point train_data at "PATH/TO_YOUR/TRAIN_DATA" to train on a different set.
train_data = "data"
train_filename = "answers.json"

# Fine-tune the reader for 5 epochs on the GPU and save the result to my_model.
reader.train(
    data_dir=train_data,
    train_filename=train_filename,
    n_epochs=5,
    use_gpu=True,
    save_dir="my_model",
)

06/28/2021 14:42:24 - INFO - farm.utils -   Using device: CUDA 
06/28/2021 14:42:24 - INFO - farm.utils -   Number of GPUs: 1
06/28/2021 14:42:24 - INFO - farm.utils -   Distributed Training: False
06/28/2021 14:42:24 - INFO - farm.utils -   Automatic Mixed Precision: None
Preprocessing Dataset data\answers.json: 100%|█████████████████████████████████████| 25/25 [00:06<00:00,  4.04 Dicts/s]
06/28/2021 14:42:32 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 1e-05}'
06/28/2021 14:42:32 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
06/28/2021 14:42:32 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_training_steps': 170, 'num_warmup_steps': 34}'
Train epoch 0/4 (Cur. train loss: 2.3700): 100%|███████████████████████████████████████| 34/34 [00:21<00:00,  1.56it/s]
Train epoch 1/4 (Cur. train loss: 1.4979): 100%|██

In [13]:
# Load the fine-tuned model saved under my_model. This rebinds `reader`, so
# the cells below use the fine-tuned model rather than the pretrained one.

reader = FARMReader(model_name_or_path="my_model")

06/28/2021 14:44:43 - INFO - farm.utils -   Using device: CUDA 
06/28/2021 14:44:43 - INFO - farm.utils -   Number of GPUs: 1
06/28/2021 14:44:43 - INFO - farm.utils -   Distributed Training: False
06/28/2021 14:44:43 - INFO - farm.utils -   Automatic Mixed Precision: None
06/28/2021 14:44:44 - INFO - farm.utils -   Using device: CUDA 
06/28/2021 14:44:44 - INFO - farm.utils -   Number of GPUs: 1
06/28/2021 14:44:44 - INFO - farm.utils -   Distributed Training: False
06/28/2021 14:44:44 - INFO - farm.utils -   Automatic Mixed Precision: None
06/28/2021 14:44:44 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
06/28/2021 14:44:44 - INFO - farm.infer -    0    0    0    0    0    0    0 
06/28/2021 14:44:44 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
06/28/2021 14:44:44 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
06/28/2021 14:44:44 - INFO - farm.infer -               


In [14]:
# Combine the retriever and the reader into one extractive QA pipeline.

pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [15]:
# top_k values intended for the retriever and the reader.
# NOTE(review): both names are assigned again in the prediction-function cell
# further down (top_k_reader becomes 3 there) before the pipeline reads them.
top_k_retriever, top_k_reader = 5, 2

#### Let's test the pipeline

In [16]:
# Retriever-only check for the question that is sent through the full pipeline
# further down: fetch the two best-matching passages.
question = "Do senior students get any priority?"
documents = retriever.retrieve(query=question, top_k=2)

separator = "-" * 100
print("Top retrieved document: \n")
for document in documents:
    print("\nDocument Text:\n", document.text)
    print("\nScore: ", document.score)
    # The probability is scaled by 100 so it is shown as a percentage.
    probability_percent = document.probability * 100
    print("\nProbability: {:.2f}%".format(probability_percent))
    print(separator)

06/28/2021 14:44:46 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]


Top retrieved document: 


Document Text:
 Allocation Criteria
Before you apply, you should understand how our selection process works so that you can maximise your chances of securing a host university. OGEM's selection process is guided by

the ranking of top three universities in your application

the no. of available spots per semester and no. of applicants to each partner university. the priority of applicants as stated below. The system shall place students on their 1st university choice first, based on set allocation criteria (priority), before moving on to allocate 2nd university choices to students who did not get their 1st university choice.

Score:  6.946327

Probability: 70.44%
----------------------------------------------------------------------------------------------------

Document Text:
 Please do not make any payment fee for any administrative matter including visa, air tickets and accommodation until you are advised to do so.

Score:  5.942872

Probability: 67.76%
-

#### Prediction Function

In [17]:
# Module-level settings that print_answer passes to pipe.run on every call:
# the top_k for the retriever and the top_k for the reader.
top_k_retriever, top_k_reader = 5, 3
def print_answer(question, top_answer= False):
    """Run the QA pipeline on ``question`` and print the extracted answers.

    The module-level ``pipe``, ``top_k_retriever`` and ``top_k_reader`` are
    read when the function is called, so they must be defined beforehand.

    Args:
        question: Query string forwarded to ``pipe.run``.
        top_answer: When true, only the best answer is printed. When false
            (the default), the best answer is followed by every returned
            answer together with its context, source document name and score.

    Returns:
        The string ``"No answers found"`` when the pipeline returns no answers
        (nothing is printed in that case); ``None`` otherwise.
    """
    prediction = pipe.run(query=question, top_k_retriever=top_k_retriever, top_k_reader=top_k_reader)
    answers = prediction['answers']
    if not answers:
        return "No answers found"

    separator = "-" * 100
    print("Top Answer = ", answers[0]['answer'])
    print(separator)
    if top_answer:
        return None

    for answer in answers:
        ans = answer["answer"]
        context = answer["context"]
        # An answer may carry no metadata, or metadata without a "name" entry
        # (presumably the case for "no answer" predictions - TODO confirm);
        # print None for the name instead of raising.
        meta = answer.get("meta") or {}
        document_name = meta.get("name")
        score = answer["score"]
        print("Answer: \n", ans)
        print("\nContext: \n", context)
        print("\nDocument_name: \n", document_name)
        print("\nScore: \n", score)
        print(separator)
    return None

In [18]:
# Send the question through the full retriever + reader pipeline. With the
# default top_answer=False, the best answer is printed first, followed by every
# returned answer with its context, source document name and score.
print_answer("Do senior students get any priority?")

06/28/2021 14:44:48 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.004s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 31.33 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.57 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.06 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 34.57 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.13 Batches/s]

Top Answer =  Students at a more senior level with similar merits shall have a higher chance of allocation.
----------------------------------------------------------------------------------------------------
Answer: 
 Students at a more senior level with similar merits shall have a higher chance of allocation.

Context: 
 s for last competitive spot)
Students at a more senior level with similar merits shall have a higher chance of allocation. Students with prior oversea

Document_name: 
 allocation_criteria.txt

Score: 
 11.602807998657227
----------------------------------------------------------------------------------------------------
Answer: 
 Please do not make any payment fee for any administrative matter including visa, air tickets and accommodation until you are advised to do so.

Context: 
 Please do not make any payment fee for any administrative matter including visa, air tickets and accommodation until you are advised to do so.

Document_name: 
 dates_and_deadlines.txt

S


