# P@K for ElasticSearch responses

In [9]:
import pandas as pd
from elasticsearch import Elasticsearch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import nltk
import re

## Read Data and init variables

In [2]:
# Fetch the NLTK stopword corpus and build the shared preprocessing helpers
# that the query builders in this notebook read as globals.
nltk.download('stopwords')
# kept as a set because the query builders test membership once per token
stop_words = set(stopwords.words('english'))
# \w+ keeps only runs of word characters, so tokenizing also strips punctuation
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\embis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# WordNetLemmatizer loads the WordNet corpus lazily on the first lemmatize() call;
# download it here (like the stopword list) so build_query_stemmed does not fail
# with LookupError on a machine where the corpus was never installed.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [4]:
DATA_PATH = "../data/data_clean.csv"
# The ';'-separated file is read without a header row, so the column labels are
# attached right after parsing.
df = pd.read_csv(DATA_PATH, sep=";", header=None).set_axis(
    ["source", "question", "answer", "category", "qId", "text_body"], axis=1
)

In [5]:
# Preview the first rows to check that the assigned column names line up with the data.
df.head()

Unnamed: 0,source,question,answer,category,qId,text_body
0,anwki,Which american journalist was born on October ...,E. Thomas Wood,HUM:ind,5954080,"E. Thomas Wood (born October 9, 1963) is an Am..."
1,weigelt-greffin,What is Armada de Molucca?,Spanish fleet,ENTY:other,12027162,The Armada de Molucca was the name of the Span...
2,anwki,What percent of the population are unable to e...,Approximately 65 percent,NUM:count,165423,Approximately 65 percent of the adult populati...
3,anwki,During which conflict was I-94 built?,World War II,ENTY:other,8687,Henry Ford built it to link the factories at W...
4,anwki,Where is Saraiki usually spoken?,south Punjab,LOC:other,24751,"Saraiki is mostly spoken in south Punjab, and ..."


In [6]:
# Check row count, dtypes and missing values before the evaluation relies on "question" and "qId".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1585 entries, 0 to 1584
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     1585 non-null   object
 1   question   1585 non-null   object
 2   answer     1585 non-null   object
 3   category   1585 non-null   object
 4   qId        1585 non-null   int64 
 5   text_body  1585 non-null   object
dtypes: int64(1), object(5)
memory usage: 74.4+ KB


In [5]:
# Client for the local ElasticSearch node; every query builder below uses this global.
es = Elasticsearch("http://localhost:9200")
# ping() is printed as a quick connectivity check before any query is sent
print("connected to ElasticSearch:",es.ping())

connected to ElasticSearch: True


## Queries for ElasticSearch

In [17]:
# baseline query: plain full-text match on the "text" field, no preprocessing
def build_query(query, k):
    """Search the ``wikibase`` index with an unmodified ``match`` query.

    Args:
        query: Question string, sent to ElasticSearch as-is.
        k: Maximum number of hits to request (passed as ``size``).

    Returns:
        The response returned by ``es.search``.
    """
    text_clause = {"query": query, "operator": "or"}
    return es.search(index="wikibase", query={"match": {"text": text_clause}}, size=k)

# boosted query: matches on title and text, with extra weight on the title
def build_query_boosted(query, k, title_boost=1.2):
    """Search ``wikibase`` with a ``multi_match`` query that boosts the title field.

    Args:
        query: Question string, sent to ElasticSearch as-is.
        k: Maximum number of hits to request (passed as ``size``).
        title_boost: Weight applied to the ``title`` field relative to ``text``.
            Exposed as a parameter so different boosts can be compared without
            editing and re-running this definition; the default keeps the
            previous hard-coded value.

    Returns:
        The response returned by ``es.search``.
    """
    my_query = {
        "multi_match": {
            "query": query,
            # "field^factor" is the per-field boost syntax
            "fields": [f"title^{title_boost}", "text"]
        }
    }
    return es.search(index="wikibase", query=my_query, size=k)

def build_query_removed_stopwords(query, k):
    """Search ``wikibase`` with a ``match`` query after dropping English stopwords.

    The question is split with the shared ``tokenizer`` (which also removes
    punctuation); tokens whose lowercase form is in ``stop_words`` are discarded
    and the rest are re-joined with single spaces.

    Args:
        query: Question string.
        k: Maximum number of hits to request (passed as ``size``).

    Returns:
        The response returned by ``es.search``.
    """
    kept_tokens = (tok for tok in tokenizer.tokenize(query) if tok.lower() not in stop_words)
    text_clause = {"query": ' '.join(kept_tokens)}
    return es.search(index="wikibase", query={"match": {"text": text_clause}}, size=k)

# boosted query on the stopword-filtered question
# (the misspelled function name is kept so existing callers continue to work)
def build_query_removed_stopwords_bossted(query, k, title_boost=1.1):
    """Search ``wikibase`` with a title-boosted ``multi_match`` query, stopwords removed.

    Args:
        query: Question string; it is tokenized with the shared ``tokenizer`` and
            tokens whose lowercase form is in ``stop_words`` are dropped.
        k: Maximum number of hits to request (passed as ``size``).
        title_boost: Weight applied to the ``title`` field relative to ``text``.
            Exposed as a parameter so boosts can be compared without editing the
            definition; the default keeps the previous hard-coded value.

    Returns:
        The response returned by ``es.search``.
    """
    query_tokenized = tokenizer.tokenize(query)
    filtered_query = ' '.join(w for w in query_tokenized if w.lower() not in stop_words)
    my_query = {
        "multi_match": {
            "query": filtered_query,
            # "field^factor" is the per-field boost syntax
            "fields": [f"title^{title_boost}", "text"]
        }
    }
    return es.search(index="wikibase", query=my_query, size=k)

# query with digits and punctuation stripped from the question
def build_query_token(query, k):
    """Search ``wikibase`` with a ``match`` query on a digit- and punctuation-free question.

    Args:
        query: Question string.
        k: Maximum number of hits to request (passed as ``size``).

    Returns:
        The response returned by ``es.search``.
    """
    # first delete every run of digits, then let the \w+ tokenizer drop punctuation
    without_digits = re.sub('[0-9]+', '', query)
    cleaned_question = ' '.join(tokenizer.tokenize(without_digits))
    text_clause = {"query": cleaned_question, "operator": "or"}
    return es.search(index="wikibase", query={"match": {"text": text_clause}}, size=k)

def build_query_stemmed(query, k, fuzziness=1):
    """Search ``wikibase`` with a ``match`` query on the lemmatized question.

    Despite the name, the tokens are lemmatized with the shared
    ``lemmatizer`` rather than stemmed.

    Args:
        query: Question string; it is split with the shared ``tokenizer``.
        k: Maximum number of hits to request (passed as ``size``).
        fuzziness: Value for the ``fuzziness`` option of the match query. Pass
            ``None`` to send the query without any fuzziness setting. Exposed as
            a parameter so settings can be compared without editing the
            definition; the default keeps the previous hard-coded value.

    Returns:
        The response returned by ``es.search``.
    """
    lemmas = [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(query)]
    text_clause = {"query": ' '.join(lemmas)}
    if fuzziness is not None:
        text_clause["fuzziness"] = fuzziness
    return es.search(index="wikibase", query={"match": {"text": text_clause}}, size=k)

def build_query_boolean(query, k):
    """Search ``wikibase`` with a ``bool`` query combining three ``should`` clauses.

    The question is tokenized with the shared ``tokenizer`` and tokens whose
    lowercase form is in ``stop_words`` are dropped. The remaining text is
    used in three clauses on the ``text`` field: a plain ``match``, a ``match``
    with operator ``and``, and a ``match_phrase`` with boost 2.

    Args:
        query: Question string.
        k: Maximum number of hits to request (passed as ``size``).

    Returns:
        The response returned by ``es.search``.
    """
    kept_tokens = []
    for token in tokenizer.tokenize(query):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    cleaned_question = ' '.join(kept_tokens)

    any_term_clause = {"match": {"text": {"query": cleaned_question}}}
    all_terms_clause = {"match": {"text": {"query": cleaned_question, "operator": "and"}}}
    phrase_clause = {"match_phrase": {"text": {"query": cleaned_question, "boost": 2}}}

    bool_query = {"bool": {"should": [any_term_clause, all_terms_clause, phrase_clause]}}
    return es.search(index="wikibase", query=bool_query, size=k)

In [11]:
# look up a document by its _id
def build_query_id(query):
    """Search the ``wikibase`` index for the document whose ``_id`` matches ``query``.

    Args:
        query: Document id to match against the ``_id`` field.

    Returns:
        The response returned by ``es.search``.
    """
    id_clause = {"_id": query}
    return es.search(index="wikibase", query={"match": id_clause})

In [7]:
# helper function to show results
def show_results(res):
    """Print the document id and relevance score of every hit in a search response.

    Args:
        res: Search response shaped like ``{"hits": {"hits": [...]}}`` where each
            hit has an ``"_id"`` and a ``"_score"`` entry.
    """
    for hit in res["hits"]["hits"]:
        # Only id and score are printed, so "_source" is not read: requiring it
        # would raise KeyError for hits that do not carry a source document.
        print("ID: " , hit["_id"] , " Score: " , hit["_score"])

In [8]:
# evaluate a query builder: for which share of questions is the source document in the top k hits?
def precision_at_100(es_query):
    """Evaluate ``es_query`` on every question in ``df`` and print P@k for several cutoffs.

    For each row of the global ``df`` the question is sent through ``es_query``
    with 100 requested hits. A question counts as a success at cutoff ``k`` when
    the row's ``qId`` (compared as a string) equals the ``_id`` of one of the
    first ``k`` hits. The printed "P@k" is therefore the fraction of questions
    whose document was found within the top ``k`` results.

    Args:
        es_query: Callable ``(question, k)`` returning a search response shaped
            like ``{"hits": {"hits": [{"_id": ...}, ...]}}``.

    Returns:
        None. Results are printed.
    """
    cutoffs = (1, 2, 3, 4, 5, 10, 20, 50, 100)
    found_within = dict.fromkeys(cutoffs, 0)

    counter = 0
    for question, answer_id in zip(df["question"].values, df["qId"].values):
        counter += 1
        answer_id = str(answer_id)
        res = es_query(question, 100)

        ranked_ids = [str(hit["_id"]) for hit in res["hits"]["hits"]]

        # Compare ids by equality at every cutoff. Testing `answer_id in ranked_ids[0]`
        # would be a substring check on a string ("8687" is "in" "18687") and would
        # overcount P@1.
        for cutoff in cutoffs:
            if answer_id in ranked_ids[:cutoff]:
                found_within[cutoff] += 1

        if counter % 500 == 0:
            print(f"{counter} answers processed")

    print("\n")

    print(f"{counter} answers procesed")
    if counter == 0:
        # nothing was evaluated; avoid dividing by zero below
        return
    for cutoff in cutoffs:
        print(f"P@{cutoff} = {found_within[cutoff]/counter}")

In [10]:
# Demo of the preprocessing done in build_query_token, applied to one sample question
query = "Which american journalist was born on October 9, 1963?"
# delete every run of digits ("9" and "1963" disappear)
query_numbers_removed = re.sub('[0-9]+', '', query)
# the \w+ tokenizer then drops the remaining punctuation
query_tokenized = tokenizer.tokenize(query_numbers_removed)
print(query_tokenized)

['Which', 'american', 'journalist', 'was', 'born', 'on', 'October']


# Test different ElasticSearch queries
- Best P@1 score
    - normal boosted query with title^1.1 (the boolean query reaches the highest P@50 and P@100)

### Normal Query
- without any preprocessing

In [14]:
# Baseline: the question text is sent unchanged in a plain match query on the text field.
precision_at_100(build_query)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.5981072555205047
P@2 = 0.6902208201892744
P@3 = 0.7287066246056783
P@4 = 0.7495268138801262
P@5 = 0.7697160883280757
P@10 = 0.8063091482649842
P@20 = 0.8422712933753943
P@50 = 0.8719242902208202
P@100 = 0.8965299684542587


### Query with punctuation and numbers removed

In [18]:
# Digits and punctuation are stripped from each question before it is sent.
precision_at_100(build_query_token)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.580441640378549
P@2 = 0.6725552050473186
P@3 = 0.7066246056782335
P@4 = 0.7255520504731862
P@5 = 0.7470031545741325
P@10 = 0.783596214511041
P@20 = 0.8252365930599369
P@50 = 0.8624605678233438
P@100 = 0.8883280757097792


### Boosted Query
- boosted title field

In [15]:
# boosted title^1.1
# NOTE(review): build_query_boosted as defined in this notebook uses title^1.2, so this
# output presumably comes from an earlier version of the function with title^1.1 and
# would not be reproduced by Restart & Run All - confirm.
precision_at_100(build_query_boosted)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.6063091482649843
P@2 = 0.6990536277602524
P@3 = 0.7369085173501577
P@4 = 0.7558359621451104
P@5 = 0.7772870662460568
P@10 = 0.8145110410094637
P@20 = 0.849211356466877
P@50 = 0.8813880126182966
P@100 = 0.9053627760252366


In [17]:
# boosted title^1.2 (the boost used by the build_query_boosted definition in this notebook)
precision_at_100(build_query_boosted)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.6050473186119874
P@2 = 0.6965299684542586
P@3 = 0.7356466876971609
P@4 = 0.7564668769716089
P@5 = 0.7766561514195583
P@10 = 0.8151419558359622
P@20 = 0.8504731861198738
P@50 = 0.8813880126182966
P@100 = 0.9066246056782334


### Normal Query with stopword removal
- removed stopwords in question

In [25]:
# English stopwords are dropped from each question; plain match query on the text field.
precision_at_100(build_query_removed_stopwords)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.5917981072555205
P@2 = 0.6788643533123029
P@3 = 0.7167192429022082
P@4 = 0.7381703470031545
P@5 = 0.7564668769716089
P@10 = 0.7936908517350157
P@20 = 0.8347003154574133
P@50 = 0.865615141955836
P@100 = 0.889589905362776


### Boosted Query with stopword removal

In [27]:
# title^1.1: stopwords removed, multi_match on title and text with the title boosted
precision_at_100(build_query_removed_stopwords_bossted)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.6012618296529968
P@2 = 0.689589905362776
P@3 = 0.7261829652996845
P@4 = 0.7488958990536277
P@5 = 0.7652996845425868
P@10 = 0.8031545741324921
P@20 = 0.8441640378548896
P@50 = 0.8757097791798107
P@100 = 0.8990536277602523


### Boolean Query

In [28]:
# NOTE(review): this output is identical to the "with stopwords removal" run further down,
# so both presumably executed the same build_query_boolean definition - confirm whether a
# run without stopword removal was intended here.
precision_at_100(build_query_boolean)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.5873817034700315
P@2 = 0.6883280757097792
P@3 = 0.717981072555205
P@4 = 0.7394321766561515
P@5 = 0.759621451104101
P@10 = 0.8069400630914827
P@20 = 0.8384858044164037
P@50 = 0.885173501577287
P@100 = 0.910410094637224


In [30]:
# with stopwords removal (build_query_boolean as defined in this notebook filters
# stopwords before building the bool query)
precision_at_100(build_query_boolean)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.5873817034700315
P@2 = 0.6883280757097792
P@3 = 0.717981072555205
P@4 = 0.7394321766561515
P@5 = 0.759621451104101
P@10 = 0.8069400630914827
P@20 = 0.8384858044164037
P@50 = 0.885173501577287
P@100 = 0.910410094637224


### Normal Query with Lemmatization

In [44]:
# NOTE(review): build_query_stemmed as defined in this notebook sets fuzziness 1, yet these
# numbers differ from the "fuzziness: 1" run further down; presumably this run used a
# version of the function without fuzziness - confirm.
precision_at_100(build_query_stemmed)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.571608832807571
P@2 = 0.6643533123028391
P@3 = 0.6996845425867508
P@4 = 0.7205047318611988
P@5 = 0.7438485804416404
P@10 = 0.7823343848580442
P@20 = 0.8214511041009463
P@50 = 0.8574132492113564
P@100 = 0.885173501577287


In [46]:
# fuzziness: 2
# NOTE(review): the build_query_stemmed definition in this notebook uses fuzziness 1, so this
# output presumably comes from a temporarily edited version of the function - confirm.
precision_at_100(build_query_stemmed)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.338801261829653
P@2 = 0.45741324921135645
P@3 = 0.5160883280757098
P@4 = 0.5495268138801261
P@5 = 0.579179810725552
P@10 = 0.6473186119873817
P@20 = 0.7009463722397477
P@50 = 0.7709779179810725
P@100 = 0.8189274447949527


In [48]:
# fuzziness: 1 (the value set in the build_query_stemmed definition in this notebook)
precision_at_100(build_query_stemmed)

500 answers processed
1000 answers processed
1500 answers processed


1585 answers procesed
P@1 = 0.38359621451104103
P@2 = 0.49589905362776027
P@3 = 0.5451104100946372
P@4 = 0.5766561514195584
P@5 = 0.6094637223974764
P@10 = 0.6738170347003155
P@20 = 0.726813880126183
P@50 = 0.7861198738170347
P@100 = 0.832807570977918
