In [1]:
from bs4 import BeautifulSoup
import string
from collections import Counter
from math import log10, sqrt
import json
import re

In [2]:
file = open('../trec_documents.xml','rt',encoding='utf8')
xml_doc = file.read()
soup = BeautifulSoup(xml_doc, 'lxml')


In [49]:
def get_doc_dict(xml_soup):
    """
    returns dict of dic number as key with  text from <p>, <text> and <headline> tag as text
    """
    doc_dict = {}
    for doc in soup.find_all('doc'):
        text = ""
        doc_no = str(doc.find('docno').string)
        for para in doc.find_all('p'):
            text += str(para.string).strip("\n")
        for headline in doc.find_all('headline'):
            text += str(headline.string)
        for txt in doc.find_all('text'):
            text += str(txt.string)
        text = [t for t in text.split("\n") if t != '']
        doc_dict[doc_no] = {"text":text}
    return doc_dict
doc_dict = get_doc_dict(soup)
# print(doc_dict)

In [5]:
def get_doc_tf(doc_dict):
    """
    updates the document dict with tf values
    """
    idf_dict = {}
    for doc_no, doc_details in doc_dict.copy().items():
        text = doc_details.get("text")
        text = " ".join(text).lower().translate(str.maketrans('', '', string.punctuation))
        word_list = text.split(" ")
        tf = Counter(word_list)
        tf.pop('', None)
        tf_max = tf.get(tf.most_common(1)[0][0])
        for k,v in tf.copy().items():
            tf[k] = tf[k] / tf_max
            idf_dict[k] = idf_dict.get(k,0) + 1
        # tf = {k: v / tf_max for k, v in tf.copy().items()}
        doc_details["tf"] = tf
        doc_dict[doc_no] = doc_details
    idf_N = len(doc_dict)
    idf_dict = {k: log10(idf_N/v) for k, v in idf_dict.copy().items()}
    return idf_dict

idf_dict = get_doc_tf(doc_dict)
print(doc_dict["FT911-5"])




{'text': ['FT  14 MAY 91 / World News in Brief: Newspaper pays up', 'A Malaysian English-language newspaper agreed to pay former Singapore prime', 'minister Lee Kuan Yew Dollars 100,000 over allegations of corruption.'], 'tf': Counter({'newspaper': 1.0, 'ft': 0.5, '14': 0.5, 'may': 0.5, '91': 0.5, 'world': 0.5, 'news': 0.5, 'in': 0.5, 'brief': 0.5, 'pays': 0.5, 'up': 0.5, 'a': 0.5, 'malaysian': 0.5, 'englishlanguage': 0.5, 'agreed': 0.5, 'to': 0.5, 'pay': 0.5, 'former': 0.5, 'singapore': 0.5, 'prime': 0.5, 'minister': 0.5, 'lee': 0.5, 'kuan': 0.5, 'yew': 0.5, 'dollars': 0.5, '100000': 0.5, 'over': 0.5, 'allegations': 0.5, 'of': 0.5, 'corruption': 0.5})}


In [20]:
def get_q_tf(word_list):
    tf = Counter(word_list)
    tf_max = tf.most_common(1)[0][1]
    for k,v in tf.copy().items():
        tf[k] = tf[k] / tf_max
    # tf["q_len"] = len(word_list)
    return tf
# get_q_tf(query)

In [21]:
# SAve to json file
with open('./tf.json', 'w+') as fp:
    json.dump(doc_dict, fp)
fp.close()
with open('./idf.json', 'w+') as fp:
    json.dump(idf_dict, fp)
fp.close()

In [22]:
def get_q_tfidf(q_tf, idf_dict):
    tfidf = []
    for word, tf in q_tf.items():
        tfidf.append(tf * idf_dict.get(word, 0))
    return tfidf


In [23]:
def get_cosine(v1, v2, den_v2):
    num=0
    den_v1 = 0
    for val1, val2 in zip(v1, v2):
        num += val1*val2
        den_v1 += val1*val1
    den = sqrt(den_v1) * sqrt(den_v2)
    return num/den

In [38]:
def get_doc_dict_q(tf_dict, idf_dict, q_tf, q_tfidf):
    doc_q_dict = {}
    for docid, val in tf_dict.items():
        tfidf = []
        for word in q_tf.keys():
            tf = val.get("tf", {}).get(word, 0) 
            idf = idf_dict.get(word, 0)
            tfidf.append(tf*idf)
        den = sum([val*val*idf_dict.get(word,0)*idf_dict.get(word,0) for word, val in val.get("tf").items()])
        cosine = get_cosine(tfidf, q_tfidf, den) if sum(tfidf) > 0 and sum(q_tfidf) > 0 else 0
        doc_q_dict[docid] = cosine
    return doc_q_dict

In [39]:
pattern = {}
with open("../patterns.txt") as f:
    for line in f:
        line = line.split()
        # print(pattern.get(line[0],None))
        if pattern.get(line[0],None):
            pattern.get(line[0]).append(line[1])
        else:
            pattern[line[0]] = [line[1]]

In [40]:
# checking with the query
# q = ['What is the tallest building in Japan?']
def top_50_baseline(query, doc_dict, idf_dict):
    # for query in q:
    query = query.lower().translate(str.maketrans('', '', string.punctuation)).split(" ")
    q_tf = get_q_tf(query)
    q_tfidf = get_q_tfidf(q_tf, idf_dict)
    # print(q_tfidf)
    # get doc dictionary of query words
    doc_q_dict = get_doc_dict_q(doc_dict, idf_dict, q_tf, q_tfidf)
    top_50 = sorted(doc_q_dict, key=doc_q_dict.get, reverse=True)[:50]
    return top_50

# top_50_baseline(q, doc_dict, idf_dict)

In [41]:
def get_relevant_from_regex(regex, doc_dict):
        regex = re.compile(regex)
        doc = set()
        for docID, val in doc_dict.items():
            matches = regex.findall(" ".join(val.get("text")))
            if len(matches) > 0:
                doc.add(docID)
        return doc


In [42]:
def get_relevant_doc(pattern, doc_dict):            
    rel_doc_q = {}
    for query, regex_list in pattern.items():
        for regex in regex_list:
                rel_doc_q[query] = get_relevant_from_regex(regex, doc_dict)
    return rel_doc_q


In [43]:
def check_relevance(query_detail, pred, doc_dict, pattern):
    doc = set()
    for docid in pred:
        text = " ".join(doc_dict.get(docid).get("text"))
        p = pattern.get(str(query_detail.get("index")))
        for regex in p:
            regex = re.compile(regex)
            matches = regex.findall(text)
            if len(matches) > 0:
                doc.add(docid)
    print(len(doc))
    return len(doc)/len(pred)

In [44]:
def calculate_precision(pred, actual):
    counter = 0
    for value in pred:
        if value in actual:
            counter += 1
            print(counter)
    return counter/len(pred)

In [45]:
def get_query_list():
    file = open('../test_questions.txt','rt',encoding='utf8')
    xml_doc = file.read()
    soup = BeautifulSoup(xml_doc, 'lxml')
    qlist = []
    index = 0
    for ques in soup.findAll('top'):
        data = ques.find('desc').text
        data = data.replace("Description:\n", "").rstrip("\n")
        index += 1
        qlist.append({"index": index,
                    "query": data})
    return qlist

In [3]:
query = get_query_list()
# rel_doc = get_relevant_doc(pattern, doc_dict) 
mean_precision = []
for q_details in query:
    print(q_details.get("query",""))
    pred = top_50_baseline(q_details.get("query",""), doc_dict, idf_dict)
    print(pred)
    precision = check_relevance(q_details, pred, doc_dict, pattern)
    # print(q_details)
    # actual = rel_doc.get(str(q_details.get("index")))
    # print(pred, actual)
    # precision = calculate_precision(pred, actual) if pred and actual else 0
    # print(q_details.get("query"), precision)
    mean_precision.append(precision)
    break
print("Mean precision is ", str(sum(mean_precision)/len(mean_precision)))


NameError: name 'get_query_list' is not defined

In [50]:
for docid, detail in doc_dict.items():
    text = detail.get("text", "")
    text = "".join(text).lower().translate(str.maketrans('', '', string.punctuation))
    if len(text) > 512:
        print(docid, len(text))

90-0046  1075
 LA100890-0049  713
 LA100890-0050  1169
 LA100890-0078  718
 LA100890-0114  10672
 LA100890-0123  1200
 LA100890-0125  2032
 LA100890-0134  663
 LA100890-0135  1341
 LA100989-0033  795
 LA100989-0035  1036
 LA100989-0095  3090
 LA100990-0038  3745
 LA100990-0064  869
 LA100990-0108  1053
 LA100990-0135  525
 LA100990-0141  3489
 LA100990-0176  2158
 LA101089-0025  557
 LA101089-0029  828
 LA101089-0072  929
 LA101089-0120  3831
 LA101089-0138  788
 LA101090-0014  686
 LA101090-0088  3608
 LA101090-0143  782
 LA101090-0147  7753
 LA101090-0157  767
 LA101189-0012  658
 LA101189-0024  1258
 LA101189-0030  1196
 LA101189-0045  856
 LA101189-0090  5448
 LA101189-0109  826
 LA101189-0149  974
 LA101190-0045  3311
 LA101190-0088  561
 LA101190-0113  1092
 LA101190-0197  743
 LA101190-0198  585
 LA101190-0206  2191
 LA101190-0223  674
 LA101190-0234  752
 LA101190-0252  1149
 LA101190-0253  920
 LA101190-0255  1780
 LA101289-0003  862
 LA101289-0079  693
 LA101289-0111  7268
 L

In [8]:
l = [1,2,3]
l[0]

1

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rbl/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True