In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import xml.etree.ElementTree as ET
import re
import nltk
import numpy as np
import operator

In [2]:
# Parse the document XML file, which needs a dummy root element added, and return that root.
def parse_xml(file_path):
    """Parse an XML file after wrapping its content in a dummy root element.

    The whole file is read as text and wrapped in ``<root>...</root>``
    before parsing.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The ``<root>`` Element whose children are the file's top-level elements.
    """
    with open(file_path, "r") as handle:
        fragment = handle.read()

    # The wrapper element is what makes the concatenated content parseable
    # as a single document.
    return ET.fromstring("<root>" + fragment + "</root>")

In [3]:
# Parse the query file; it already has a root element, so no dummy root is needed.
def parse_xml_query(file_path):
    """Parse a well-formed XML file and return its root element.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        The root Element of the parsed tree.
    """
    return ET.parse(file_path).getroot()

In [4]:
# Extract document information
def extract_documents(xml_root):
    """Collect the fields of every ``<doc>`` child of ``xml_root``.

    Args:
        xml_root: Element whose direct ``<doc>`` children are the documents.

    Returns:
        A list with one dict per ``<doc>``, keyed ``docno``, ``title``,
        ``author``, ``bib`` and ``text`` (in that order). Each value is the
        stripped text of the matching child element, or ``""`` when the
        child is missing or has no text.
    """
    field_names = ("docno", "title", "author", "bib", "text")
    return [
        # findtext() gives None for a missing child and "" for an empty one;
        # both collapse to "" here.
        {name: (doc.findtext(name) or "").strip() for name in field_names}
        for doc in xml_root.findall("doc")
    ]

In [5]:
# Extract query information
def extract_query(xml_root):
    """Collect the fields of every ``<top>`` child of ``xml_root``.

    Args:
        xml_root: Element whose direct ``<top>`` children are the queries.

    Returns:
        A list with one dict per ``<top>``, keyed ``num`` and ``title``.
        Each value is the stripped text of the matching child element, or
        ``""`` when the child is missing or has no text.
    """
    field_names = ("num", "title")
    return [
        # findtext() gives None for a missing child and "" for an empty one;
        # both collapse to "" here.
        {name: (top.findtext(name) or "").strip() for name in field_names}
        for top in xml_root.findall("top")
    ]

In [6]:
# Load the document collection (parsed with the dummy-root wrapper) and the
# query file (parsed as-is). Both paths are relative to the working directory.
documents_root = parse_xml("cran.all.1400.xml")
query_root = parse_xml_query("cran.qry.xml")

In [7]:
# Flatten the parsed XML into lists of dicts: one per <doc> element and one
# per <top> element.
documents = extract_documents(documents_root)
queries = extract_query(query_root)

In [8]:
from nltk.corpus import stopwords
# English stop-word set consulted by remove_stopwords().
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed already;
# no nltk.download('stopwords') call is visible in this notebook - confirm.
stop_words = set(stopwords.words('english'))

In [28]:
#PreProcessing 


def remove_punctuation(sentence):
    """Delete every character that is neither a word character nor whitespace.

    Characters are removed, not replaced by a space, so ``"a-b"`` becomes
    ``"ab"``. Underscores count as word characters and are kept.

    Args:
        sentence: Text to clean.

    Returns:
        The text with all punctuation characters removed.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', sentence)

def get_tokenized_list(doc_text):
    """Split text into tokens with ``nltk.word_tokenize``.

    Args:
        doc_text: Text to tokenize.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(doc_text)
# Stemming
def word_stemmer(token_list):
    """Stem every token with NLTK's Porter stemmer.

    Args:
        token_list: Iterable of token strings.

    Returns:
        A new list holding the stem of each token, in input order.
    """
    stem = nltk.stem.PorterStemmer().stem
    return [stem(token) for token in token_list]

#removing stop words
def remove_stopwords(doc_text):
    """Drop stop words from a token list, ignoring letter case.

    Args:
        doc_text: Iterable of token strings (despite the name, not raw text).

    Returns:
        A new list with the tokens whose lowercase form is not in the
        module-level ``stop_words`` set; kept tokens retain their original
        casing and order.
    """
    # Compare the lowercased token: the stop-word list is lowercase, so a
    # case-sensitive test would let "The" or "Of" through.
    return [token for token in doc_text if token.lower() not in stop_words]

from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus, which WordNetLemmatizer in lemmatize_words() relies on.
nltk.download('wordnet')
def lemmatize_words(words):
    """Lemmatize every word with NLTK's WordNet lemmatizer.

    Args:
        words: Iterable of word strings.

    Returns:
        A new list holding the lemma of each word, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, words))


[nltk_data] Downloading package wordnet to C:\Users\Makarand
[nltk_data]     Thorat\AppData\Roaming\nltk_data...


In [10]:
#preprocessing our doc


def preprocess(element,flag):
    """Run the text pipeline: strip punctuation, tokenize, drop stop words, stem.

    Args:
        element: Raw text to process.
        flag: When equal to 1, the stems are returned as one string;
            any other value returns them as a list.

    Returns:
        With ``flag == 1``: a string in which every stem is preceded by a
        single space (so a non-empty result starts with a space, and an
        empty pipeline result is ``''``). Otherwise: the list of stems.
    """
    without_punctuation = remove_punctuation(element)
    kept_tokens = remove_stopwords(get_tokenized_list(without_punctuation))
    cleaned_corpus = list(word_stemmer(kept_tokens))

    if flag == 1:
        # Each stem carries its own leading space, matching the historical
        # output format.
        return "".join(" " + stem for stem in cleaned_corpus)
    return cleaned_corpus


    

In [27]:
# Scratch check of the preprocessing helpers on the first document's text:
# strip punctuation, tokenize, print the tokens, then stem and print the stems.
# Stop words are not removed here, unlike in preprocess().
x=documents[0]['text']

x=remove_punctuation(x)
x=get_tokenized_list(x)
print(x)
x=word_stemmer(x)

print(x)
# Last expression of the cell: the number of stemmed tokens.
len(x)

['experimental', 'investigation', 'of', 'the', 'aerodynamics', 'of', 'a', 'wing', 'in', 'a', 'slipstream', 'an', 'experimental', 'study', 'of', 'a', 'wing', 'in', 'a', 'propeller', 'slipstream', 'was', 'made', 'in', 'order', 'to', 'determine', 'the', 'spanwise', 'distribution', 'of', 'the', 'lift', 'increase', 'due', 'to', 'slipstream', 'at', 'different', 'angles', 'of', 'attack', 'of', 'the', 'wing', 'and', 'at', 'different', 'free', 'stream', 'to', 'slipstream', 'velocity', 'ratios', 'the', 'results', 'were', 'intended', 'in', 'part', 'as', 'an', 'evaluation', 'basis', 'for', 'different', 'theoretical', 'treatments', 'of', 'this', 'problem', 'the', 'comparative', 'span', 'loading', 'curves', 'together', 'with', 'supporting', 'evidence', 'showed', 'that', 'a', 'substantial', 'part', 'of', 'the', 'lift', 'increment', 'produced', 'by', 'the', 'slipstream', 'was', 'due', 'to', 'a', 'destalling', 'or', 'boundarylayercontrol', 'effect', 'the', 'integrated', 'remaining', 'lift', 'increment'

137

In [11]:
# Identifier lists, index-aligned with `documents` and `queries`.
# This must run while `queries` still holds the extracted dicts, i.e. before
# the preprocessing cell rebinds it to plain strings.
doc_ids = [record['docno'] for record in documents]
query_ids = [topic['num'] for topic in queries]

In [12]:
# Turn every document body and query title into one string of stems
# (flag=1), ready for the TF-IDF vectorizer.
docs = [preprocess(doc['text'], 1) for doc in documents]
# Read the titles from a fresh extract_query() call instead of from `queries`
# itself: this cell rebinds `queries` from dicts to strings, so reading the
# old name would raise TypeError on `query['title']` when the cell is re-run.
queries = [preprocess(query['title'], 1) for query in extract_query(query_root)]

In [13]:
# Fit the TF-IDF model on the preprocessed documents only; queries are
# projected into this same space later with vectorizer.transform().
# doc_vectors has one row per entry of `docs`, in the same order as doc_ids.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(docs)

In [14]:
# Open the output file
# Rank every document against each query and write the 100 best per query,
# one result per line: "<query id> Q0 <doc id> <rank> <score> 1".
with open('output_vs.txt', 'w') as f:
    for i, query in enumerate(queries):
        # Project the query into the vector space learned from the documents.
        query_vector = vectorizer.transform([query])

        # Similarity of this query to every document (linear kernel on the
        # TF-IDF rows), flattened to one score per document.
        cosine_similarities = linear_kernel(query_vector, doc_vectors).flatten()

        # Highest score first. sorted() is stable, so documents with equal
        # scores keep their collection order.
        sorted_docs = sorted(
            zip(doc_ids, cosine_similarities),
            key=operator.itemgetter(1),
            reverse=True,
        )

        # Keep only the top 100 documents for this query.
        top_docs = sorted_docs[:100]

        for rank, (doc_id, score) in enumerate(top_docs, start=1):
            # Write six decimals rather than round(score, 2): two decimals
            # collapse many distinct scores into ties, and evaluation tools
            # that order by the score column (e.g. trec_eval) would then
            # rank documents differently from the order computed here.
            f.write(f"{query_ids[i]} Q0 {doc_id} {rank} {score:.6f} 1\n")