In [1]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import xml.etree.ElementTree as ET
import re
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np

In [2]:
def parse_xml(file_path):
    """Parse a file of sibling XML elements by wrapping it in a synthetic root.

    The file is read as text, surrounded with ``<root>`` ... ``</root>`` and
    handed to ElementTree, so a file without a single enclosing element can
    still be parsed.

    Args:
        file_path: Path of the file to read.

    Returns:
        The synthetic ``<root>`` element; the file's own top-level elements
        are its children.
    """
    with open(file_path, "r") as handle:
        wrapped = "<root>{}</root>".format(handle.read())
    return ET.fromstring(wrapped)

In [3]:
def parse_xml_query(file_path):
    """Parse an XML file with ElementTree and return its root element.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        The root element of the parsed tree.
    """
    return ET.parse(file_path).getroot()

In [4]:
# Extract document information
def extract_documents(xml_root):
    """Collect the text fields of every direct ``<doc>`` child of ``xml_root``.

    Args:
        xml_root: Element whose direct children include the ``<doc>`` elements.

    Returns:
        A list with one dict per ``<doc>``, in document order. Each dict has
        the keys ``docno``, ``title``, ``author``, ``bib`` and ``text``; a
        value is the stripped text of the child element of that name, or ``""``
        when the child is missing or has no text.
    """
    field_names = ("docno", "title", "author", "bib", "text")

    def stripped_text(parent, tag):
        # A missing child and a child without text both map to "".
        node = parent.find(tag)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    return [
        {name: stripped_text(doc, name) for name in field_names}
        for doc in xml_root.findall("doc")
    ]

In [5]:
# Extract query information
def extract_query(xml_root):
    """Collect the number and title of every direct ``<top>`` child of ``xml_root``.

    Args:
        xml_root: Element whose direct children include the ``<top>`` elements.

    Returns:
        A list with one dict per ``<top>``, in document order. Each dict has
        the keys ``num`` and ``title``; a value is the stripped text of the
        child element of that name, or ``""`` when the child is missing or has
        no text.
    """
    field_names = ("num", "title")

    def stripped_text(parent, tag):
        # A missing child and a child without text both map to "".
        node = parent.find(tag)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    return [
        {name: stripped_text(top, name) for name in field_names}
        for top in xml_root.findall("top")
    ]

In [6]:
# Parse the document collection; parse_xml wraps the file content in a
# synthetic <root> element before handing it to ElementTree.
documents_root = parse_xml("cran.all.1400.xml")
# Parse the query file directly with ElementTree (no wrapping).
query_root = parse_xml_query("cran.qry.xml")

In [7]:
# One dict per <doc> element, with keys docno/title/author/bib/text.
documents = extract_documents(documents_root)
# One dict per <top> element, with keys num/title.
query = extract_query(query_root)

In [8]:
# English stop-word list from NLTK, held as a set for the membership test in
# remove_stopwords.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded
# beforehand - confirm for a fresh environment.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [9]:
#PreProcessing 

def remove_punctuation(sentence):
    """Delete every character that is neither a word character nor whitespace.

    Characters are removed, not replaced, so the two halves of a hyphenated
    word are fused together.

    Args:
        sentence: Text to clean.

    Returns:
        ``sentence`` without the removed characters.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', sentence)

def get_tokenized_list(doc_text):
    """Split ``doc_text`` into tokens using ``nltk.word_tokenize``.

    Args:
        doc_text: Text to tokenize.

    Returns:
        The tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(doc_text)
# Stemming
def word_stemmer(token_list):
    """Apply NLTK's Porter stemmer to each token.

    Args:
        token_list: Iterable of tokens.

    Returns:
        A list with the stem of each token, in input order.
    """
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in token_list]

#removing stop words
def remove_stopwords(doc_text):
    """Drop the tokens that appear in the module-level ``stop_words`` set.

    The check is an exact membership test; tokens are not lower-cased here.

    Args:
        doc_text: Iterable of tokens.

    Returns:
        A list of the tokens that are not stop words, in input order.
    """
    return [token for token in doc_text if token not in stop_words]


In [10]:
#preprocessing our doc


def preprocess(element):
    """Turn a raw text into a space-separated string of stemmed tokens.

    Steps, in order: remove punctuation, tokenize, remove stop words, stem.

    Args:
        element: Raw text of a document or query.

    Returns:
        A single string in which every stem is preceded by one space, so a
        non-empty result starts with a space. The result is ``""`` when no
        token survives.
    """
    without_punctuation = remove_punctuation(element)
    kept_tokens = remove_stopwords(get_tokenized_list(without_punctuation))
    return "".join(" " + stem for stem in word_stemmer(kept_tokens))

    

In [None]:
# Placeholder list; this (singular) name is not referenced by any other
# visible cell.
cleaned_document = []


In [None]:
# Initial empty value; the ranking cell rebinds cleaned_query for every query.
cleaned_query = []

In [33]:
# Rank the documents against every query with TF-IDF cosine similarity and
# write the ranking to "newoutput.txt".
vectorizer = TfidfVectorizer()

# preprocess() already returns one space-separated string per text, so it is
# used as-is. Iterating over that string yields single characters; re-joining
# them with spaces leaves no token of two or more characters, which is what
# TfidfVectorizer's default token pattern requires.
# Every parsed document is indexed, so doc_ids lines up one-to-one with the
# rows of document_vectors.
cleaned_documents = [preprocess(doc['text']) for doc in documents]
doc_ids = [doc['docno'] for doc in documents]

document_vectors = vectorizer.fit_transform(cleaned_documents)

# Open the output once, in write mode, so re-running the cell replaces the
# previous ranking instead of appending duplicate lines to it.
with open("newoutput.txt", "w") as f:
    for query_info in query:
        query_id = query_info['num']  # Query ID
        query_text = query_info['title']  # Raw query text

        # transform() takes an iterable of texts, hence the one-element list.
        cleaned_query = [preprocess(query_text)]
        print(cleaned_query)

        query_vector = vectorizer.transform(cleaned_query)

        # Row 0 holds the similarities of this query to every indexed document.
        similarities = cosine_similarity(query_vector, document_vectors)[0]

        # Pair each document ID with its score and order by descending score.
        document_scores = list(zip(doc_ids, similarities))
        document_scores.sort(key=lambda x: x[1], reverse=True)

        # Write the top 100 documents for the current query.
        for rank, (doc_id, similarity_score) in enumerate(document_scores[:100], start=1):
            f.write(f"{query_id} 0 {doc_id} {rank} {round(similarity_score,2)}\n")

[' similar law must obey construct aeroelast model heat high speed aircraft']
[' structur aeroelast problem associ flight high speed aircraft']
[' problem heat conduct composit slab solv far']
[' criterion develop show empir valid flow solut chemic react ga mixtur base simplifi assumpt instantan local chemic equilibrium']
[' chemic kinet system applic hyperson aerodynam problem']
[' theoret experiment guid turbul couett flow behaviour']
[' possibl relat avail pressur distribut ogiv forebodi zero angl attack lower surfac pressur equival ogiv forebodi angl attack']
[' method dash exact approxim dash present avail predict bodi pressur angl attack']
[' paper intern slip flow heat transfer studi']
[' realga transport properti air avail wide rang enthalpi densiti']
[' possibl find analyt similar solut strong blast wave problem newtonian approxim']
[' aerodynam perform channel flow ground effect machin calcul']
[' basic mechan transon aileron buzz']
[' paper shocksound wave interact']
[' mate

[' flutter characterist expos skin panel x15 vertic stabil subject aerodynam heat']
[' agreement found theoret predict instabl time experiment measur collaps time compress column creep']
[' theoret studi creep buckl']
[' experiment studi creep buckl']
[' possibl correl result creep buckl wide differ structur within framework singl theori']
[' experiment result creep buckl column']
[' result creep buckl round tube extern pressur']
[' analyt studi conduct timetofailur mechan associ creep collaps long circular cylindr shell exhibit primari secondari creep well elast deform variou distribut forc system']
[' effect initi stress frequenc vibrat circular cylindr shell investig']
[' effect chang initi pressur due deform frequenc vibrat circular cylindr shell investig']
[' discontinu stress junction pressur structur']
[' analyt solut avail stress edgeload shell revolut']
[' dome contour minim discontinu stress use closur cylindr pressur vessel']
[' gener solut stress pressur shell revolut avail

In [32]:
# cleaned_query is rebuilt for each query inside the ranking loop, so after the
# loop it holds only the last query's text.
len(cleaned_query)

1