<a href="https://colab.research.google.com/github/Mhna1234/projects-and-home-works/blob/main/IR_RM3_DMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
try:
    import xml.etree.cElementTree as ET
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ET

def output_xml(modified_queries, out_path):
    '''
    Writes your modified query models to an XML indri query file.

    The file has a single ``<parameters>`` root with one ``<query>`` child
    per query; each child holds a ``<number>`` (the query id) and a
    ``<text>`` (the ``#weight(...)`` expression built from the model).

    Parameters
    ----------
    modified_queries : dict[str, dict[str,float]]
        A dictionary that maps a query id to the modified query model.
        A model is a dictionary that maps terms to their weight.
    out_path : str
        Path of the output.
    '''
    params = ET.Element('parameters')
    # ``items()`` instead of the Python 2-only ``iteritems()``.
    for qid, q_m in modified_queries.items():
        q = ET.SubElement(params, 'query')
        number = ET.SubElement(q, 'number')
        # ElementTree can only serialise string text, so tolerate int ids.
        number.text = str(qid)
        text = ET.SubElement(q, 'text')
        text.text = _get_indri_query(q_m)
    tree = ET.ElementTree(params)
    tree.write(out_path, encoding="utf-8")


def parse_collection_stats(input_file):
    '''
    Parses the collection stats file.

    Every line of the form ``term,count`` contributes one entry; lines
    without a comma are skipped. The line whose term is ``TOTAL`` carries
    the corpus size and is removed from the returned mapping.

    Parameters
    ----------
    input_file : str
        The path to the corpus file.

    Returns
    -------
    tfs : dict[str,int]
        Mapping containing the collection frequency of each term.
    total_terms : int
        The number of total terms in the corpus.

    Raises
    ------
    KeyError
        If the file has no ``TOTAL`` line.
    ValueError
        If a count is not an integer.
    '''
    tfs = {}
    with open(input_file) as corpus_file:
        # Stream the file instead of materialising every line first.
        for line in corpus_file:
            # Split on the LAST comma so a term that itself contains a
            # comma does not break the two-way unpacking.
            term, sep, val = line.rpartition(',')
            if not sep:
                continue
            tfs[term] = int(val)
    total_terms = tfs.pop("TOTAL")
    return tfs, total_terms


def parse_documents_or_queries(file_path):
    '''
    Reads a TSV file of per-document (or per-query) term counts.

    Each usable line holds an id in the first tab-separated column and a
    serialized term-count dictionary in the second; lines with fewer than
    two columns are ignored.

    Parameters
    ----------
    file_path : str
        Path to the TSV file.

    Returns
    -------
    docs_tfs : dict[str, dict[str,int]]
        Maps a document id to a dictionary that maps every term to its term frequency in the document.
    docs_len : dict[str,int]
        A dictionary that maps a document id to the length of that document.
    '''
    term_counts_by_id = {}
    length_by_id = {}
    with open(file_path) as handle:
        for raw_line in handle:
            fields = raw_line.split('\t')
            if len(fields) >= 2:
                identifier = fields[0]
                counts = _process_text_dict(fields[1])
                term_counts_by_id[identifier] = counts
                # Document length is the sum of its term frequencies.
                length_by_id[identifier] = sum(counts.values())
    return term_counts_by_id, length_by_id

def _process_text_dict(dict_str):
    res = {}
    pairs = dict_str.split(',')
    for pair in pairs:
        if ':' in pair:
            key, val = pair.split(':')
            res[key] = int(val)
    return res

def _get_indri_query(q_m):
    val_strings = ['{0} "{1}"'.format(weight, term) for term, weight in q_m.iteritems()]
    return "#weight(" + " ".join(val_strings) + ")"

# Step 1: Load feedback documents and collection stats
# Other feedback files that can be loaded the same way:
# "710.tsv", "709.tsv", "708.tsv", "707.tsv", "706.tsv", "705.tsv", "704.tsv", "702.tsv", "701.tsv"
feedback_doc_tfs, docs_len = parse_documents_or_queries("/home/student/feedback_docs/711.tsv")
# The parser returns doc_id -> {term: tf}, while the feedback functions below
# iterate query_id -> doc_id -> {term: tf}; key the documents by their query id.
feedback_docs = {"711": feedback_doc_tfs}
tfs, total_terms = parse_collection_stats("/home/student/gov2_collection_stats.csv")

# Step 2: Calculate RM3 relevance model
def calculate_rm3(feedback_docs, tfs, total_terms, original_query, top_k=25, original_query_weight=0.5):
    '''
    Builds RM3 expanded query models from feedback documents.

    For every query the relevance model is the average of the maximum
    likelihood document models (uniform document weights), truncated to
    the ``top_k`` highest scoring terms and renormalised to sum to 1. It
    is then interpolated with the (sum-normalised) original query model.

    Parameters
    ----------
    feedback_docs : dict[str, dict[str, dict[str,int]]]
        Maps a query id to its feedback documents; every document maps
        terms to their term frequency.
    tfs : dict[str,int]
        Collection frequencies. Unused; kept so the signature matches
        calculate_dmm.
    total_terms : int
        Collection size. Unused; kept so the signature matches
        calculate_dmm.
    original_query : dict[str,float]
        The original query model (term -> weight).
    top_k : int
        Number of expansion terms to keep.
    original_query_weight : float
        Interpolation weight of the original query, in [0, 1].

    Returns
    -------
    modified_queries : dict[str, dict[str,float]]
        Maps every query id to its expanded query model.
    '''
    # Without normalising the original query, the interpolation would not
    # mix two probability distributions.
    query_mass = float(sum(original_query.values()))

    modified_queries = {}
    for qid, docs in feedback_docs.items():
        # Accumulate p(w|d) over the feedback documents.
        term_scores = {}
        for doc_terms in docs.values():
            # Computed locally rather than read from a module-level
            # ``docs_len`` so the function depends only on its arguments.
            doc_len = sum(doc_terms.values())
            if doc_len <= 0:
                continue  # an empty document carries no evidence
            for term, count in doc_terms.items():
                term_scores[term] = term_scores.get(term, 0.0) + count / doc_len

        # Keep the top_k terms and renormalise them to a distribution.
        # Dividing by the collection size (as was done before) shrinks the
        # expansion weights to ~0 and effectively disables the feedback.
        top_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        top_mass = sum(score for _, score in top_terms)

        # Combine with original query
        combined_query = {
            term: original_query_weight * (weight / query_mass if query_mass else weight)
            for term, weight in original_query.items()
        }
        if top_mass > 0:
            for term, score in top_terms:
                combined_query[term] = (
                    combined_query.get(term, 0)
                    + (1 - original_query_weight) * score / top_mass
                )

        modified_queries[qid] = combined_query

    return modified_queries

# Example usage: expand a query with RM3 and write the result as an indri
# query file.
# ``original_query`` maps each query term to its weight; the two entries
# below are placeholders.
original_query = {'term1': 1.0, 'term2': 1.0}  # Replace with actual query terms
# Runs with the default top_k and original_query_weight.
rm3_queries = calculate_rm3(feedback_docs, tfs, total_terms, original_query)
# Written to "rm3.xml", a path relative to the current working directory.
output_xml(rm3_queries, "rm3.xml")

def calculate_dmm(feedback_docs, tfs, total_terms, original_query, lambda_param=0.1, delta=0.1, top_k=25, original_query_weight=0.5):
    '''
    Builds expanded query models with the divergence minimization model.

    The feedback model is the closed-form solution of the divergence
    minimization objective::

        p(w|F) ~ exp( 1/(1-lambda) * mean_d log p(w|d)
                      - lambda/(1-lambda) * log p(w|C) )

    where ``p(w|d) = (tf(w,d) + delta) / (|d| + delta * |V|)`` is the
    additively smoothed document model and ``p(w|C)`` is the collection
    model. The ``top_k`` best terms are renormalised to sum to 1 and
    interpolated with the (sum-normalised) original query model.

    Parameters
    ----------
    feedback_docs : dict[str, dict[str, dict[str,int]]]
        Maps a query id to its feedback documents; every document maps
        terms to their term frequency.
    tfs : dict[str,int]
        Collection frequency of every term; its size is used as |V|.
    total_terms : int
        Total number of terms in the collection.
    original_query : dict[str,float]
        The original query model (term -> weight).
    lambda_param : float
        Weight of the collection model penalty, in [0, 1).
    delta : float
        Additive smoothing constant of the document models, > 0.
    top_k : int
        Number of expansion terms to keep.
    original_query_weight : float
        Interpolation weight of the original query, in [0, 1].

    Returns
    -------
    modified_queries : dict[str, dict[str,float]]
        Maps every query id to its expanded query model.

    Raises
    ------
    ValueError
        If ``lambda_param`` is outside [0, 1), or ``delta`` or
        ``total_terms`` is not positive.
    '''
    import math

    if not 0 <= lambda_param < 1:
        raise ValueError("lambda_param must be in [0, 1)")
    if delta <= 0:
        raise ValueError("delta must be positive")
    if total_terms <= 0:
        raise ValueError("total_terms must be positive")

    vocab_size = len(tfs)
    query_mass = float(sum(original_query.values()))

    modified_queries = {}
    for qid, docs in feedback_docs.items():
        # (term counts, smoothed denominator) for every non-empty document.
        # Lengths are computed locally rather than read from a module-level
        # ``docs_len`` so the function depends only on its arguments.
        doc_models = []
        for doc_terms in docs.values():
            doc_len = sum(doc_terms.values())
            if doc_len > 0:
                doc_models.append((doc_terms, doc_len + delta * vocab_size))

        # Work in log space: the exponent of the closed-form solution.
        log_scores = {}
        for doc_terms, _ in doc_models:
            for term in doc_terms:
                if term in log_scores:
                    continue
                mean_log_doc = sum(
                    math.log((other_terms.get(term, 0) + delta) / denom)
                    for other_terms, denom in doc_models
                ) / len(doc_models)
                # A term seen in a feedback document occurs at least once
                # in the collection, so clamp a missing frequency to 1.
                log_coll = math.log(max(tfs.get(term, 0), 1) / total_terms)
                log_scores[term] = (mean_log_doc - lambda_param * log_coll) / (1 - lambda_param)

        # Select top_k terms and renormalise them to a distribution.
        top_terms = sorted(log_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        expansion = {}
        if top_terms:
            # Subtract the maximum before exponentiating to avoid underflow.
            max_log = top_terms[0][1]
            expansion = {term: math.exp(score - max_log) for term, score in top_terms}
            mass = sum(expansion.values())
            expansion = {term: score / mass for term, score in expansion.items()}

        # Combine with original query
        combined_query = {
            term: original_query_weight * (weight / query_mass if query_mass else weight)
            for term, weight in original_query.items()
        }
        for term, score in expansion.items():
            combined_query[term] = combined_query.get(term, 0) + (1 - original_query_weight) * score

        modified_queries[qid] = combined_query

    return modified_queries

# Example usage: expand the same query with calculate_dmm (default
# lambda_param, delta, top_k and original_query_weight) and write the
# result as an indri query file.
dmm_queries = calculate_dmm(feedback_docs, tfs, total_terms, original_query)
# Written to "dmm.xml", a path relative to the current working directory.
output_xml(dmm_queries, "dmm.xml")





PermissionError: [Errno 13] Permission denied: '/Users/USER/Desktop/feedback_docs'