In [1]:
# necessary imports for the notebook to work
import os
import re
import math
from collections import defaultdict
import numpy as np
import random

In [2]:
def split_documents(input_file_path, output_dir):
    """
    Splits the input file into individual documents and saves each document in a separate file.

    Args:
        input_file_path (str): Path to the input file.
        output_dir (str): Directory to save the split documents.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(input_file_path, 'r') as input_file:
        document_content = input_file.read()

        # Split documents based on ".I" followed by a number
        documents = re.split(r'\.I\s+\d+', document_content)[1:]

        for i, document in enumerate(documents, start=1):
            # Extract text after ".W"
            text = re.search(r'\.W\n(.*)', document, re.DOTALL)
            if text:
                # Strip leading/trailing whitespace and newline characters
                document_text = text.group(1).strip()
            else:
                document_text = ""

            # Write the extracted text to a separate file
            with open(os.path.join(output_dir, f"document_{i}.txt"), 'w') as output_file:
                output_file.write(document_text)

# Example usage
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'
split_documents(input_file_path, output_directory)


In [3]:
def calculate_document_frequency(input_dir):
    """
    Calculates the document frequency of terms in the collection.

    Args:
        input_dir (str): Directory containing the documents.

    Returns:
        dict: A dictionary containing document frequency of terms.
    """
    document_frequency = defaultdict(int)
    term_pattern = re.compile(r'\b\w+\b')

    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            with open(os.path.join(input_dir, filename), 'r') as file:
                document_content = file.read().lower()

                terms_in_document = set(re.findall(term_pattern, document_content))

                for term in terms_in_document:
                    document_frequency[term] += 1

    return document_frequency

In [4]:
def calculate_tf(document_directory):
    """
    Calculates the term frequency for each term in each document.

    Args:
        document_directory (str): Directory containing the documents.

    Returns:
        dict: A nested dictionary containing term frequency for each term in each document.
    """
    term_frequency = defaultdict(dict)

    for filename in os.listdir(document_directory):
        if filename.endswith(".txt"):
            # Extract document number from filename
            doc_number = int(re.search(r'\d+', filename).group())

            with open(os.path.join(document_directory, filename), 'r') as file:
                document_content = file.read().lower()

                term_pattern = re.compile(r'\b\w+\b')
                terms_in_document = re.findall(term_pattern, document_content)

                for term in terms_in_document:
                    # Get the term frequency for each term in the document
                    if doc_number in term_frequency[term]:
                        term_frequency[term][doc_number] += 1
                    else:
                        term_frequency[term][doc_number] = 1

    # Sort the document numbers in ascending order
    for term in term_frequency:
        term_frequency[term] = dict(sorted(term_frequency[term].items()))

    return term_frequency


In [5]:
def calculate_idf_t(N, document_frequency):
    """
    Calculates the inverse document frequency for each term.

    Args:
        N (int): Total number of documents in the collection.
        document_frequency (dict): Dictionary containing document frequency of terms.

    Returns:
        dict: A dictionary containing inverse document frequency of terms.
    """
    idf_t = {}

    for term, df_t in document_frequency.items():
        if df_t == 0:
            idf_t[term] = 0
        else:
            idf_t[term] = math.log(N / df_t)

    return idf_t

In [7]:
def calculate_tf_idf(tf, idf):
    """
    Calculates the TF-IDF values for each term in each document.

    Args:
        tf (dict): Nested dictionary containing term frequency for each term in each document.
        idf (dict): Dictionary containing inverse document frequency of terms.

    Returns:
        dict: A nested dictionary containing TF-IDF values for each term in each document.
    """
    tf_idf_values = defaultdict(dict)

    # Iterate through each term
    for term, doc_tf in tf.items():
        # Iterate through all documents to ensure each term has 1400 values
        for doc in range(1, 1401):
            # Calculate TF-IDF if term exists in IDF
            if term in idf:
                # If term exists in the document, calculate TF-IDF
                tf_idf_values[term][doc] = doc_tf.get(doc, 0) * idf[term]
            else:
                # If term doesn't exist in IDF, set TF-IDF to 0
                tf_idf_values[term][doc] = 0

    return tf_idf_values


In [8]:
# Set input file path and output directory
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# Split documents
split_documents(input_file_path, output_directory)

# Set input directory and N
input_directory = 'split_documents'
N = 1400

In [9]:
# Calculate document frequency
document_frequency = calculate_document_frequency(input_directory)

# Print document frequency for each term
print("Document frequency for each term:")
for term, frequency in document_frequency.items():
    print(f"{term}: {frequency}")

Document frequency for each term:
well: 167
configuration: 42
increment: 4
angles: 96
investigation: 216
potential: 54
velocity: 286
showed: 40
spanwise: 25
specific: 56
at: 771
problem: 237
distribution: 244
found: 320
was: 298
substantial: 16
wing: 181
evidence: 28
basis: 68
results: 597
stream: 232
free: 217
this: 655
together: 40
span: 41
effect: 273
by: 855
control: 45
loading: 89
the: 1391
boundary: 460
made: 352
to: 1256
determine: 102
as: 628
propeller: 23
treatments: 11
flow: 702
were: 294
effects: 326
experiment: 65
after: 45
part: 87
a: 1304
slipstream: 14
empirical: 30
of: 1394
evaluation: 24
experimental: 318
study: 140
curves: 65
theoretical: 217
that: 807
due: 143
produced: 32
different: 106
supporting: 9
aerodynamics: 24
attack: 112
intended: 12
integrated: 31
or: 325
for: 1144
theory: 432
subtracting: 2
lift: 134
with: 1010
increase: 98
and: 1323
agree: 32
in: 1241
remaining: 8
ratios: 100
an: 796
layer: 398
destalling: 2
order: 191
comparative: 6
it: 538
densities: 10

In [11]:
# Calculate term frequency
tf = calculate_tf(input_directory)

# Print term frequency for each term in each document
print("\nTerm frequency for each term in each document:")
for term, doc_tf in tf.items():
    print(f"{term}: {doc_tf}")


Term frequency for each term in each document:
experimental: {1: 2, 11: 1, 12: 1, 17: 1, 19: 1, 25: 1, 29: 1, 30: 1, 35: 1, 41: 1, 42: 1, 47: 1, 52: 1, 53: 1, 58: 1, 69: 1, 70: 1, 74: 1, 78: 1, 84: 2, 99: 2, 101: 1, 103: 1, 112: 1, 115: 1, 121: 1, 123: 2, 137: 1, 140: 1, 142: 1, 154: 1, 156: 1, 168: 1, 170: 1, 171: 1, 173: 2, 176: 1, 179: 2, 183: 1, 184: 1, 186: 2, 187: 1, 188: 1, 189: 1, 191: 1, 195: 3, 197: 2, 202: 1, 203: 1, 206: 2, 207: 2, 212: 1, 216: 1, 220: 1, 222: 1, 225: 1, 227: 1, 230: 1, 234: 4, 245: 1, 251: 1, 256: 2, 257: 1, 262: 1, 271: 2, 273: 1, 277: 1, 282: 1, 283: 1, 286: 1, 294: 1, 295: 1, 304: 1, 307: 1, 329: 2, 330: 1, 334: 2, 338: 1, 339: 1, 344: 2, 345: 1, 346: 2, 347: 1, 354: 1, 360: 1, 369: 1, 370: 1, 372: 2, 377: 1, 397: 1, 409: 1, 411: 2, 413: 2, 418: 1, 420: 1, 421: 1, 423: 1, 427: 1, 435: 1, 439: 1, 441: 1, 442: 2, 453: 1, 455: 2, 462: 1, 464: 1, 467: 1, 484: 3, 494: 2, 496: 1, 497: 1, 498: 1, 501: 1, 503: 1, 504: 1, 505: 1, 511: 1, 518: 1, 520: 2, 522: 3,

In [16]:
# calculate all idf values
idf_t = calculate_idf_t(N, document_frequency)
# print 5 sample terms with their idf values
print("\nIDF for 5 sample terms:")
for term, doc_values in list(idf_t.items())[:5]:
    print(f"{term}: {doc_values}")


IDF for 5 sample terms:
well: 2.1262337031865948
configuration: 3.506557897319982
increment: 5.857933154483459
angles: 2.6798793241355137
investigation: 1.8689491079191851


In [17]:
# Calculate TF-IDF values
tf_idf_values = calculate_tf_idf(tf, idf_t)

# print 5 sample terms with their tf-idf values per document
print("\nTF-IDF values for 5 sample terms.")
for term, doc_values in list(tf_idf_values.items())[:5]:
    print(f"{term}: {doc_values}")


TF-IDF values for 5 sample terms.
experimental: {1: 2.964352265646346, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 1.482176132823173, 12: 1.482176132823173, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 1.482176132823173, 18: 0.0, 19: 1.482176132823173, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 1.482176132823173, 26: 0.0, 27: 0.0, 28: 0.0, 29: 1.482176132823173, 30: 1.482176132823173, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 1.482176132823173, 36: 0.0, 37: 0.0, 38: 0.0, 39: 0.0, 40: 0.0, 41: 1.482176132823173, 42: 1.482176132823173, 43: 0.0, 44: 0.0, 45: 0.0, 46: 0.0, 47: 1.482176132823173, 48: 0.0, 49: 0.0, 50: 0.0, 51: 0.0, 52: 1.482176132823173, 53: 1.482176132823173, 54: 0.0, 55: 0.0, 56: 0.0, 57: 0.0, 58: 1.482176132823173, 59: 0.0, 60: 0.0, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 0.0, 69: 1.482176132823173, 70: 1.482176132823173, 71: 0.0, 72: 0.0, 73: 0.0, 74: 1.482176132823173, 75: 0.0, 76: 0.0, 77: 0.0, 78: 1.48217613

In [18]:
file_dir = 'split_documents'

# return a specific document by its name
# takes docID as input
def return_doc(id):
    for filename in os.listdir(file_dir):
        if filename == 'document_' + str(id) + '.txt':
            return filename

In [19]:
# get 10 best scored docs based on one query
def score(query, tf_idf_dict):
    # filter dictionary according to query - assume sorted dict
    tf_idf_dict_filtered = {term: tf_idf_dict[term] for term in query.split() if term in tf_idf_dict}

    # check if query is valid aka if terms are in term collection
    if not tf_idf_dict_filtered:
        return None

    # for each word in the filtered dict, sum up the tf-idf values column-wise for each document
    scores = np.sum([[tf_idf_dict_filtered[term][doc] for doc in tf_idf_dict_filtered[term].keys()]for term in tf_idf_dict_filtered], axis = 0)

    # returns indices that would sort docs descendingly and give back 10 best docs
    docID_sorted_desc = np.argsort(-scores)[:10]

    ranked_docs = []
    for id in docID_sorted_desc:
        ranked_docs.append(return_doc(id+1))

    return ranked_docs

In [21]:
# split up all queries from cran.qry into data structure
def split_queries():
    # read in cran.qry file
    with open('cran.qry', 'r') as qry_file:
        content = qry_file.read()

        # split queries into text pieces between '.W' and next '.I', including query ID
        pattern = re.compile(r'\.I (\d+)\n\.W\n(.*?)\n(?=\.I|$)', re.DOTALL)
        matches = pattern.findall(content)

        queries = {}
        for m in matches: # loop through all pre-processed queries
            queryID = int(m[0]) # remove leading zeros from query ID
            queries[queryID] = m[1] # add queryID as key and text for queryID as value
        
        # remove new line symbols from queries
        for key, val in queries.items():
            queries[key] = val.replace('\n', ' ')

        return queries
    
# show what processed queries look like
processed_queries = split_queries()
print(processed_queries)

{1: 'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .', 2: 'what are the structural and aeroelastic problems associated with flight of high speed aircraft .', 4: 'what problems of heat conduction in composite slabs have been solved so far .', 8: 'can a criterion be developed to show empirically the validity of flow solutions for chemically reacting gas mixtures based on the simplifying assumption of instantaneous local chemical equilibrium .', 9: 'what chemical kinetic system is applicable to hypersonic aerodynamic problems .', 10: 'what theoretical and experimental guides do we have as to turbulent couette flow behaviour .', 12: 'is it possible to relate the available pressure distributions for an ogive forebody at zero angle of attack to the lower surface pressures of an equivalent ogive forebody at angle of attack .', 13: 'what methods -dash exact or approximate -dash are presently available for predicting body pressures at ang

In [25]:
# manually test one query (queryID 1) and return scored docs
def run_query(id):
    return score(processed_queries[id], tf_idf_values)

qID = 1
doc = run_query(qID)
print("Top-10 highest ranked documents (descendingly) for query ID " + str(qID) + ":")
print(doc)

Top-10 highest ranked documents (descendingly) for query ID 1:
['document_1268.txt', 'document_51.txt', 'document_486.txt', 'document_184.txt', 'document_13.txt', 'document_792.txt', 'document_1144.txt', 'document_14.txt', 'document_12.txt', 'document_588.txt']


In [30]:
# run and score 20 random queries
def eval(tf_idf_dictionary):
    with open('cranqrel', 'r') as file:
        # read cranqrel line by line
        lines = file.readlines()

        # count how many rows per first digit there are in the file
        row_cnt = {}
        for line in lines:
            first_digit = int(line.strip().split()[0])
            row_cnt[first_digit] = row_cnt.get(first_digit, 0) + 1
        
        # filter out only those queries with => 10 rows  
        cranqrel_ids = {qryID for qryID, cnt in row_cnt.items() if cnt >= 15}

    # get all unique ID's of processed query list (queries from cran.qry)  
    cranqry_ids = set(list(processed_queries.keys()))

    # intersect them with query IDs from cran.qry so we only
    # execute queries which exist in cran.qry AND have a corresponding relevancy scoring cranqrel
    intersection_ids = list(cranqrel_ids.intersection(cranqry_ids))
        
    # get 20 random indeces that are mentioned in intersection of ID's
    indeces = random.choices(intersection_ids, k=20)

    # get the queries associated with the 20 indices
    rand_queries = [processed_queries[idx] for idx in indeces]

    qry_key_scored = {}
    # score the queries
    for rand_qry in rand_queries: # for each query
        # get key associated with this query
        qry_key = [key for key, val in processed_queries.items() if val == rand_qry]
        # score the query
        qry_score = score(rand_qry, tf_idf_dictionary)

        #make a dict containing the query ID and associated doc scores
        qry_key_scored[qry_key[0]] = qry_score
    return qry_key_scored

In [44]:
# calculate precisions based on query scores
def calc_prec(eval_scores):
    # read in cranqrel file
    with open('cranqrel', 'r') as eval_file:
        content = eval_file.readlines() # read cranqrel line by line

        # for each query executed and scored, get relevant docs listed in cranqrel and store them
        query_cranqreldocs = {}
        for key, val in eval_scores.items(): # loop through scored queries 
            query_cranqreldocs[key] = [int(line.replace('\n', '').strip().split(' ')[1]) for line in content if line.startswith(str(key) + " ")] # extracting relevant docs per query

        # get ID from docs
        processed_eval_scores = {key: [int(doc.split('_')[1].split('.')[0]) for doc in docs] for key, docs in eval_scores.items()}

        TP_per_query = []
        for (scored_key, scored_docs), (eval_key, eval_docs) in zip(processed_eval_scores.items(), query_cranqreldocs.items()): # loop through each entry in dict of scored docs
            TP = 0            
            for docID in scored_docs: # loop through docIDs of scored docs
                # check if that docID is in the ones from cranqrel
                if docID in eval_docs:
                    TP += 1
            TP_per_query.append(TP)   

        prec = []        
        for idx, TP_p_q in enumerate(TP_per_query):
            prec.append(TP_p_q/10)
    return prec  
                
eval_scores = eval(tf_idf_values)
print("QueryIDs of queries ran:")
print(eval_scores.keys())
print("Precision scores for the ran queries:")
precisions = calc_prec(eval_scores)
print(precisions)
print("Average precision of the system across all 20 queries for the current run:")
print(np.average(precisions))

QueryIDs of queries ran:
dict_keys([201, 212, 132, 2, 157, 57, 156, 1, 67, 217, 225])
Precision scores for the ran queries:
[0.0, 0.1, 0.0, 0.4, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.1]
Average precision of the system across all 20 queries for the current run:
0.10909090909090911


In [None]:
# for top 10 returned docs, look if docID appears in cranqrel -> if yes, then it's relevant

# Prec = relevant docs / retrieved docs
# Retrieved docs = 10 best scored docs we got back, so always equal to 10
# relevant docs: RETRIEVED DOCS which are also IN CRANQREL

# report: which queries we used, what our program gave back VS what is in cranqrel