In [75]:
import os
import re
import math
from collections import defaultdict
import random

In [7]:
def split_documents(input_file_path, output_dir):
    """
    Splits the input file into individual documents and saves each document in a separate file.

    Args:
        input_file_path (str): Path to the input file.
        output_dir (str): Directory to save the split documents.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(input_file_path, 'r') as input_file:
        document_content = input_file.read()

        documents = re.split(r'\.I\s+\d+', document_content)[1:]

        for i, document in enumerate(documents, start=1):
            document = document.strip()
            with open(os.path.join(output_dir, f"document_{i}.txt"), 'w') as output_file:
                output_file.write(document)

In [8]:
# TODO: think if we can implement this with hashmap
def calculate_document_frequency(input_dir):
    """
    Calculates the document frequency of terms in the collection.

    Args:
        input_dir (str): Directory containing the documents.

    Returns:
        dict: A dictionary containing document frequency of terms.
    """
    document_frequency = defaultdict(int)
    term_pattern = re.compile(r'\b\w+\b')

    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            with open(os.path.join(input_dir, filename), 'r') as file:
                document_content = file.read().lower()

                terms_in_document = set(re.findall(term_pattern, document_content))

                for term in terms_in_document:
                    document_frequency[term] += 1

    return document_frequency

In [9]:
def calculate_idf_t(N, document_frequency):
    """
    Calculates the inverse document frequency for each term.

    Args:
        N (int): Total number of documents in the collection.
        document_frequency (dict): Dictionary containing document frequency of terms.

    Returns:
        dict: A dictionary containing inverse document frequency of terms.
    """
    idf_t = {}

    for term, df_t in document_frequency.items():
        if df_t == 0:
            idf_t[term] = 0
        else:
            idf_t[term] = math.log10(N / df_t)

    return idf_t

In [10]:
def calculate_tf(document_directory):
    """
    Calculates the term frequency for each term in each document.

    Args:
        document_directory (str): Directory containing the documents.

    Returns:
        dict: A nested dictionary containing term frequency for each term in each document.
    """
    term_frequency = defaultdict(dict)

    for filename in os.listdir(document_directory):
        if filename.endswith(".txt"):
            with open(os.path.join(document_directory, filename), 'r') as file:
                document_content = file.read().lower()

                term_pattern = re.compile(r'\b\w+\b')
                terms_in_document = re.findall(term_pattern, document_content)

                for term in terms_in_document:
                    term_frequency[term][filename] = terms_in_document.count(term)

    return term_frequency

In [11]:
def calculate_tf_idf(tf, idf):
    """
    Calculates the TF-IDF values for each term in each document.

    Args:
        tf (dict): Nested dictionary containing term frequency for each term in each document.
        idf (dict): Dictionary containing inverse document frequency of terms.

    Returns:
        dict: A nested dictionary containing TF-IDF values for each term in each document.
    """
    tf_idf_values = defaultdict(dict)

    for term, term_freq in tf.items():
        for doc, freq in term_freq.items():
            tf_idf_values[term][doc] = freq * idf[term]

    return tf_idf_values

In [12]:
# Set input file path and output directory
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# Split documents
split_documents(input_file_path, output_directory)

# Set input directory and N
input_directory = 'split_documents'
N = 1400

In [13]:
# Calculate document frequency
document_frequency = calculate_document_frequency(input_directory)

# Print document frequency for each term
print("Document frequency for each term:")
for term, frequency in document_frequency.items():
    print(f"{term}: {frequency}")

Document frequency for each term:
25: 57
that: 807
spanwise: 25
at: 771
wing: 181
with: 1010
b: 1400
an: 796
investigation: 216
were: 294
ae: 343
substantial: 16
after: 45
increase: 98
determine: 102
found: 320
showed: 40
was: 298
subtracting: 2
w: 1400
made: 352
velocity: 286
theory: 432
intended: 12
specific: 56
together: 40
distribution: 244
study: 140
the: 1391
agree: 32
to: 1257
j: 733
problem: 237
boundary: 460
basis: 68
free: 217
attack: 112
integrated: 31
aerodynamics: 26
or: 325
scs: 340
of: 1395
1958: 90
comparative: 6
for: 1145
different: 106
curves: 65
configuration: 42
produced: 32
potential: 54
treatments: 11
and: 1342
evidence: 28
empirical: 30
a: 1400
experiment: 65
well: 167
effect: 273
propeller: 23
as: 628
layer: 398
slipstream: 14
theoretical: 217
due: 143
results: 597
experimental: 318
flow: 703
lift: 134
m: 289
in: 1242
t: 1400
loading: 89
supporting: 9
remaining: 8
stream: 232
destalling: 2
increment: 4
324: 2
brenckman: 1
control: 45
span: 41
order: 191
effects:

In [14]:
# Calculate inverse document frequency
idf_t = calculate_idf_t(N, document_frequency)

# Print inverse document frequency for each term
print("Inverse document frequency for each term:")
for term, value in idf_t.items():
    print(f"{term}: {value}")

Inverse document frequency for each term:
25: 1.3902531800057467
that: 0.23925450095616763
spanwise: 1.7481880270062005
at: 0.2590736576272811
wing: 0.8884494608090535
with: 0.14180666189559543
b: 0.0
an: 0.245214967940569
investigation: 0.8116742845273072
were: 0.6777807052660807
ae: 0.6108339156354675
substantial: 1.9420080530223132
after: 1.4929155219028944
increase: 1.1549019599857433
determine: 1.1375278639163204
found: 0.640978057358332
showed: 1.5440680443502757
was: 0.6719117716019828
subtracting: 2.845098040014257
w: 0.0
made: 0.599585372200107
velocity: 0.6897620025491951
theory: 0.5106442888633259
intended: 2.066946789630613
specific: 1.3979400086720377
together: 1.5440680443502757
distribution: 0.7587382093395085
study: 1.0
the: 0.0028009056861916346
agree: 1.6409780573583321
to: 0.04679275799228029
j: 0.2810240610371101
problem: 0.7713796896681342
boundary: 0.48337020399666397
basis: 1.3136191229720018
free: 0.8096683018297085
attack: 1.0969100130080565
integrated: 1.65476

In [15]:
# Calculate term frequency
tf = calculate_tf(input_directory)

# Print term frequency for each term in each document
print("\nTerm frequency for each term in each document:")
for term, doc_tf in tf.items():
    print(f"{term}: {doc_tf}")



Term frequency for each term in each document:
t: {'document_1.txt': 1, 'document_10.txt': 1, 'document_100.txt': 1, 'document_1000.txt': 1, 'document_1001.txt': 1, 'document_1002.txt': 1, 'document_1003.txt': 1, 'document_1004.txt': 1, 'document_1005.txt': 1, 'document_1006.txt': 1, 'document_1007.txt': 1, 'document_1008.txt': 1, 'document_1009.txt': 1, 'document_101.txt': 1, 'document_1010.txt': 1, 'document_1011.txt': 1, 'document_1012.txt': 1, 'document_1013.txt': 1, 'document_1014.txt': 1, 'document_1015.txt': 1, 'document_1016.txt': 1, 'document_1017.txt': 1, 'document_1018.txt': 1, 'document_1019.txt': 1, 'document_102.txt': 1, 'document_1020.txt': 1, 'document_1021.txt': 1, 'document_1022.txt': 1, 'document_1023.txt': 1, 'document_1024.txt': 1, 'document_1025.txt': 1, 'document_1026.txt': 1, 'document_1027.txt': 1, 'document_1028.txt': 1, 'document_1029.txt': 1, 'document_103.txt': 1, 'document_1030.txt': 1, 'document_1031.txt': 1, 'document_1032.txt': 1, 'document_1033.txt': 

In [16]:
# Calculate TF-IDF values ##
tf_idf_values = calculate_tf_idf(tf, idf_t)

# Print TF-IDF values for each term in each document
print("\nTF-IDF values for each term in each document:")
for term, doc_values in tf_idf_values.items():
    print(f"{term}: {doc_values}")


TF-IDF values for each term in each document:
t: {'document_1.txt': 0.0, 'document_10.txt': 0.0, 'document_100.txt': 0.0, 'document_1000.txt': 0.0, 'document_1001.txt': 0.0, 'document_1002.txt': 0.0, 'document_1003.txt': 0.0, 'document_1004.txt': 0.0, 'document_1005.txt': 0.0, 'document_1006.txt': 0.0, 'document_1007.txt': 0.0, 'document_1008.txt': 0.0, 'document_1009.txt': 0.0, 'document_101.txt': 0.0, 'document_1010.txt': 0.0, 'document_1011.txt': 0.0, 'document_1012.txt': 0.0, 'document_1013.txt': 0.0, 'document_1014.txt': 0.0, 'document_1015.txt': 0.0, 'document_1016.txt': 0.0, 'document_1017.txt': 0.0, 'document_1018.txt': 0.0, 'document_1019.txt': 0.0, 'document_102.txt': 0.0, 'document_1020.txt': 0.0, 'document_1021.txt': 0.0, 'document_1022.txt': 0.0, 'document_1023.txt': 0.0, 'document_1024.txt': 0.0, 'document_1025.txt': 0.0, 'document_1026.txt': 0.0, 'document_1027.txt': 0.0, 'document_1028.txt': 0.0, 'document_1029.txt': 0.0, 'document_103.txt': 0.0, 'document_1030.txt': 0

In [17]:
file_dir = 'split_documents'

def return_doc(id):
    for filename in os.listdir(file_dir):
        if filename == 'document_' + str(id) + '.txt':
            return filename

In [96]:
import numpy as np

tf_idf_dictionary = {
    'hello': {'color': 1, 'taste': 2, 'taste2': 56},
    'there': {'color': 3, 'taste': 0, 'taste2': 6},
    'apple': {'color': 5, 'taste': 1, 'taste2': 0}
}
# Query string
query = "hello there"

def score(query, tf_idf_dict):
    # filter dictionary according to query - assume sorted dict
    tf_idf_dict_filtered = {term: tf_idf_dict[term] for term in query.split() if term in tf_idf_dict}

    if not tf_idf_dict_filtered:
        #print("Query matches no document")
        return None

    # for each word in the filtered dict, sum up the tf-idf values column-wise for each document
    scores = np.sum([[tf_idf_dict_filtered[term][doc] for doc in tf_idf_dict_filtered[term].keys()]for term in tf_idf_dict_filtered], axis = 0)
    # returns indices that would sort docs descendingly and give back 10 best docs
    docID_sorted_desc = np.argsort(-scores)[:10]

    docs = []
    for id in docID_sorted_desc:
        docs.append(return_doc(id+1))

    return docs
    

score(query, tf_idf_dictionary)

['document_3.txt', 'document_1.txt', 'document_2.txt']

In [97]:
# split up all queries into data structure
def split_queries():
    # read in cran.qry file
    with open('cran.qry', 'r') as eval_file:
        content = eval_file.read()

        # split queries into text pieces between '.W' and next '.I'
        pattern = re.compile(r'\.W\n(.*?)\n(?=\.I|$)', re.DOTALL)
        queries = pattern.findall(content)

        # remove new line tokens
        processed_queries = [query.replace('\n', ' ') for query in queries]

        # map a sequential, continuous ID to each query
        query_dict = {cnt: qry for cnt, qry in enumerate(processed_queries)}
        return query_dict

# get query dict
processed_queries = split_queries()
print(processed_queries)

{0: 'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .', 1: 'what are the structural and aeroelastic problems associated with flight of high speed aircraft .', 2: 'what problems of heat conduction in composite slabs have been solved so far .', 3: 'can a criterion be developed to show empirically the validity of flow solutions for chemically reacting gas mixtures based on the simplifying assumption of instantaneous local chemical equilibrium .', 4: 'what chemical kinetic system is applicable to hypersonic aerodynamic problems .', 5: 'what theoretical and experimental guides do we have as to turbulent couette flow behaviour .', 6: 'is it possible to relate the available pressure distributions for an ogive forebody at zero angle of attack to the lower surface pressures of an equivalent ogive forebody at angle of attack .', 7: 'what methods -dash exact or approximate -dash are presently available for predicting body pressures at angle 

In [98]:
# run one query and return scored docs
def run_query(id):
    return score(processed_queries[id], tf_idf_dictionary)

doc = run_query(1)
print(doc)

None


In [125]:
# run and score 20 random queries
def eval():
    # get 20 random indeces for calling queries
    indeces = [random.randint(0, len(processed_queries)-1) for _ in range(20)]

    # get the 20 queries associated with the indices
    rand_queries = [processed_queries[idx] for idx in indeces]

    qry_key_scored = {}
    # score the 20 queries
    for rand_qry in rand_queries: # for each query
        # get key associated with this query
        qry_key = [key for key, val in processed_queries.items() if val == rand_qry]
        # score the query
        qry_score = score(rand_qry, tf_idf_dictionary)

        #make a dict containing the query ID and associated doc scores
        qry_key_scored[qry_key[0]] = qry_score
    return qry_key_scored


rand_eval = eval()
print(rand_eval)

{3: None, 29: None, 68: None, 200: None, 37: None, 159: None, 202: None, 110: None, 157: None, 50: None, 215: None, 43: None, 160: ['document_3.txt', 'document_1.txt', 'document_2.txt'], 8: None, 150: None, 18: ['document_3.txt', 'document_1.txt', 'document_2.txt'], 129: None, 213: None, 151: None, 182: None}


In [31]:

def calc_acc(scored_docIDs):
    with open('cran.qry', 'r') as eval_file:
        content = eval_file.read()

        pattern = re.compile(r'\.W\n(.*?)\n(?=\.I|$)', re.DOTALL)
        queries = pattern.findall(content)
        processed_queries = [query.replace('\n', '') for query in queries]
        # split content on number change
        # if scored_docIDs

calc_acc(2)

['what similarity laws must be obeyed when constructing aeroelastic modelsof heated high speed aircraft .', 'what are the structural and aeroelastic problems associated with flightof high speed aircraft .', 'what problems of heat conduction in composite slabs have been solved sofar .', 'can a criterion be developed to show empirically the validity of flowsolutions for chemically reacting gas mixtures based on the simplifyingassumption of instantaneous local chemical equilibrium .', 'what chemical kinetic system is applicable to hypersonic aerodynamicproblems .', 'what theoretical and experimental guides do we have as to turbulentcouette flow behaviour .', 'is it possible to relate the available pressure distributions for anogive forebody at zero angle of attack to the lower surface pressures ofan equivalent ogive forebody at angle of attack .', 'what methods -dash exact or approximate -dash are presently availablefor predicting body pressures at angle of attack.', 'papers on internal /

In [None]:
# queryID, docID, relevant
# query 1, gives back 184, has relevance x
# query gives back 

# for top 10 returned docs, look if docID appears in cranqrel -> if yes, then it's relevant

# acc = TP + TN / TP * TN * FP * FN
# TP = it is in cranqrel and also in my results
# TN = it is not in cranqrel and also not in my results
# FP = it is not in cranqrel, but it is in my results
# FN = it is in cranrel,but not in my results