In [1]:
import math
import os
import re
from collections import Counter, defaultdict

In [2]:
def split_documents(input_file_path, output_dir):
    """
    Splits the input file into individual documents and saves each document in a separate file.

    A document begins at a delimiter line of the form ``.I <number>``. The text
    between one delimiter and the next (or the end of the file) is stripped of
    surrounding whitespace and written to ``document_<k>.txt``, where ``k`` is
    the 1-based position of the document in the input file (not the number
    printed on the delimiter line). Text before the first delimiter is discarded.

    Args:
        input_file_path (str): Path to the input file.
        output_dir (str): Directory to save the split documents. It is created,
            including missing parent directories, if it does not exist.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Read the whole input first so its handle is closed before outputs are written.
    with open(input_file_path, 'r') as input_file:
        document_content = input_file.read()

    # Only a line consisting solely of ".I <digits>" is a delimiter. Without the
    # line anchors, body text such as "fig.I 2" would split a document in two.
    documents = re.split(
        r'^\.I[ \t]+\d+[ \t]*$', document_content, flags=re.MULTILINE
    )[1:]  # element 0 is whatever precedes the first delimiter

    for i, document in enumerate(documents, start=1):
        with open(os.path.join(output_dir, f"document_{i}.txt"), 'w') as output_file:
            output_file.write(document.strip())

In [8]:
def calculate_document_frequency(input_dir):
    """
    Counts, for every term, how many documents in the collection contain it.

    Only files whose name ends in ".txt" are read. Each file is lower-cased and
    tokenised into runs of word characters; a term is counted once per file
    regardless of how often it occurs in that file.

    Args:
        input_dir (str): Directory containing the documents.

    Returns:
        dict: A defaultdict(int) mapping each term to its document frequency.
    """
    token_re = re.compile(r'\b\w+\b')
    df_counts = defaultdict(int)

    txt_names = (name for name in os.listdir(input_dir) if name.endswith(".txt"))
    for name in txt_names:
        with open(os.path.join(input_dir, name), 'r') as handle:
            lowered = handle.read().lower()

        # A set collapses repeats so each term counts once per document.
        for token in set(re.findall(token_re, lowered)):
            df_counts[token] += 1

    return df_counts

In [3]:
def calculate_idf_t(N, document_frequency):
    """
    Derives the inverse document frequency of every term.

    For a term with document frequency df, the value is the natural logarithm
    of N / df. A term whose document frequency is 0 is assigned 0 so that no
    division by zero occurs.

    Args:
        N (int): Total number of documents in the collection.
        document_frequency (dict): Dictionary containing document frequency of terms.

    Returns:
        dict: A dictionary containing inverse document frequency of terms.
    """
    return {
        term: (math.log(N / count) if count != 0 else 0)
        for term, count in document_frequency.items()
    }


In [4]:
def calculate_tf(document_directory):
    """
    Calculates the term frequency for each term in each document.

    Only files whose name ends in ".txt" are read. Each file is lower-cased and
    tokenised into runs of word characters; the term frequency is the raw
    number of times a token occurs in that file.

    Args:
        document_directory (str): Directory containing the documents.

    Returns:
        dict: A nested defaultdict(dict) of the form
            {term: {filename: count}}. A term only has entries for the files
            in which it occurs.
    """
    term_frequency = defaultdict(dict)
    # Compiled once up front rather than once per file.
    term_pattern = re.compile(r'\b\w+\b')

    for filename in os.listdir(document_directory):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(document_directory, filename), 'r') as file:
            document_content = file.read().lower()

        # Counter tallies all tokens in a single pass; calling list.count()
        # for every token re-scanned the whole document each time (quadratic).
        occurrences = Counter(term_pattern.findall(document_content))
        for term, count in occurrences.items():
            term_frequency[term][filename] = count

    return term_frequency


In [5]:
def calculate_tf_idf(tf, idf):
    """
    Combines term frequencies and inverse document frequencies into TF-IDF weights.

    Each weight is the term's frequency in a document multiplied by the term's
    inverse document frequency.

    Args:
        tf (dict): Nested dictionary containing term frequency for each term in each document.
        idf (dict): Dictionary containing inverse document frequency of terms.

    Returns:
        dict: A nested defaultdict(dict) of the form {term: {document: weight}}.

    Raises:
        KeyError: If a term that occurs in at least one document has no entry in idf.
    """
    weights = defaultdict(dict)

    for term in tf:
        per_document = tf[term]
        for doc in per_document:
            # idf is looked up per entry, so a term with no documents never touches it.
            weights[term][doc] = per_document[doc] * idf[term]

    return weights

In [9]:
# Set input file path and output directory
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# Split documents
split_documents(input_file_path, output_directory)

# The documents to analyse are exactly the ones the split step just wrote
input_directory = output_directory

# N is the collection size. Count the .txt files that the frequency functions
# will actually read instead of hard-coding 1400, so the IDF values stay
# consistent with the document frequencies if the input file changes.
N = sum(1 for name in os.listdir(input_directory) if name.endswith(".txt"))

# Calculate document frequency
document_frequency = calculate_document_frequency(input_directory)

# Calculate inverse document frequency
idf_t = calculate_idf_t(N, document_frequency)



In [11]:
# Show the IDF weight computed for every term, one "term: value" pair per line
print("Inverse document frequency for each term:")
for term, value in idf_t.items():
    print(term, value, sep=": ")


Inverse document frequency for each term:
far: 3.5553480614894135
was: 1.5471340290979456
in: 0.11974925310992582
on: 0.42749163500838155
be: 0.70608769183568
b: 0.0
and: 0.04231119807172283
shock: 1.7761673744682187
pattern: 3.5806658694737035
described: 2.5902671654458267
as: 0.8016873491351513
humphreys: 7.24422751560335
lindsey: 6.551080335043404
which: 0.8556661100577203
because: 3.133353651430039
evidence: 3.912023005428146
some: 1.6421086947236492
surface: 1.5538680612792897
supersonic: 1.6569788572031006
w: 0.0
upper: 3.5306554488990423
extending: 4.353855757707185
boundary: 1.1130010261202095
presented: 1.4390925466868614
t: 0.0
tn: 1.742969305058623
airfoil: 3.2739356020512282
immediate: 4.846332242804979
abruptly: 6.145615226935241
when: 1.8105055120491103
transonic: 3.101092789211817
0: 1.8551557857868493
from: 0.8128964336698711
stream: 1.79749014393704
at: 0.5965391420400205
paper: 1.9310215365615626
p: 2.2137895942109145
enough: 4.353855757707185
of: 0.003577821347883902

In [12]:
# Build the per-document term counts for the split collection
tf = calculate_tf(input_directory)

# Show each term next to its {document: count} mapping
print("\nTerm frequency for each term in each document:")
for term, doc_tf in tf.items():
    print(term, doc_tf, sep=": ")


Term frequency for each term in each document:
t: {'document_439.txt': 1, 'document_363.txt': 1, 'document_405.txt': 1, 'document_1398.txt': 1, 'document_411.txt': 1, 'document_377.txt': 1, 'document_1373.txt': 1, 'document_1367.txt': 1, 'document_388.txt': 1, 'document_149.txt': 1, 'document_3.txt': 1, 'document_607.txt': 2, 'document_161.txt': 1, 'document_175.txt': 1, 'document_613.txt': 1, 'document_820.txt': 1, 'document_1171.txt': 1, 'document_834.txt': 2, 'document_1165.txt': 2, 'document_808.txt': 1, 'document_1159.txt': 1, 'document_997.txt': 1, 'document_983.txt': 1, 'document_773.txt': 1, 'document_767.txt': 1, 'document_1005.txt': 1, 'document_954.txt': 1, 'document_798.txt': 1, 'document_1011.txt': 1, 'document_940.txt': 1, 'document_1039.txt': 1, 'document_968.txt': 1, 'document_99.txt': 1, 'document_559.txt': 1, 'document_217.txt': 1, 'document_571.txt': 2, 'document_565.txt': 1, 'document_203.txt': 1, 'document_1207.txt': 1, 'document_1213.txt': 1, 'document_72.txt': 1

In [13]:
# Weight every term count by the term's inverse document frequency
tf_idf_values = calculate_tf_idf(tf, idf_t)

# Show each term next to its {document: weight} mapping
print("\nTF-IDF values for each term in each document:")
for term, doc_values in tf_idf_values.items():
    print(term, doc_values, sep=": ")


TF-IDF values for each term in each document:
t: {'document_439.txt': 0.0, 'document_363.txt': 0.0, 'document_405.txt': 0.0, 'document_1398.txt': 0.0, 'document_411.txt': 0.0, 'document_377.txt': 0.0, 'document_1373.txt': 0.0, 'document_1367.txt': 0.0, 'document_388.txt': 0.0, 'document_149.txt': 0.0, 'document_3.txt': 0.0, 'document_607.txt': 0.0, 'document_161.txt': 0.0, 'document_175.txt': 0.0, 'document_613.txt': 0.0, 'document_820.txt': 0.0, 'document_1171.txt': 0.0, 'document_834.txt': 0.0, 'document_1165.txt': 0.0, 'document_808.txt': 0.0, 'document_1159.txt': 0.0, 'document_997.txt': 0.0, 'document_983.txt': 0.0, 'document_773.txt': 0.0, 'document_767.txt': 0.0, 'document_1005.txt': 0.0, 'document_954.txt': 0.0, 'document_798.txt': 0.0, 'document_1011.txt': 0.0, 'document_940.txt': 0.0, 'document_1039.txt': 0.0, 'document_968.txt': 0.0, 'document_99.txt': 0.0, 'document_559.txt': 0.0, 'document_217.txt': 0.0, 'document_571.txt': 0.0, 'document_565.txt': 0.0, 'document_203.txt'