In [23]:
import os
import re
import math
from collections import defaultdict

In [26]:
def split_documents(input_file_path, output_dir):
    """
    Splits the input file into individual documents and saves each document in a separate file.

    The input is cut at every ``.I <number>`` marker. Anything before the first
    marker is discarded. Each remaining chunk is stripped of surrounding
    whitespace and written to ``document_<i>.txt``, where ``i`` is the 1-based
    position of the chunk in the input (not the number that follows ``.I``).

    Args:
        input_file_path (str): Path to the input file.
        output_dir (str): Directory to save the split documents. It is created
            (including missing parents) when it does not exist yet.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is already there.
    os.makedirs(output_dir, exist_ok=True)

    # Read everything first so the input handle is closed before any output
    # file is opened.
    with open(input_file_path, 'r') as input_file:
        document_content = input_file.read()

    # Element 0 is whatever precedes the first marker, so it is dropped.
    documents = re.split(r'\.I\s+\d+', document_content)[1:]

    for i, document in enumerate(documents, start=1):
        output_path = os.path.join(output_dir, f"document_{i}.txt")
        with open(output_path, 'w') as output_file:
            output_file.write(document.strip())

In [27]:
def calculate_document_frequency(input_dir):
    """
    Counts, for every term, the number of documents that contain it.

    Only files whose name ends in ``.txt`` are read. Text is lower-cased and
    tokenised with the ``\\b\\w+\\b`` pattern; a term is counted at most once
    per file.

    Args:
        input_dir (str): Directory containing the documents.

    Returns:
        dict: A ``defaultdict(int)`` mapping each term to its document frequency.
    """
    word_regex = re.compile(r'\b\w+\b')
    doc_counts = defaultdict(int)

    text_files = (name for name in os.listdir(input_dir) if name.endswith(".txt"))
    for name in text_files:
        with open(os.path.join(input_dir, name), 'r') as handle:
            lowered = handle.read().lower()

        # A set collapses repeated occurrences inside one document.
        for word in set(word_regex.findall(lowered)):
            doc_counts[word] += 1

    return doc_counts

In [28]:
def calculate_idf_t(N, document_frequency):
    """
    Derives the inverse document frequency of every term.

    For a term with document frequency ``df_t`` the value is
    ``log10(N / df_t)``; a term whose document frequency is 0 gets 0.

    Args:
        N (int): Total number of documents in the collection.
        document_frequency (dict): Dictionary containing document frequency of terms.

    Returns:
        dict: A dictionary containing inverse document frequency of terms.
    """
    return {
        # The zero guard keeps the division from raising ZeroDivisionError.
        term: (math.log10(N / df_t) if df_t != 0 else 0)
        for term, df_t in document_frequency.items()
    }

In [29]:
def calculate_tf(document_directory):
    """
    Calculates the term frequency for each term in each document.

    Only files whose name ends in ``.txt`` are read. Text is lower-cased and
    tokenised with the ``\\b\\w+\\b`` pattern. The frequency is the raw number
    of occurrences of the term in the file.

    Args:
        document_directory (str): Directory containing the documents.

    Returns:
        dict: A ``defaultdict(dict)`` of the form ``{term: {filename: count}}``.
    """
    term_frequency = defaultdict(dict)
    # Compiled once; the pattern does not depend on the file being read.
    term_pattern = re.compile(r'\b\w+\b')

    for filename in os.listdir(document_directory):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(document_directory, filename), 'r') as file:
            document_content = file.read().lower()

        # Single pass over the tokens. Calling list.count() for every token
        # rescans the whole document each time and is quadratic in its length.
        counts = {}
        for term in term_pattern.findall(document_content):
            counts[term] = counts.get(term, 0) + 1

        # dict keeps first-occurrence order, so terms enter term_frequency in
        # the order they first appear in the text.
        for term, count in counts.items():
            term_frequency[term][filename] = count

    return term_frequency

In [30]:
def calculate_tf_idf(tf, idf):
    """
    Combines term frequencies and inverse document frequencies into TF-IDF weights.

    Each weight is the term's frequency in a document multiplied by the
    term's inverse document frequency.

    Args:
        tf (dict): Nested dictionary containing term frequency for each term in each document.
        idf (dict): Dictionary containing inverse document frequency of terms.

    Returns:
        dict: A ``defaultdict(dict)`` of the form ``{term: {document: tf * idf}}``.
    """
    weights = defaultdict(dict)

    for term, postings in tf.items():
        # A term without any posting contributes nothing and its idf entry
        # is never looked up.
        if not postings:
            continue
        term_idf = idf[term]
        weights[term].update(
            {doc: count * term_idf for doc, count in postings.items()}
        )

    return weights

In [31]:
# Source collection and the directory that receives one file per document
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# Split documents
split_documents(input_file_path, output_directory)

# The frequency functions read the files that were just written, so reuse the
# same directory instead of repeating the literal.
input_directory = output_directory

# N is the collection size. Count the same ".txt" files the frequency
# functions read, so N cannot drift from the document frequencies it is
# divided by.
N = sum(1 for name in os.listdir(input_directory) if name.endswith(".txt"))

In [32]:
# Document frequency: number of documents each term appears in
document_frequency = calculate_document_frequency(input_directory)

# One "term: count" line per term, in the dictionary's own order
print("Document frequency for each term:")
for term in document_frequency:
    frequency = document_frequency[term]
    print(f"{term}: {frequency}")

Document frequency for each term:
a: 1400
with: 1010
destalling: 2
the: 1391
distribution: 244
propeller: 23
due: 143
showed: 40
of: 1395
treatments: 11
lift: 134
specific: 56
j: 733
control: 45
attack: 112
and: 1342
theoretical: 217
made: 352
subtracting: 2
flow: 703
part: 91
order: 191
velocity: 286
were: 294
slipstream: 14
basis: 68
curves: 65
empirical: 30
comparative: 6
together: 40
by: 857
supporting: 9
in: 1242
problem: 237
evaluation: 24
found: 320
w: 1400
spanwise: 25
effects: 326
t: 1400
an: 796
evidence: 28
results: 597
for: 1145
theory: 432
different: 106
25: 57
or: 325
determine: 102
configuration: 42
scs: 340
free: 217
brenckman: 1
intended: 12
to: 1257
wing: 181
well: 167
ratios: 100
study: 140
effect: 273
integrated: 31
at: 771
investigation: 216
span: 41
after: 45
was: 298
that: 807
m: 289
boundary: 460
ae: 343
potential: 54
as: 628
layer: 398
increase: 98
324: 2
experimental: 318
b: 1400
remaining: 8
this: 655
agree: 32
angles: 97
produced: 32
increment: 4
1958: 90
lo

In [33]:
# Inverse document frequency derived from N and the document frequencies
idf_t = calculate_idf_t(N, document_frequency)

# One "term: idf" line per term, in the dictionary's own order
print("Inverse document frequency for each term:")
for term in idf_t:
    value = idf_t[term]
    print(f"{term}: {value}")

Inverse document frequency for each term:
a: 0.0
with: 0.14180666189559543
destalling: 2.845098040014257
the: 0.0028009056861916346
distribution: 0.7587382093395085
propeller: 1.7844001996606451
due: 0.9907919982131762
showed: 1.5440680443502757
of: 0.0015538280686216335
treatments: 2.104735350520013
lift: 1.0190232373134305
specific: 1.3979400086720377
j: 0.2810240610371101
control: 1.4929155219028944
attack: 1.0969100130080565
and: 0.018375519845264733
theoretical: 0.8096683018297085
made: 0.599585372200107
subtracting: 2.845098040014257
flow: 0.2991727106584141
part: 1.1870866433571445
order: 0.8650946684305105
velocity: 0.6897620025491951
were: 0.6777807052660807
slipstream: 2.0
basis: 1.3136191229720018
curves: 1.3332146790353825
empirical: 1.6690067809585756
comparative: 2.3679767852945943
together: 1.5440680443502757
by: 0.21314721375503987
supporting: 2.191885526238913
in: 0.0520064398376766
problem: 0.7713796896681342
evaluation: 1.765916793966632
found: 0.640978057358332
w: 0

In [34]:
# Term frequency: {term: {filename: count}}
tf = calculate_tf(input_directory)

# One line per term showing its per-document counts
print("\nTerm frequency for each term in each document:")
for term in tf:
    doc_tf = tf[term]
    print(f"{term}: {doc_tf}")



Term frequency for each term in each document:
t: {'document_1.txt': 1, 'document_10.txt': 1, 'document_100.txt': 1, 'document_1000.txt': 1, 'document_1001.txt': 1, 'document_1002.txt': 1, 'document_1003.txt': 1, 'document_1004.txt': 1, 'document_1005.txt': 1, 'document_1006.txt': 1, 'document_1007.txt': 1, 'document_1008.txt': 1, 'document_1009.txt': 1, 'document_101.txt': 1, 'document_1010.txt': 1, 'document_1011.txt': 1, 'document_1012.txt': 1, 'document_1013.txt': 1, 'document_1014.txt': 1, 'document_1015.txt': 1, 'document_1016.txt': 1, 'document_1017.txt': 1, 'document_1018.txt': 1, 'document_1019.txt': 1, 'document_102.txt': 1, 'document_1020.txt': 1, 'document_1021.txt': 1, 'document_1022.txt': 1, 'document_1023.txt': 1, 'document_1024.txt': 1, 'document_1025.txt': 1, 'document_1026.txt': 1, 'document_1027.txt': 1, 'document_1028.txt': 1, 'document_1029.txt': 1, 'document_103.txt': 1, 'document_1030.txt': 1, 'document_1031.txt': 1, 'document_1032.txt': 1, 'document_1033.txt': 

In [12]:
# TF-IDF: term frequency multiplied by inverse document frequency
tf_idf_values = calculate_tf_idf(tf, idf_t)

# One line per term showing its per-document TF-IDF weights
print("\nTF-IDF values for each term in each document:")
for term in tf_idf_values:
    doc_values = tf_idf_values[term]
    print(f"{term}: {doc_values}")


TF-IDF values for each term in each document:
t: {'document_439.txt': 0.0, 'document_363.txt': 0.0, 'document_405.txt': 0.0, 'document_1398.txt': 0.0, 'document_411.txt': 0.0, 'document_377.txt': 0.0, 'document_1373.txt': 0.0, 'document_1367.txt': 0.0, 'document_388.txt': 0.0, 'document_149.txt': 0.0, 'document_3.txt': 0.0, 'document_607.txt': 0.0, 'document_161.txt': 0.0, 'document_175.txt': 0.0, 'document_613.txt': 0.0, 'document_820.txt': 0.0, 'document_1171.txt': 0.0, 'document_834.txt': 0.0, 'document_1165.txt': 0.0, 'document_808.txt': 0.0, 'document_1159.txt': 0.0, 'document_997.txt': 0.0, 'document_983.txt': 0.0, 'document_773.txt': 0.0, 'document_767.txt': 0.0, 'document_1005.txt': 0.0, 'document_954.txt': 0.0, 'document_798.txt': 0.0, 'document_1011.txt': 0.0, 'document_940.txt': 0.0, 'document_1039.txt': 0.0, 'document_968.txt': 0.0, 'document_99.txt': 0.0, 'document_559.txt': 0.0, 'document_217.txt': 0.0, 'document_571.txt': 0.0, 'document_565.txt': 0.0, 'document_203.txt'