In [2]:
# 1. Split cran-1.all.1400 into its individual documents (store each document either in its own file or as part of a data structure).
import os

def split_documents(input_file_path, output_dir):
    """Split a collection file into one text file per document.

    A new document starts at every line that begins with the ``.I`` marker
    (followed by whitespace or the end of the line). Occurrences of ``.I``
    in the middle of a line are ordinary text and do not start a document.
    Anything before the first marker is discarded.

    Each document is written to ``<output_dir>/document_<i>.txt`` (numbered
    from 1 in file order) and contains everything after its ``.I`` marker,
    with leading and trailing whitespace stripped.

    Args:
        input_file_path (str): Path of the collection file to split.
        output_dir (str): Directory receiving the per-document files; it is
            created if it does not exist yet.

    Returns:
        None
    """
    # Make sure the output directory exists (no error if it already does)
    os.makedirs(output_dir, exist_ok=True)

    # Each entry is the list of text fragments belonging to one document
    documents = []
    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            # line[2:3] is '' at end of line or the single character after '.I';
            # the line is a marker only if that character is absent or whitespace.
            if line.startswith('.I') and not line[2:3].strip():
                documents.append([line[2:]])
            elif documents:
                documents[-1].append(line)

    # Write each document to a separate file
    for i, fragments in enumerate(documents, start=1):
        with open(os.path.join(output_dir, f"document_{i}.txt"), 'w') as output_file:
            output_file.write(''.join(fragments).strip())

# Specify the input file path and the output directory path
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# Split the collection and save each document as an individual file
split_documents(input_file_path, output_directory)


In [3]:
# 2. Maintain a list of all terms with their document frequency df_t (the number of documents in the collection that contain term t).
import os
import re # Regular Expression
from collections import defaultdict # defaultdict automatically supplies a default value when a key does not exist

def calculate_document_frequency(input_dir):
    """Count, for every term, how many ``.txt`` files in a directory contain it.

    Each file is lower-cased and tokenized with the pattern ``\\b\\w+\\b``;
    a term contributes at most 1 per file regardless of how often it occurs.

    Args:
        input_dir (str): Directory holding the document files. Only names
            ending in ".txt" are read.

    Returns:
        collections.defaultdict: term -> number of files containing the term
        (missing terms default to 0).
    """
    word_re = re.compile(r'\b\w+\b')
    df_by_term = defaultdict(int)

    # Restrict the directory listing to the document files
    txt_names = (name for name in os.listdir(input_dir) if name.endswith(".txt"))

    for name in txt_names:
        with open(os.path.join(input_dir, name), 'r') as handle:
            lowered_text = handle.read().lower()

        # A set keeps one copy of each term, so repeats inside a file count once
        for token in set(word_re.findall(lowered_text)):
            df_by_term[token] += 1

    return df_by_term

# Specify the input directory (the folder the split documents were written to)
input_directory = 'split_documents'

# Compute the document frequency of every term
term_document_frequency = calculate_document_frequency(input_directory)

# Print the results: a header line (Korean for "Document frequency of each term:")
# followed by one "term: frequency" line per term in the dictionary
print("각 용어의 문서 빈도:")
for term, frequency in term_document_frequency.items():
    print(f"{term}: {frequency}")


각 용어의 문서 빈도:
wave: 168
439: 2
t: 1400
that: 807
under: 195
upper: 41
occurs: 40
experimental: 318
some: 271
because: 61
p: 153
interaction: 82
free: 217
is: 1151
evidence: 28
described: 105
pressure: 520
are: 1029
immediate: 11
the: 1391
boundary: 460
as: 628
8: 93
factor: 47
at: 771
presented: 332
airfoil: 53
0: 219
g: 185
was: 298
in: 1242
present: 180
number: 485
increased: 64
flow: 703
on: 913
which: 595
surface: 296
shock: 237
a: 1400
there: 107
b: 1400
pattern: 39
layer: 398
affecting: 10
extending: 18
transonic: 63
influence: 95
changed: 10
high: 236
1211: 2
edge: 153
terminated: 4
by: 857
far: 40
determining: 49
vicinity: 23
attack: 112
behind: 68
keep: 8
note: 104
of: 1395
separation: 98
then: 127
consequences: 4
paper: 203
important: 75
naca: 191
daley: 1
exists: 23
detached: 17
to: 1257
may: 242
humphreys: 1
with: 1010
technical: 17
change: 60
leading: 118
extensive: 19
lindsey: 2
when: 229
from: 621
3804: 1
enough: 18
attached: 27
stream: 232
support: 19
wood: 5
importance:

In [4]:
# 3. Calculate N, the total number of documents in the collection.
# NOTE(review): the value is hard-coded rather than derived from the split
# output — confirm it matches the number of document_*.txt files.
N=1400

In [5]:
# 4. For each term in list, calculate idc_t
import math

def calculate_idc_t(N, document_frequency):
    """Compute idc_t = 1 / ln(N / df_t) for every term.

    Args:
        N (int): Total number of documents in the collection.
        document_frequency (dict): term -> df_t, the number of documents
            that contain the term.

    Returns:
        dict: term -> idc_t. Terms for which the value is undefined get the
        substitute value 0: df_t == 0 (division by zero in N / df_t) and
        df_t == N (ln(N / df_t) == 0, so the reciprocal does not exist).

    NOTE(review): the result is the reciprocal of the conventional
    idf_t = ln(N / df_t); confirm that this is what "idc_t" is meant to be.
    """
    idc_t = {}

    for term, df_t in document_frequency.items():
        if df_t == 0:
            idc_t[term] = 0  # To prevent division by zero error, treat as a substitute value
            continue

        idf_t = math.log(N / df_t)
        # A term present in every document has idf_t == 0; use the same
        # substitute value instead of raising ZeroDivisionError.
        idc_t[term] = 1 / idf_t if idf_t != 0 else 0

    return idc_t

# Set the value of N
N = 1400

# Example dictionary for document_frequency
# NOTE(review): idc_t below is built from this placeholder dictionary, not from
# term_document_frequency computed in an earlier cell, so every later cell that
# consumes idc_t only knows 'term1', 'term2' and 'term3' — confirm whether the
# real document frequencies were meant to be passed instead.
document_frequency = {'term1': 10, 'term2': 5, 'term3': 0}

# Calculate idc_t
idc_t = calculate_idc_t(N, document_frequency)

# Print the results: a header line, then one "term: value" line per term
print("idc_t values for each term:")
for term, value in idc_t.items():
    print(f"{term}: {value}")


idc_t values for each term:
term1: 0.2023618696943225
term2: 0.17746891550973914
term3: 0


In [6]:
# Calculate the term frequency tf_(t,d) of each term t in each document d and weight it by idc_t.
def calculate_tf_idf(document_directory, idc_t):
    """Weight the term frequency of every known term in every document by idc_t.

    Each ``.txt`` file is lower-cased and tokenized with ``\\b\\w+\\b``; the
    term frequency is the number of whole tokens equal to the term, so a
    term is never counted inside a longer word.

    Args:
        document_directory (str): Directory containing the document files.
            Only names ending in ".txt" are read.
        idc_t (dict): term -> weight. Terms are looked up as given against
            the lower-cased tokens.

    Returns:
        dict: filename -> {term: tf * idc_t[term]} with one entry for every
        term in ``idc_t`` (the value is zero when the term does not occur).
        When ``idc_t`` is empty no filename is recorded.
    """
    token_pattern = re.compile(r'\b\w+\b')
    tf_idf_values = {}

    for filename in os.listdir(document_directory):
        if not filename.endswith(".txt"):
            continue

        # Read the document
        with open(os.path.join(document_directory, filename), 'r') as file:
            document_content = file.read().lower()

        # Calculate term frequency (tf) by counting whole tokens in one pass
        term_frequency = {}
        for token in token_pattern.findall(document_content):
            term_frequency[token] = term_frequency.get(token, 0) + 1

        # Calculate tf-idf for each known term in the document
        weighted = {
            term: term_frequency.get(term, 0) * weight
            for term, weight in idc_t.items()
        }
        if weighted:  # no entry for the document when idc_t is empty
            tf_idf_values[filename] = weighted

    return tf_idf_values

# Calculate tf-idf values for the documents in input_directory, using the
# idc_t dictionary computed in the previous cell
tf_idf_values = calculate_tf_idf(input_directory, idc_t)

# Print the results: one "Document: <filename>" header per document followed
# by a "term: value" line for each entry stored for that document
print("tf-idf values for each term in each document:")
for document, values in tf_idf_values.items():
    print(f"Document: {document}")
    for term, value in values.items():
        print(f"{term}: {value}")


tf-idf values for each term in each document:
Document: document_439.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_363.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_405.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_1398.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_411.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_377.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_1373.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_1367.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_388.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_149.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_3.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_607.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_161.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_175.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_613.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_820.txt
term1: 0.0
term2: 0.0
term3: 0
Document: document_1171.t

In [7]:
# Calculate the tf-idf for each term in each document.
import os
import math
from collections import defaultdict

def calculate_tf_idf(document_directory, idc_t):
    """
    Calculate the tf-idf for each term in each document.

    Documents are lower-cased and tokenized with the pattern ``\\b\\w+\\b``
    (the same pattern used when the document frequencies are computed), so
    punctuation attached to a word does not prevent it from matching a key
    of ``idc_t``.

    Args:
        document_directory (str): The directory containing the documents.
            Only names ending in ".txt" are read.
        idc_t (dict): A dictionary containing idc_t values for each term.

    Returns:
        dict: filename -> {term: tf * idc_t[term]} for the terms that occur
        in the document and are present in ``idc_t``. A document with no
        such term maps to an empty dictionary.
    """
    token_pattern = re.compile(r'\b\w+\b')
    tf_idf_values = {}

    # Iterate through each document
    for filename in os.listdir(document_directory):
        if filename.endswith(".txt"):
            document_path = os.path.join(document_directory, filename)

            # Calculate term frequency for each term in the document
            term_frequency = defaultdict(int)
            with open(document_path, 'r') as file:
                for term in token_pattern.findall(file.read().lower()):
                    term_frequency[term] += 1

            # Calculate tf-idf for each term that has a known idc_t value
            # and store the results
            tf_idf_values[filename] = {
                term: tf * idc_t[term]
                for term, tf in term_frequency.items()
                if term in idc_t
            }

    return tf_idf_values

# Calculate tf-idf values with the given input directory and idc_t values
# (both are globals assigned in earlier cells)
tf_idf_values = calculate_tf_idf(input_directory, idc_t)

# Print the results: one "Document: <filename>" header per document followed
# by a "term: value" line for each entry stored for that document
print("tf-idf values for each term in each document:")
for document, values in tf_idf_values.items():
    print(f"Document: {document}")
    for term, value in values.items():
        print(f"{term}: {value}")


tf-idf values for each term in each document:
Document: document_439.txt
Document: document_363.txt
Document: document_405.txt
Document: document_1398.txt
Document: document_411.txt
Document: document_377.txt
Document: document_1373.txt
Document: document_1367.txt
Document: document_388.txt
Document: document_149.txt
Document: document_3.txt
Document: document_607.txt
Document: document_161.txt
Document: document_175.txt
Document: document_613.txt
Document: document_820.txt
Document: document_1171.txt
Document: document_834.txt
Document: document_1165.txt
Document: document_808.txt
Document: document_1159.txt
Document: document_997.txt
Document: document_983.txt
Document: document_773.txt
Document: document_767.txt
Document: document_1005.txt
Document: document_954.txt
Document: document_798.txt
Document: document_1011.txt
Document: document_940.txt
Document: document_1039.txt
Document: document_968.txt
Document: document_99.txt
Document: document_559.txt
Document: document_217.txt
Doc