In [2]:
# 1. split cran-1.all-1400 into the single documents (store each singular document either in its own file or part of data structure.)
import os

def split_documents(input_file_path, output_dir):
    # Make sure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Open the input file
    with open(input_file_path, 'r') as input_file:
        document_content = input_file.read()

        # Split the document by .I (document identifier)
        documents = document_content.split('.I')[1:]

        # Write each document to a separate file
        for i, document in enumerate(documents, start=1):
            document = document.strip()
            # Write each document to a separate file
            with open(os.path.join(output_dir, f"document_{i}.txt"), 'w') as output_file:
                output_file.write(document)

# 입력 파일 경로와 출력 디렉토리 경로 지정
input_file_path = 'cran-1.all.1400'
output_directory = 'split_documents'

# 문서를 분리하고 각 문서를 개별 파일로 저장
split_documents(input_file_path, output_directory)


In [3]:
# 2. maintain list of all terms with their document frequency df_t(count of how many documents in the collection contain term t) 
import os
import re # Regular Expression
from collections import defaultdict # defaultdict을 사용하면 해당 키가 존재하지 않을 경우 자동으로 기본 값

def calculate_document_frequency(input_dir):
    # Dictionary to store document frequency of terms
    document_frequency = defaultdict(int)

    # Regular expression to extract terms from document
    term_pattern = re.compile(r'\b\w+\b')

    # Iterate through each document
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            # Read the document
            with open(os.path.join(input_dir, filename), 'r') as file:
                document_content = file.read().lower()

                # Extract terms from the document
                terms_in_document = set(re.findall(term_pattern, document_content))

                # Update document frequency for each term
                for term in terms_in_document:
                    document_frequency[term] += 1

    return document_frequency

# 입력 디렉토리 경로 지정
input_directory = 'split_documents'

# 각 용어의 문서 빈도 계산
term_document_frequency = calculate_document_frequency(input_directory)

# 결과 출력
print("각 용어의 문서 빈도:")
for term, frequency in term_document_frequency.items():
    print(f"{term}: {frequency}")


각 용어의 문서 빈도:
affecting: 10
boundary: 460
free: 217
observed: 61
stream: 232
naca: 191
to: 1257
w: 1400
importance: 43
as: 628
from: 621
shock: 237
extensive: 19
layer: 398
a: 1400
interaction: 82
far: 40
1211: 2
detached: 17
keep: 8
immediate: 11
0: 219
an: 796
some: 271
region: 149
that: 807
because: 61
tn: 245
in: 1242
be: 691
behind: 68
439: 2
consequences: 4
surface: 296
with: 1010
daley: 1
paper: 203
is: 1151
p: 153
was: 298
flow: 703
then: 127
mach: 390
8: 93
factor: 47
separation: 98
abruptly: 3
proposed: 39
there: 107
under: 195
leading: 118
pattern: 39
of: 1395
pressure: 520
airfoil: 53
enough: 18
wave: 168
described: 105
influence: 95
1956: 74
number: 485
may: 242
increased: 64
terminated: 4
on: 913
edge: 153
lindsey: 2
3804: 1
extending: 18
determining: 49
present: 180
at: 771
through: 141
high: 236
conditions: 285
occurs: 40
b: 1400
attack: 112
technical: 17
transonic: 63
humphreys: 1
supersonic: 267
support: 19
the: 1391
change: 60
experimental: 318
vicinity: 23
which: 595

In [4]:
# 3. calculate N 
N=1400

In [5]:
# 4. For each term in list, calculate idc_t
import math

def calculate_idc_t(N, document_frequency):
    idc_t = {}

    for term, df_t in document_frequency.items():
        if df_t == 0:
            idc_t[term] = 0  # To prevent division by zero error, treat as a substitute value
        else:
            idf_t = math.log(N / df_t)
            idc_t[term] = 1 / idf_t

    return idc_t

# Set the value of N
N = 1400

# Example dictionary for document_frequency
document_frequency = {'term1': 10, 'term2': 5, 'term3': 0}

# Calculate idc_t
idc_t = calculate_idc_t(N, document_frequency)

# Print the results
print("idc_t values for each term:")
for term, value in idc_t.items():
    print(f"{term}: {value}")


idc_t values for each term:
term1: 0.2023618696943225
term2: 0.17746891550973914
term3: 0
