In [11]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Define file paths
input_entities_file = "cleaned_datasets/NER_output_entities.txt"
input_report_file = "cleaned_datasets/cleaned_SUAS_final_report.txt"
output_file = "cleaned_datasets/TD_IDF_Analysis_Output.txt"

# Read entities from input_entities_file.
# Assumed line format (inferred from the parsing below -- confirm against the
# NER step that writes this file): "<label>: <entity>, <rest>". The entity is
# the text after the first ':' up to the next ',' (or the next ':'),
# trimmed and lowercased.
entities = []
with open(input_entities_file, 'r', encoding='utf-8') as file:
    for line in file:
        fields = line.split(':')
        if len(fields) < 2:
            # Blank or malformed line without a ':' -- skip it rather than
            # crash with IndexError.
            continue
        entity = fields[1].strip().split(',')[0].strip().lower()
        entities.append(entity)

# Read cleaned report text
with open(input_report_file, 'r', encoding='utf-8') as file:
    report_text = file.read().lower()  # Convert to lowercase

# Wrap the report in a one-document corpus (the list-of-documents shape the
# vectorizer expects); actual tokenization happens inside the vectorizer.
corpus = [report_text]

def custom_tokenizer(text):
    """Return the tokens of *text*: capitalised name phrases, then all words.

    The result is a single list made of two parts, in this order:

    1. Every run of one or more capitalised words separated by a single
       whitespace character (e.g. ``"Marc Cruz"``), kept as one token.
    2. Every individual ``\\w+`` word in the text, in order of appearance.

    Words that belong to a name phrase therefore appear twice (once inside
    the phrase, once on their own). The name pattern requires uppercase
    initials, so text that has already been lowercased yields no name
    phrases and only the plain word tokens are returned.
    """
    # Multi-word capitalised phrases, each captured as one token.
    capitalised_phrases = re.findall(
        r'\b(?:[A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\b', text
    )

    # Plain word tokens (letters, digits, underscore).
    single_words = re.findall(r'\b\w+\b', text)

    return [*capitalised_phrases, *single_words]

# Calculate TF-IDF
# NOTE: `corpus` is fitted as-is; when it holds only one document every term
# receives the same IDF weight, so the scores are effectively L2-normalised
# term frequencies rather than a true cross-document TF-IDF.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    # A custom tokenizer replaces the default token_pattern; passing None
    # avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
vocabulary = tfidf_vectorizer.vocabulary_  # term -> column index in tfidf_matrix

# Write TF-IDF scores to output file (one line per entity, in input order).
# Explicit UTF-8 so non-ASCII entity names do not depend on the platform's
# default encoding.
with open(output_file, 'w', encoding='utf-8') as file:
    for entity in entities:
        index = vocabulary.get(entity)
        if index is None:
            # Entity is not a token of the fitted vocabulary.
            file.write(f"Entity: {entity}, TF-IDF Score: Not Found\n")
        else:
            tfidf_score = tfidf_matrix[0, index]  # row 0 = first document
            file.write(f"Entity: {entity}, TF-IDF Score: {tfidf_score}\n")
print(vocabulary)

print("TF-IDF analysis completed. Results saved to", output_file)


{'suas': 950, 'competition': 178, 'software': 910, 'team': 982, 'fall': 359, 'two': 1042, 'thousand': 1009, 'and': 58, 'twenty': 1041, 'three': 1010, 'final': 372, 'report': 818, 'authors': 82, 'marc': 593, 'cruz': 234, 'abdul': 14, 'kalam': 530, 'syed': 964, 'max': 596, 'gross': 435, 'joshua': 525, 'estrada': 341, 'jason': 521, 'mar': 592, 'josh': 524, 'ng': 648, 'ethan': 342, 'tarrer': 976, 'sarkis': 857, 'gafayan': 414, 'rubayet': 848, 'mujahid': 629, 'david': 247, 'jackson': 520, 'status': 936, 'done': 299, 'date': 246, 'sep': 878, 'one': 680, 'relative': 812, 'links': 567, 'overview': 706, 'technical': 984, 'design': 263, 'document': 294, 'machine': 580, 'learning': 551, 'models': 624, 'introduction': 514, 'progress': 763, 'timeline': 1016, 'recruitment': 798, 'august': 81, 'september': 879, 'four': 400, 'phase': 723, 'october': 672, 'hundred': 460, 'thirty': 1005, 'odlc': 674, 'obstacle': 668, 'avoidance': 89, 'five': 380, 'hardware': 439, 'trade': 1026, 'studies': 948, 'seven': 