In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

def extract_keywords(file_path, output_file, top_n=5, sort_range=None):
    # Read the text from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Create a CountVectorizer to compute term frequencies
    count_vectorizer = CountVectorizer(stop_words='english')
    word_counts = count_vectorizer.fit_transform([text])

    # Get feature names (terms)
    feature_names = np.array(count_vectorizer.get_feature_names_out())

    # Compute term frequencies
    term_frequencies = word_counts.toarray().flatten()

    # Get indices to sort
    if sort_range is not None:
        sorted_indices = np.argsort(term_frequencies)[::-1][sort_range[0]:sort_range[1]]
    else:
        sorted_indices = np.argsort(term_frequencies)[::-1]

    # Get top N keywords
    top_keywords = feature_names[sorted_indices][:top_n]
    
    # Get frequency of each word
    word_frequencies = term_frequencies[sorted_indices][:top_n]

    # Write the results to the output file
    with open(output_file, 'w', encoding='utf-8') as out_file:
        out_file.write("Top Keywords with Frequencies:\n")
        for keyword, frequency in zip(top_keywords, word_frequencies):
            out_file.write(f"Keyword: {keyword}, Frequency: {frequency}\n")

# Example usage
file_path = "cleaned_datasets/cleaned_SUAS_final_report.txt"  # Change this to the path of your text file
output_file = "cleaned_datasets/TD_IDF_Analysis_Output.txt"  # Change this to the desired output file path
extract_keywords(file_path, output_file, top_n=50, sort_range=(0, 50))

print("Results written to output_keywords.txt")

Results written to output_keywords.txt
