# In [None]:
# 1) Tokenize the documents into words, remove stop words, and conduct stemming 
# 2) Calculate tf-idf for each word in each document and generate document-word matrix (each element in the matrix is the tf-idf score for a word in a document) 
# 3) Calculate pairwise cosine similarity for the documents.


import os
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Directory that is scanned (recursively) for the input text files
data_path = '/Users/munazzahrakhangi/Desktop/Information_Retrieval'

# Blank-line separator placed around the section headings in the output file
text_lines = "\n"

# Containers filled in by the processing steps further down:
#   token_dict             - file name -> lower-cased document text
#   preprocessed_documents - the lower-cased document texts, in load order
#   all_stemmed_words      - stemmed token lists collected per document
token_dict = dict()
preprocessed_documents = list()
all_stemmed_words = list()

# Porter stemmer, plus the tokens to discard: the NLTK English stop words
# together with every ASCII punctuation character
ps = PorterStemmer()
stop_words = set(stopwords.words("english")).union(string.punctuation)

# Open the output file for writing (any existing content is truncated)
output_file = open('/Users/munazzahrakhangi/Desktop/Information_Retrieval/txt_file.txt', mode='w')

# Walk through the data directory and load every .txt file, lower-cased, into
# token_dict (document name -> text) and preprocessed_documents (same order).
#
# The log file held in output_file also lives under data_path and ends in
# ".txt"; it has just been truncated, so it must be skipped or it would enter
# the corpus as an empty document.
log_path = os.path.abspath(output_file.name)

for subdir, dirs, files in os.walk(data_path):
    # Directory listings come back in arbitrary order; sorting keeps the
    # traversal, and therefore the row order of the result matrices, stable
    # from run to run.
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(subdir, file)  # Use os.path.join to create a valid file path
        if not file_path.endswith('.txt'):
            continue
        if os.path.abspath(file_path) == log_path:
            continue

        # Key each document by its path relative to data_path. Using the bare
        # file name would let same-named files in different subdirectories
        # overwrite each other in token_dict while still being appended to
        # preprocessed_documents, leaving the two out of step.
        doc_name = os.path.relpath(file_path, data_path)

        with open(file_path, 'r', encoding='utf-8') as file_contents:
            lowered = file_contents.read().lower()

        token_dict[doc_name] = lowered
        # Append the preprocessed text to the list
        preprocessed_documents.append(lowered)

# Loop through token_dict; for each document, log every preprocessing stage
# to the output file exactly once and collect its stemmed tokens.
for file, text in token_dict.items():

    # Tokenize the text
    words = word_tokenize(text)

    # Write the tokenized content to the output file
    output_file.write(f"{text_lines}Sentence tokenizing {file},\n{text_lines}")
    output_file.write(str(sent_tokenize(text)) + "\n")

    output_file.write(f"{text_lines}Word Tokenization {file},\n{text_lines}")
    output_file.write(str(words) + "\n")

    # Remove stop words. The filter must test against stop_words (which also
    # holds the punctuation characters); testing against the result list
    # itself would only drop duplicate tokens.
    no_stop_words = [w for w in words if w not in stop_words]
    output_file.write(f"{text_lines}Stop words removed from {file}.\n{text_lines}")
    output_file.write(str(no_stop_words) + "\n")

    # Stemming
    stemmed_words = [ps.stem(w) for w in no_stop_words]
    output_file.write(f"{text_lines}Stemming {file}.\n{text_lines}")
    output_file.write(str(stemmed_words) + "\n")

    # One entry per document, added after the list is complete.
    all_stemmed_words.append(stemmed_words)
    
# Nothing below writes to the log file, so close it now: a failure in the
# vectorization or CSV steps then cannot leave it open with unflushed data.
output_file.close()

# TF-IDF vectorizer
# - stop_words is passed as a list because scikit-learn's parameter
#   validation accepts only 'english', a list, or None (a set is rejected).
# - token_pattern=None states explicitly that the custom tokenizer is used,
#   which avoids the "token_pattern will not be used" warning.
# NOTE(review): the matrix is built from the lower-cased raw documents, not
# from the stemmed tokens collected in all_stemmed_words - confirm whether
# stemming was meant to feed into the TF-IDF step.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=sorted(stop_words),
    tokenizer=word_tokenize,
    token_pattern=None,
    use_idf=True,
)

# Rows follow the order of preprocessed_documents; columns are the vocabulary.
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Save the TF-IDF matrix to a CSV file (without the row index).
tfidf_df.to_csv('/Users/munazzahrakhangi/Desktop/Information_Retrieval/tfidf_matrix.csv', index=False)

# Calculate pairwise cosine similarity between all document rows
cosine_similarities = cosine_similarity(tfidf_matrix)

# Label both axes with the document names; token_dict preserves the order in
# which the documents were loaded.
document_names = list(token_dict)
cosine_sim_df = pd.DataFrame(cosine_similarities, columns=document_names, index=document_names)

# Save the cosine similarity matrix to a CSV file.
cosine_sim_df.to_csv('/Users/munazzahrakhangi/Desktop/Information_Retrieval/cosine_similarity_matrix.csv')
