# In [6]:
#-------------------------------------------------------------------------
# AUTHOR: Mahakbhai Patel
# FILENAME: indexing.py
# SPECIFICATION: This program reads a file collection.csv, processes the text by removing stopwords,
#                performs stemming, identifies index terms, and calculates the TF-IDF document-term matrix.
# FOR: CS 5180 - Assignment #1
# TIME SPENT: how long it took you to complete the assignment
#-------------------------------------------------------------------------

# Importing necessary libraries
import csv
import math
from collections import Counter

# Step 1: Reading the documents from a CSV file.
# The file is opened with newline='' as the csv module requires, so that
# quoted fields containing line breaks are parsed correctly.
documents = []
with open('collection.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip the header row; tolerate a completely empty file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for blank lines; there is no
            # first column to read, so skip them instead of raising IndexError.
            continue
        documents.append(row[0])  # Document text in the first column

# Step 2: Stopword removal (pronouns, conjunctions, articles).
# A set is used so each membership check is a constant-time lookup.
stopWords = {
    "i", "she", "he", "they", "her", "their",
    "and", "is", "the", "a", "an",
}


def remove_stopwords(doc):
    """Lowercase *doc*, split it on whitespace, and drop every stopword.

    Args:
        doc: Raw document text.

    Returns:
        List of the remaining lowercase tokens, in their original order.
    """
    kept = []
    for token in doc.lower().split():
        if token in stopWords:
            continue
        kept.append(token)
    return kept

# Step 3: Simple dictionary-based stemming.
# Each known word variation maps to its stem; unknown words are left alone.
stemming = {
    "cats": "cat",
    "dogs": "dog",
    "loves": "love",
    "love": "love",
}


def apply_stemming(words):
    """Replace each token with its stem from ``stemming`` when one exists.

    Args:
        words: Iterable of tokens.

    Returns:
        New list of the same length; tokens without an entry in ``stemming``
        are passed through unchanged.
    """
    stems = []
    for token in words:
        stems.append(stemming[token] if token in stemming else token)
    return stems

# Step 4: Preprocess every document: stopword removal first, then stemming.
# Each entry of processed_documents is the token list of one document.
processed_documents = [
    apply_stemming(remove_stopwords(text)) for text in documents
]

print("Processed Documents:", processed_documents)

# Step 5: Identifying the index terms (vocabulary)
# The terms are hard-coded rather than derived from the documents; they are the
# three stems that the `stemming` table above maps to. Their order fixes the
# column order of the document-term matrix built below.
terms = ['love', 'cat', 'dog']  # Fixed order of terms for TF-IDF calculation

# Step 6: Calculating TF (Term Frequency), IDF (Inverse Document Frequency), and TF-IDF
# N is the total number of preprocessed documents; compute_idf divides it by
# the number of documents that contain a term.
N = len(processed_documents)

# Function to calculate term frequency (TF)
def compute_tf(term, doc):
    """Return the length-normalized frequency of *term* in *doc*.

    Args:
        term: Index term to count.
        doc: List of tokens of one preprocessed document.

    Returns:
        Occurrences of ``term`` divided by the number of tokens in ``doc``,
        or 0.0 when ``doc`` has no tokens.
    """
    if not doc:
        # A document with no tokens left (empty text, or stopwords only)
        # would otherwise raise ZeroDivisionError.
        return 0.0
    return doc.count(term) / len(doc)  # Normalized term frequency

# Function to calculate inverse document frequency (IDF)
def compute_idf(term):
    """Return log10(N / df) for *term* over ``processed_documents``.

    ``df`` is the number of preprocessed documents that contain the term and
    ``N`` is the module-level document count. Returns 0 when no document
    contains the term.
    """
    containing = 0
    for tokens in processed_documents:
        if term in tokens:
            containing += 1
    if containing <= 0:
        return 0
    return math.log10(N / containing)

# Debug output: one line per index term with its IDF, to three decimals.
# map() is lazy, so each IDF is still computed right before it is printed.
for term, idf in zip(terms, map(compute_idf, terms)):
    print(f"IDF for {term}: {idf:.3f}")

# Step 7: Constructing the TF-IDF document-term matrix
# IDF depends only on the term, not on the document being scored, so it is
# computed once per term here instead of once per (document, term) pair; each
# compute_idf call scans the whole collection.
idf_per_term = [compute_idf(term) for term in terms]

# One row per document, one column per entry of `terms` (same order).
docTermMatrix = [
    [compute_tf(term, doc) * idf for term, idf in zip(terms, idf_per_term)]
    for doc in processed_documents
]

# Step 8: Printing the TF-IDF document-term matrix
# The column headers come from `terms` and the row labels (d1, d2, ...) from
# the matrix itself, so the table always matches the data: no IndexError with
# fewer than three documents and no silently dropped rows with more.
print("\nTF-IDF Document-Term Matrix:")
print("".join([f"{'':<15}"] + [f"{term:<10}" for term in terms]))  # Term headers
for doc_number, tfidf_row in enumerate(docTermMatrix, start=1):
    doc_label = f"d{doc_number}"
    row = [f"{doc_label:<15}"] + \
        [f"{value:.2f}".ljust(10) for value in tfidf_row]
    print("".join(row))


# Output:
# Processed Documents: [['love', 'cat', 'cat'], ['love', 'dog'], ['love', 'dog', 'cat']]
# IDF for love: 0.000
# IDF for cat: 0.176
# IDF for dog: 0.176
#
# TF-IDF Document-Term Matrix:
#                love      cat       dog
# d1             0.00      0.12      0.00
# d2             0.00      0.00      0.09
# d3             0.00      0.06      0.06


# New section