In [19]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score

# Step 1: Parse the dataset
def parse_cisi_dataset(file_path):
    with open(file_path, "r") as file:
        content = file.read()
    # Split by document identifiers
    documents = re.split(r'\.I \d+', content)[1:]  # Skip the first empty split
    parsed_docs = []
    for doc in documents:
        # Extract relevant fields (Title, Author, and Abstract/Content)
        title = re.search(r'\.T(.*?)\.A', doc, re.DOTALL)
        author = re.search(r'\.A(.*?)\.W', doc, re.DOTALL)
        abstract = re.search(r'\.W(.*)', doc, re.DOTALL)
        title_text = title.group(1).strip() if title else ""
        author_text = author.group(1).strip() if author else ""
        abstract_text = abstract.group(1).strip() if abstract else ""
        full_text = f"{title_text} {author_text} {abstract_text}".strip()
        parsed_docs.append(full_text)
    return parsed_docs

# Step 2: Preprocess and create the TF-IDF matrix
def create_tfidf_matrix(documents):
    # Adjusting the stop words and min_df to avoid empty vocabulary error
    vectorizer = TfidfVectorizer(stop_words=None, min_df=1)  # No stop words removal, min_df=1
    X_tfidf = vectorizer.fit_transform(documents)
    return X_tfidf, vectorizer

# Step 3: Apply Truncated SVD (LSA)
def apply_lsa(tfidf_matrix, n_components=100):
    svd = TruncatedSVD(n_components=n_components)
    X_lsa = svd.fit_transform(tfidf_matrix)
    return X_lsa, svd

# Step 4: Retrieve relevant documents based on cosine similarity
def retrieve_documents(query, tfidf_vectorizer, svd_model, lsa_matrix, top_n=5):
    query_tfidf = tfidf_vectorizer.transform([query])
    query_lsa = svd_model.transform(query_tfidf)  # Project query into LSA space
    similarities = cosine_similarity(query_lsa, lsa_matrix)  # Cosine similarity
    ranked_indices = np.argsort(similarities[0])[::-1][:top_n]  # Top N most similar documents
    return ranked_indices, similarities[0][ranked_indices]

# Step 5: Evaluate precision and recall
def compute_precision_recall(retrieved_docs, ground_truth_relevant_docs):
    # Create binary relevance for precision/recall calculation
    y_true = [1 if doc_id in ground_truth_relevant_docs else 0 for doc_id in retrieved_docs]
    y_pred = [1] * len(retrieved_docs)  # All retrieved documents are treated as relevant
    # Compute precision and recall
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return precision, recall

# Load and parse the dataset
file_path = "CISI.ALL"
documents = parse_cisi_dataset(file_path)

# Debug: Check the first few parsed documents
for i, doc in enumerate(documents[:3]):  # Print the first 3 documents
    print(f"Document {i+1}: {doc[:200]}...\n{'-'*80}")

# Create the TF-IDF matrix
X_tfidf, tfidf_vectorizer = create_tfidf_matrix(documents)

# Apply LSA using Truncated SVD
X_lsa, svd_model = apply_lsa(X_tfidf)

# Sample query
query = "history of Dewey Decimal Classification"

# Retrieve top 5 most relevant documents
top_indices, scores = retrieve_documents(query, tfidf_vectorizer, svd_model, X_lsa)

# Ground truth relevance for the sample query (for example purposes, replace with actual labels)
# Assuming that documents 1, 2, and 5 are relevant
ground_truth_relevant_docs = [1, 2, 5]

# Compute precision and recall
precision, recall = compute_precision_recall(top_indices, ground_truth_relevant_docs)


Document 1: 18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal
Classification.  The first edition of the DDC was published
in 1876, the eighteenth ...
--------------------------------------------------------------------------------
Document 2: Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use
in 104 technical libraries in the United Kingdom.
Library use is only one aspect of the wider pattern of
infor...
--------------------------------------------------------------------------------
Document 3: Two Kinds of Power
An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings
and the organization and control of knowledge and information will
in...
--------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:

# Display the results
print("Top 5 most relevant documents for the query:")
for i, index in enumerate(top_indices):
    print(f"Document {index + 1}: (Score: {scores[i]})")
    print(documents[index][:200] + "...")  # Print first 200 characters of the document for context
    print("-" * 80)

# Print precision and recall
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Top 5 most relevant documents for the query:
Document 260: (Score: 0.7782989082218531)
Classification Practice in Britain.  Report on a survey of classification
opinion and practice in Great Britain, with particular reference to the Dewey
Decimal Classification Davison, K. The objective...
--------------------------------------------------------------------------------
Document 1: (Score: 0.7017359622899058)
18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal
Classification.  The first edition of the DDC was published
in 1876, the eighteenth ...
--------------------------------------------------------------------------------
Document 1074: (Score: 0.6548754733289364)
The DK (Decimal Classification) - a Multi-Faceted Classification Dahlberg, I. Backed up by numerical data  derived from an ASLIB analysis of the planned
world-wide system of UNISIST, the author critic...
--------------------------------------------------------

In [29]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Load dataset
def load_cisi_dataset(file_path):
    with open(file_path, 'r') as file:
        data = file.read().split('.I ')
    
    docs = []
    for doc in data[1:]:
        doc_lines = doc.strip().splitlines()
        doc_id = doc_lines[0].strip()
        doc_content = ' '.join(doc_lines[2:]).strip()
        docs.append({'id': doc_id, 'content': doc_content})
    
    return pd.DataFrame(docs)

# Load CISI dataset
file_path = 'CISI.ALL'
df = load_cisi_dataset(file_path)

# Preprocess the data and remove stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Apply TF-IDF to the document corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

# Reduce the dimensionality with LSA (SVD)
lsa = TruncatedSVD(n_components=100, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

# Get user query and process it similarly to the document corpus
def process_query(query):
    query_tfidf = tfidf_vectorizer.transform([query])
    query_lsa = lsa.transform(query_tfidf)
    return query_lsa

# Function to rank documents based on cosine similarity
def rank_documents(query_lsa, lsa_matrix, df):
    # Compute cosine similarity between the query and all document vectors
    cosine_similarities = cosine_similarity(query_lsa, lsa_matrix)
    similarity_scores = cosine_similarities.flatten()
    
    # Get the top N document indices sorted by similarity
    top_n_idx = similarity_scores.argsort()[::-1]
    top_n_scores = similarity_scores[top_n_idx]
    
    # Retrieve and print top N most relevant documents
    print("Top 5 most relevant documents for the query:")
    for idx in range(5):
        doc_id = df.iloc[top_n_idx[idx]]['id']
        doc_content = df.iloc[top_n_idx[idx]]['content'][:300]  # Show first 300 characters
        score = top_n_scores[idx]
        print(f"Document {doc_id}: (Score: {score})")
        print(f"{doc_content}...\n{'-'*80}")

# Precision and Recall Calculation (dummy relevance judgments for example purposes)
def evaluate_precision_recall(retrieved_docs, relevant_docs):
    y_true = np.array([1 if doc in relevant_docs else 0 for doc in retrieved_docs])
    y_pred = np.array([1] * len(retrieved_docs))  # Assume all retrieved docs are relevant for now
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Example query
query = "Dewey Decimal Classification system"

# Process the query and rank documents
query_lsa = process_query(query)
rank_documents(query_lsa, lsa_matrix, df)

# For evaluation, assume we have a list of relevant document IDs for the query
relevant_docs = ['260', '1']  # These are hypothetical relevant document IDs
retrieved_docs = df['id'].iloc[:5].tolist()  # Top 5 retrieved documents

# Evaluate precision and recall
evaluate_precision_recall(retrieved_docs, relevant_docs)


Top 5 most relevant documents for the query:
Document 260: (Score: 0.8484623239364907)
Classification Practice in Britain.  Report on a survey of classification opinion and practice in Great Britain, with particular reference to the Dewey Decimal Classification .A Davison, K. .W   The objectives of the Sub-Committee in starting their enquiries were basically three-fold      1) To gath...
--------------------------------------------------------------------------------
Document 1074: (Score: 0.8273151825699409)
The DK (Decimal Classification) - a Multi-Faceted Classification .A Dahlberg, I. .W   Backed up by numerical data  derived from an ASLIB analysis of the planned world-wide system of UNISIST, the author critically investigates the claimed university of the U.D.C.  According to it, the so-called "Univ...
--------------------------------------------------------------------------------
Document 1141: (Score: 0.7985857718343811)
Algebra of Classification .A Shreider, Yu. A. .W    Two a