In [275]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import random
import pandas as pd

# Document data
documents = {
    "D1": ["bird", "cat", "cat", "dog", "dog", "bird", "tiger", "tiger"],
    "D2": ["cat", "tiger", "cat", "dog"],
    "D3": ["dog", "bird", "bird", "cat"],
    "D4": ["cat", "tiger", "cat", "dog"],
    "D5": ["tiger", "tiger", "dog", "tiger"],
    "D6": ["cat", "cat", "tiger", "tiger"],
    "D7": ["bird", "cat", "bird"],
    "D8": ["dog", "cat", "bird"],
    "D9": ["cat", "dog", "tiger"],
    "D10": ["tiger", "tiger", "tiger"],
    "D11": ["bird"] * 10 + ["cat"] * 9 + ["dog"] * 8 + ["tiger"] * 7,
    "D12": ["bird"] * 23 + ["cat"] * 11 + ["dog"] * 8 + ["tiger"] * 1,
    "D13": ["bird"] * 0 + ["cat"] * 0 + ["dog"] * 11 + ["tiger"] * 25,
    "D14": ["bird"] * 1 + ["cat"] * 2 + ["dog"] * 4 + ["tiger"] * 3,
    "D15": ["bird"] * 8 + ["cat"] * 11 + ["dog"] * 3 + ["tiger"] * 1,
}
words = ["bird", "cat", "dog", "tiger"]

# Convert documents to strings for TF-IDF vectorizer
document_strings = [" ".join(documents[f"D{i}"]) for i in range(1, 16)]

# Create and normalize the TF-IDF matrix
vectorizer = TfidfVectorizer(vocabulary=words)
tfidf_matrix = vectorizer.fit_transform(document_strings).toarray()
normalized_tfidf_matrix = normalize(tfidf_matrix, norm='l2')


def latent_semantic_indexing(documents_matrix, k=4):
    """Perform Latent Semantic Indexing using manual SVD."""
    # Calculate the SVD of the matrix
    U, S, Vt = np.linalg.svd(documents_matrix.T, full_matrices=False)

    # Truncate the matrices to keep only the top k components
    U_k = U[:, :k]  # term-concept matrix
    S_k = np.diag(S[:k])  # singular values matrix
    Vt_k = Vt[:k, :]  # concept-document matrix
    # Calculate the reduced matrix
    reduced_matrix = np.dot(np.dot(U, S_k), Vt_k)
    return reduced_matrix, U_k, S_k, Vt_k


def calculate_cosine_similarity(query_vector, document_matrix):
    """Calculate cosine similarity between query vector and document matrix."""
    similarities = []
    for doc_vector in document_matrix.T:
        query_vector = query_vector.reshape(-1)
        norm_query = np.linalg.norm(query_vector)
        norm_doc = np.linalg.norm(doc_vector)
        if norm_query == 0 or norm_doc == 0:
            similarity = 0
        else:
            similarity = np.dot(query_vector, doc_vector) / \
                (norm_query * norm_doc)
        similarities.append(similarity)
    return similarities


def calculate_euclidean_distance(query_vector, document_matrix):
    """Calculate Euclidean distance between query vector and document matrix."""
    distances = []
    for doc_vector in document_matrix.T:
        distance = np.linalg.norm(query_vector - doc_vector)
        distance = 1 / (1 + distance)  # Convert distance to similarity
        distances.append(distance)
    return distances


def calculate_pearson_correlation(query_vector, document_matrix):
    """Calculate Pearson correlation between query vector and document matrix."""
    correlations = []
    for doc_vector in document_matrix.T:
        if np.std(query_vector) == 0 or np.std(doc_vector) == 0:
            correlation = 0
        else:
            correlation = np.corrcoef(query_vector, doc_vector)[0, 1]
        correlations.append(correlation)
    return correlations


# Perform latent semantic indexing on both the original and normalized TF-IDF matrices
lsi_matrix, U_k, S_k, Vt_k = latent_semantic_indexing(tfidf_matrix)
normalized_lsi_matrix = latent_semantic_indexing(normalized_tfidf_matrix)


def project_query_to_reduced_space(query_counts, U_k, S_k):
    """Project the query vector into the reduced space."""
    query_vector = np.array([query_counts.get(word, 0) for word in words])
    query_vector = query_vector.reshape(-1, 1)  # Change shape to (4, 1)
    query_vector_reduced = np.dot(
        np.dot(query_vector.T, U_k), np.linalg.inv(S_k))
    print(query_vector_reduced)
    return query_vector_reduced


def get_cosine_similarity_scores(query_counts):
    query_vector_reduced = project_query_to_reduced_space(
        query_counts, U_k, S_k)
    similarities = calculate_cosine_similarity(query_vector_reduced.T, Vt_k)
    sorted_similarities = sorted(zip(
        [f'Doc{i+1}' for i in range(len(similarities))], similarities), key=lambda x: x[1], reverse=True)
    similarities = zip([f'Doc{i+1}' for i in range(len(similarities))], similarities)
    return similarities


def get_euclidean_distance_scores(query_counts):
    query_vector_reduced = project_query_to_reduced_space(
        query_counts, U_k, S_k)
    distances = calculate_euclidean_distance(query_vector_reduced, Vt_k)
    sorted_distances = sorted(
        zip([f'Doc{i+1}' for i in range(len(distances))], distances), key=lambda x: x[1], reverse=True)
    return sorted_distances


def get_pearson_correlation_scores(query_counts):
    query_vector_reduced = project_query_to_reduced_space(
        query_counts, U_k, S_k)
    correlations = calculate_pearson_correlation(
        query_vector_reduced, Vt_k)
    sorted_correlations = sorted(zip(
        [f'Doc{i+1}' for i in range(len(correlations))], correlations), key=lambda x: x[1], reverse=True)
    return sorted_correlations

In [276]:
U_k_pd = pd.DataFrame(U_k, index=words)
U_k_pd

Unnamed: 0,0,1,2,3
bird,-0.479124,0.682384,0.533223,-0.143056
cat,-0.57079,0.108358,-0.741555,-0.33549
dog,-0.440793,-0.064758,-0.073828,0.89222
tiger,-0.500341,-0.720012,0.400397,-0.266316


In [277]:
S_k_pd = pd.DataFrame(S_k)
S_k_pd

Unnamed: 0,0,1,2,3
0,3.089494,0.0,0.0,0.0
1,0.0,1.931194,0.0,0.0
2,0.0,0.0,1.081811,0.0
3,0.0,0.0,0.0,0.745119


In [278]:
Vt_k_pd = pd.DataFrame(Vt_k, columns=documents.keys())
Vt_k_pd

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
0,-0.31899,-0.275211,-0.246672,-0.275211,-0.198756,-0.245155,-0.210932,-0.27533,-0.282541,-0.161949,-0.31236,-0.24239,-0.205695,-0.293878,-0.273786
1,0.051047,-0.120085,0.318254,-0.120085,-0.364304,-0.223957,0.350024,0.251312,-0.20222,-0.372833,0.117188,0.320951,-0.354764,-0.122627,0.247357
2,0.119622,-0.43645,0.179352,-0.43645,0.329543,-0.222992,0.214917,-0.057555,-0.221474,0.370117,0.120809,0.214596,0.311289,0.019745,-0.144035
3,0.064244,-0.024696,0.082984,-0.024696,0.039585,-0.571105,-0.340418,0.258171,0.225026,-0.357414,0.027285,-0.045246,0.155101,0.463189,-0.240268


In [279]:
cosine_similarities = get_cosine_similarity_scores(
    {"bird": 2, "cat": 8, "dog": 1, "tiger": 1})
cosine_similarities_pd = pd.DataFrame(cosine_similarities, columns=["Document", "Similarity"])
cosine_similarities_pd

[[-2.09280157  0.74920757 -4.19613569 -3.14597698]]


Unnamed: 0,Document,Similarity
0,Doc1,0.000884
1,Doc2,0.792845
2,Doc3,-0.101363
3,Doc4,0.792845
4,Doc5,-0.450729
5,Doc6,0.774923
6,Doc7,0.267106
7,Doc8,0.074436
8,Doc9,0.247253
9,Doc10,-0.098807
