### <center> TP: Information Retrieval Models<br> (Term Document Matrix and Vector Space Model) <br> KRY SENGHORT <br> ID: e20200706 <br> Group I4-AMS-B </center>

#### Problem 1: Create a Term Document Matrix (TDM)

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents: a small corpus of three sentences, one string per
# document. List order fixes the row order of the matrix built below.
documents = [
    "Data science combines statistics, computer science, and domain knowledge.",
    "Machine learning algorithms can analyze large datasets and make predictions.",
    "Data visualization helps in interpreting complex data and communicating insights."
]

# Function to create a Term Document Matrix (TDM)
def create_tdm(documents):
    """Build a table of raw term counts for a collection of texts.

    Args:
        documents: Iterable of strings, one per document.

    Returns:
        A pandas DataFrame with one row per document (default integer
        index, in input order) and one column per vocabulary term found by
        ``CountVectorizer``. Each cell holds how many times that term
        occurs in that document.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(documents)
    vocabulary = counter.get_feature_names_out()

    # The vectorizer returns a sparse matrix; densify it so pandas can
    # present it as an ordinary table.
    return pd.DataFrame(counts.toarray(), columns=vocabulary)

# Build the Term Document Matrix for the corpus and print a heading for it.
tdm = create_tdm(documents)
print("Term Document Matrix (TDM):")
# Bare expression as the last statement of the cell: the notebook renders the
# DataFrame as the cell output (the table recorded below).
tdm

Term Document Matrix (TDM):


Unnamed: 0,algorithms,analyze,and,can,combines,communicating,complex,computer,data,datasets,...,interpreting,knowledge,large,learning,machine,make,predictions,science,statistics,visualization
0,0,0,1,0,1,0,0,1,1,0,...,0,1,0,0,0,0,0,2,1,0
1,1,1,1,1,0,0,0,0,0,1,...,0,0,1,1,1,1,1,0,0,0
2,0,0,1,0,0,1,1,0,2,0,...,1,0,0,0,0,0,0,0,0,1


#### Problem 2: Visualize the Term Document Matrix

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents: the same three-sentence corpus, redefined here so this
# cell runs on its own. One string per document; list order fixes row order.
documents = [
    "Data science combines statistics, computer science, and domain knowledge.",
    "Machine learning algorithms can analyze large datasets and make predictions.",
    "Data visualization helps in interpreting complex data and communicating insights."
]

# Function to create a Term Document Matrix (TDM)
def create_tdm(documents):
    """Build a labelled table of raw term counts for a collection of texts.

    Args:
        documents: Sized collection of strings, one per document.

    Returns:
        A pandas DataFrame whose rows are labelled "Document 1",
        "Document 2", ... in input order and whose columns are the
        vocabulary terms found by ``CountVectorizer``. Each cell holds how
        many times that term occurs in that document.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(documents)

    # Human-readable, 1-based row labels instead of the default 0..n-1 index.
    row_labels = [f"Document {number}" for number in range(1, len(documents) + 1)]

    return pd.DataFrame(
        counts.toarray(),
        index=row_labels,
        columns=counter.get_feature_names_out(),
    )

# Build the labelled Term Document Matrix and print a heading for it.
tdm = create_tdm(documents)
print("Term Document Matrix (TDM):")
# Bare expression as the last statement of the cell: the notebook renders the
# DataFrame as the cell output (the table recorded below).
tdm

Term Document Matrix (TDM):


Unnamed: 0,algorithms,analyze,and,can,combines,communicating,complex,computer,data,datasets,...,interpreting,knowledge,large,learning,machine,make,predictions,science,statistics,visualization
Document 1,0,0,1,0,1,0,0,1,1,0,...,0,1,0,0,0,0,0,2,1,0
Document 2,1,1,1,1,0,0,0,0,0,1,...,0,0,1,1,1,1,1,0,0,0
Document 3,0,0,1,0,0,1,1,0,2,0,...,1,0,0,0,0,0,0,0,0,1


#### Problem 3: Implement TF-IDF

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the documents
# Same three-sentence corpus as in the previous problems, one string per
# document; redefined so this cell runs on its own.
documents = [
    "Data science combines statistics, computer science, and domain knowledge.",
    "Machine learning algorithms can analyze large datasets and make predictions.",
    "Data visualization helps in interpreting complex data and communicating insights."
]

# Function to calculate TF-IDF for each term in each document
def calculate_tfidf(documents):
    """Compute the TF-IDF weight of every term in every document.

    Args:
        documents: Sized collection of strings, one per document.

    Returns:
        A pandas DataFrame whose rows are labelled "Document 1",
        "Document 2", ... in input order and whose columns are the
        vocabulary terms. Each cell holds the weight produced by a
        default-configured ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(documents)

    # Human-readable, 1-based row labels instead of the default 0..n-1 index.
    row_labels = [f"Document {number}" for number in range(1, len(documents) + 1)]

    return pd.DataFrame(
        weights.toarray(),
        index=row_labels,
        columns=tfidf.get_feature_names_out(),
    )

# Compute the TF-IDF table for the corpus and print a heading for it.
tfidf_matrix = calculate_tfidf(documents)
print("TF-IDF Matrix:")
# Bare expression as the last statement of the cell: the notebook renders the
# DataFrame as the cell output (the table recorded below).
tfidf_matrix

TF-IDF Matrix:


Unnamed: 0,algorithms,analyze,and,can,combines,communicating,complex,computer,data,datasets,...,interpreting,knowledge,large,learning,machine,make,predictions,science,statistics,visualization
Document 1,0.0,0.0,0.187453,0.0,0.317385,0.0,0.0,0.317385,0.241379,0.0,...,0.0,0.317385,0.0,0.0,0.0,0.0,0.0,0.634769,0.317385,0.0
Document 2,0.327055,0.327055,0.193164,0.327055,0.0,0.0,0.0,0.0,0.0,0.327055,...,0.0,0.0,0.327055,0.327055,0.327055,0.327055,0.327055,0.0,0.0,0.0
Document 3,0.0,0.0,0.190004,0.0,0.0,0.321704,0.321704,0.0,0.489329,0.0,...,0.321704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321704


#### Problem 4: Calculate Cosine Similarity

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Same three-sentence corpus as in the previous problems, one string per
# document; redefined so this cell runs on its own.
documents = [
    "Data science combines statistics, computer science, and domain knowledge.",
    "Machine learning algorithms can analyze large datasets and make predictions.",
    "Data visualization helps in interpreting complex data and communicating insights."
]
# Define the query: free text that the documents are ranked against below.
query = "data science algorithms"

# Function to compute TF-IDF matrix for documents and a query
def compute_tfidf(documents, query):
    """Vectorize the documents and the query in one shared TF-IDF space.

    The query is appended to the documents before fitting, so its terms are
    part of the learned vocabulary and of the statistics the vectorizer is
    fitted on.

    Args:
        documents: List of document strings.
        query: Query string.

    Returns:
        A tuple ``(doc_tfidf, query_tfidf)``: the matrix rows belonging to
        the documents, in input order, and the single row belonging to the
        query.
    """
    corpus = documents + [query]
    weights = TfidfVectorizer().fit_transform(corpus)

    # The query was appended last, so it occupies the final row; everything
    # before it belongs to the documents.
    return weights[:-1], weights[-1]

# Function to compute cosine similarity between the query and documents
def rank_documents_by_similarity(doc_tfidf, query_tfidf):
    """Order the documents from most to least similar to the query.

    Args:
        doc_tfidf: TF-IDF matrix with one row per document.
        query_tfidf: TF-IDF row vector of the query.

    Returns:
        A list of ``(document_number, similarity)`` tuples sorted by
        descending cosine similarity. Document numbers start at 1 and
        follow the row order of ``doc_tfidf``.
    """
    scores = cosine_similarity(query_tfidf, doc_tfidf).flatten()

    def by_score(pair):
        return pair[1]

    # Number the documents from 1 so the ranking lines up with the
    # "Document N" labels used when the results are printed.
    ranking = list(enumerate(scores, 1))
    ranking.sort(key=by_score, reverse=True)
    return ranking

# Vectorize the corpus together with the query, then rank the documents by
# cosine similarity to the query (best match first).
doc_tfidf, query_tfidf = compute_tfidf(documents, query)
ranked_docs = rank_documents_by_similarity(doc_tfidf, query_tfidf)

# Display results: one line per document, scores rounded to four decimals.
print("Ranking of documents based on cosine similarity to the query:")
for doc_num, score in ranked_docs:
    print(f"Document {doc_num}: Similarity Score = {score:.4f}")

Ranking of documents based on cosine similarity to the query:
Document 1: Similarity Score = 0.4459
Document 3: Similarity Score = 0.2110
Document 2: Similarity Score = 0.1610


#### Problem 5: Advanced Query Processing and Cosine Similarity

In [14]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk

# Download the NLTK stopword corpus, then build the two module-level helpers
# that preprocess_text() reads.
nltk.download('stopwords')
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
# One shared stemmer instance, reused for every token.
stemmer = PorterStemmer()

# Define the documents: the same three-sentence corpus as in the previous
# problems, one string per document.
documents = [
    "Data science combines statistics, computer science, and domain knowledge.",
    "Machine learning algorithms can analyze large datasets and make predictions.",
    "Data visualization helps in interpreting complex data and communicating insights."
]

# Define the queries: each one is ranked against all documents below. They
# are kept in raw form here and preprocessed separately.
queries = [
    "data scientist",
    "machine learn",
    "visualization of data"
]

# Function to preprocess text: lowercase, remove punctuation, apply stemming
def preprocess_text(text):
    """Normalize text for TF-IDF: lowercase, drop punctuation and stopwords, stem.

    A token is treated as a stopword if either its punctuation-free form or
    its form with only the surrounding punctuation trimmed is in
    ``stop_words``. The second check is needed because NLTK's English
    stopword list includes contractions such as "don't": once the
    apostrophe is deleted the token becomes "dont", which is not in the
    list and would otherwise slip through as a content word.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, stemmed with the module-level ``stemmer`` and
        joined by single spaces. Empty string when no token survives.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    stems = []
    for raw_token in text.lower().split():
        token = raw_token.translate(punctuation_table)
        if not token:
            # The token consisted solely of punctuation.
            continue
        # Test the trimmed raw token as well so that contractions are
        # recognised before their apostrophe is lost.
        if token in stop_words or raw_token.strip(string.punctuation) in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Preprocess documents and queries with the same function so both end up in
# the same normalized form before they are vectorized together.
preprocessed_documents = [preprocess_text(doc) for doc in documents]
preprocessed_queries = [preprocess_text(query) for query in queries]

# Function to compute TF-IDF and cosine similarity
def compute_tfidf_and_similarity(preprocessed_documents, preprocessed_queries, query_labels=None):
    """Rank the documents against each query by TF-IDF cosine similarity.

    Documents and queries are vectorized together with one
    ``TfidfVectorizer``, so query terms are part of the fitted vocabulary.
    For every query the documents are sorted by descending cosine
    similarity and the ranking is printed.

    Args:
        preprocessed_documents: List of already-preprocessed document strings.
        preprocessed_queries: List of already-preprocessed query strings.
        query_labels: Optional sequence with one display string per query,
            shown in the printed headers (for example the raw query text).
            Defaults to the preprocessed query strings, so the function no
            longer depends on a module-level ``queries`` variable.

    Returns:
        None. The rankings are written to standard output.

    Raises:
        ValueError: If ``query_labels`` is given but its length differs from
            that of ``preprocessed_queries``.
    """
    if query_labels is None:
        query_labels = preprocessed_queries
    elif len(query_labels) != len(preprocessed_queries):
        raise ValueError(
            f"query_labels has {len(query_labels)} entries but there are "
            f"{len(preprocessed_queries)} queries"
        )

    # Combine documents and queries for TF-IDF transformation
    vectorizer = TfidfVectorizer()
    all_texts = preprocessed_documents + preprocessed_queries
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Documents come first in all_texts, so the first n_docs rows are the
    # document vectors and the remaining rows are the query vectors.
    n_docs = len(preprocessed_documents)
    doc_tfidf = tfidf_matrix[:n_docs]
    query_tfidfs = tfidf_matrix[n_docs:]

    # Compute cosine similarity for each query against all documents
    for query_number, (label, query_vector) in enumerate(zip(query_labels, query_tfidfs), 1):
        similarities = cosine_similarity(query_vector, doc_tfidf).flatten()
        # Documents are numbered from 1 to match the "Document N" labels.
        ranked_docs = sorted(enumerate(similarities, 1), key=lambda x: x[1], reverse=True)

        # Print the ranking results for the query
        print(f"\nRanking for Query {query_number} ('{label}'):")
        for doc_num, score in ranked_docs:
            print(f"Document {doc_num}: Similarity Score = {score:.4f}")

# Run the TF-IDF and similarity calculation; the raw queries are passed as
# labels so the headers show what the user typed, not the stemmed form.
compute_tfidf_and_similarity(preprocessed_documents, preprocessed_queries, query_labels=queries)



Ranking for Query 1 ('data scientist'):
Document 3: Similarity Score = 0.2275
Document 1: Similarity Score = 0.0990
Document 2: Similarity Score = 0.0000

Ranking for Query 2 ('machine learn'):
Document 2: Similarity Score = 0.4279
Document 1: Similarity Score = 0.0000
Document 3: Similarity Score = 0.0000

Ranking for Query 3 ('visualization of data'):
Document 3: Similarity Score = 0.5111
Document 1: Similarity Score = 0.1137
Document 2: Similarity Score = 0.0000


[nltk_data] Downloading package stopwords to C:\Users\Rog
[nltk_data]     Strix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### <center> The End !!! </center>