In [9]:
#import the neccessary libraries which are required
import re
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from collections import Counter

# dataset
quaries = pd.read_csv('/kaggle/input/quora-doc/queries.csv')
document = pd.read_csv('/kaggle/input/quora-doc/docs.csv')
query_eval = pd.read_csv('/kaggle/input/quora-doc/qdrel.csv')

# Task 1 

*  Preprocess the docs and queries – remove characters other than alphanumeric or whitespaces.
*  Correct spelling in the queries and documents using SpaCy. Only for each query with some correction, print the original and corrected query in separate lines, followed by two newlines (\n).
*  Tokenize the words in the documents using spacy. Remove all words that occur in less than 5 documents or more than 85% of the documents.
*  For each query, find the cosine similarity of its vector with that of the documents. Use this to find the top 5 and top 10 most similar documents.
*  Calculate the Precision@k scores: report P@1, P@5 and P@10 averaged over all queries

In [10]:
#Task 1.1 Preprocess the Docs and Queries
def clean(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
quaries['query_text'] = quaries['query_text'].apply(clean)
document['doc_text'] = document['doc_text'].apply(clean)
print(quaries['query_text'].head())
# print(docs_df)

0                   How can ask questions using photos
1    What is Atal Pension Yojana What are its benefits
2          Where is starch digested How is it digested
3          What is a conjecture What are some examples
4    What can India do to support the people suffer...
Name: query_text, dtype: object


In [30]:
#Task1.2 using spacy correct the spelling if there is any misspell
# print original and corrected query
# define the spacy in english
token = spacy.load("en_core_web_sm")
def correct(text):
    doc = token(text)
    corrected =" ".join(token.text for token in doc)    
    if(doc.text!=corrected):
        print("In Dataset",text)
        print("After spell correction",text)
    return corrected
quaries['query_text'] = quaries['query_text'].apply(correct)

In [12]:
#1.3
all_tokens = []
# tokenize all doc using 
for row in document['doc_text']:
    for t in token(row):
        all_tokens.append(t.text.lower())    
# Count frequency of word and remove from each document having < 5
#  or more than 85% of the documents
word_freq = Counter(all_tokens)
min_freq,doc_freq = 5,0.85
filtered_tokens=[]
for token, count in word_freq.items():
    if min_freq <= count <= len(document) * doc_freq:
        filtered_tokens.append(token)

In [13]:
# Tokenize document and quaries 
token = spacy.load("en_core_web_sm")
tokenized_docs = []
for row in document['doc_text']:
    tokenized_docs.append([tok.text.lower() for tok in token(row) if tok.text.lower() in filtered_tokens])
tokenized_queries = []
for query_text in quaries['query_text']:
    tokenized_queries.append([tok.text.lower() for tok in token(query_text) if tok.text.lower() in filtered_tokens])
    
# for each query and document create tf-idf vector
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])
doc_matrix = matrix[:len(document)]
queries_matrix = matrix[len(document):]

In [14]:
# 1.4 find the cosine similarity of each query
cosine_similarities = cosine_similarity(queries_matrix, doc_matrix)
# for each query find the top 5 and top 10 queries from document
i=0
top5=[]
top10=[]
for similarities in cosine_similarities:
    top5.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:5]])
    top10.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:10]])
    print(f"\nTop 5 documents for Query {quaries.loc[i]['query_id']}: {top5[-1]}")
    print(f"Top 10 documents for Query {quaries.loc[i]['query_id']}: {top10[-1]}")
    i+=1  


Top 5 documents for Query 4584: [1377, 45, 4412, 2603, 1782]
Top 10 documents for Query 4584: [1377, 45, 4412, 2603, 1782, 4583, 9179, 2366, 4411, 2602]

Top 5 documents for Query 6588: [7908, 428, 5065, 4108, 1916]
Top 10 documents for Query 6588: [7908, 428, 5065, 4108, 1916, 2212, 2211, 2328, 2329, 7129]

Top 5 documents for Query 10113: [10319, 3433, 3440, 3439, 3438]
Top 10 documents for Query 10113: [10319, 3433, 3440, 3439, 3438, 3437, 3436, 3435, 3434, 3432]

Top 5 documents for Query 7957: [10274, 2213, 10298, 2949, 7956]
Top 10 documents for Query 7957: [10274, 2213, 10298, 2949, 7956, 7582, 10299, 2214, 6327, 6326]

Top 5 documents for Query 5498: [9517, 252, 8399, 3392, 575]
Top 10 documents for Query 5498: [9517, 252, 8399, 3392, 575, 574, 5456, 6891, 8938, 272]

Top 5 documents for Query 7614: [7613, 5107, 5106, 140, 2178]
Top 10 documents for Query 7614: [7613, 5107, 5106, 140, 2178, 9758, 280, 3187, 6040, 4222]

Top 5 documents for Query 7301: [7302, 1086, 8576, 1909, 

In [15]:
# Dict of relevent documents for evalution purpose
relevant_documents_dict =  query_eval.set_index('query_id')['doc_id'].to_dict()

# Calculate Precision@k for each query
def precision_at_k(query_id, top_k):
    relevant_doc = relevant_documents_dict.get(query_id, None)
    if relevant_doc is not None:
        return int(relevant_doc in top_k)
    return 0

# Calculate Precision@1, Precision@5, and Precision@10 for each query
precision_at_1 = [precision_at_k(quaries.loc[query_id]['query_id'], z[:1]) for query_id, z in enumerate(top5)]
precision_at_5 = [precision_at_k(quaries.loc[query_id]['query_id'], x) for query_id, x in enumerate(top5)]
precision_at_10 = [precision_at_k(quaries.loc[query_id]['query_id'], y) for query_id, y in enumerate(top10)]
# Average Precision@k over all queries
avg_precision_at_1 = sum(precision_at_1) / len(precision_at_1)
avg_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
avg_precision_at_10 = sum(precision_at_10) / len(precision_at_10)

# Report Precision@k scores
print(f"Avg Precision@1: {avg_precision_at_1}")
print(f"Avg Precision@5: {avg_precision_at_5}")
print(f"Avg Precision@10: {avg_precision_at_10}")

Avg Precision@1: 0.48
Avg Precision@5: 0.74
Avg Precision@10: 0.77


# Task 2

*  Improve the performance of Task1 by stemming the tokens (using spacy) before calculating the vocabulary.
*  Improve the performance of Task1 by lemmatizing the tokens (using spacy) before calculating the vocabulary.
*  Report the size of the vocabulary you obtained as part of Task 1, the vocabulary size after stemming and the vocabulary size after lemmatization.
*  Report the performance metrics in both these cases and discuss the results (why or why not performance has increased).


In [16]:
# By stemming the tokens
print("The vocabulary size before stemming: ",vectorizer.get_feature_names_out().shape)

stemmer = PorterStemmer()
token = spacy.load("en_core_web_sm")
tokenized_docs = []
all_tokens = []

for i, row in document.iterrows():
    doc_text = row['doc_text']
    doc_tokens = [stemmer.stem(tok.text.lower()) for tok in token(doc_text) if not tok.is_stop and tok.is_alpha]
    tokenized_docs.append(doc_tokens)
    all_tokens.extend(doc_tokens)

#countinf the frequency of tokens
word_freq = Counter(all_tokens)
# Filter out tokens
min_freq = 5
doc_freq = 0.85

filtered_tokens=[]
for token, count in word_freq.items():
    if min_freq <= count <= len(document) * doc_freq:
        filtered_tokens.append(token)

# Tokenize documents and queries using the filtered vocabulary
tokenized_docs = []
token = spacy.load("en_core_web_sm")
for doc_text in document['doc_text']:
    doc_tokens = [stemmer.stem(tok.text.lower()) for tok in token(doc_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []

for query_text in quaries['query_text']:
    query_tokens = [stemmer.stem(tok.text.lower()) for tok in token(query_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_queries.append(query_tokens)

    
#print(tokenized_docs[0])
# print(tokenized_queries)
vectorizer1 = TfidfVectorizer()
matrix = vectorizer1.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])
doc_matrix = matrix[:len(document)]
queries_matrix = matrix[len(document):]
print("The vocabulary size After Stemming : ",vectorizer1.get_feature_names_out().shape)

The vocabulary size before stemming:  (2129,)
The vocabulary size After Stemming :  (2026,)


In [17]:
# Performance After Stemming and lemmitizations
# 2.4 find the cosine similarity of each query
cosine_similarities = cosine_similarity(queries_matrix, doc_matrix)
# for each query find the top 5 and top 10 queries from document
i=0
top5=[]
top10=[]
for similarities in cosine_similarities:
    top5.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:5]])
    top10.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:10]])
    #print(f"\nTop 5 documents for Query {quaries.loc[i]['query_id']}: {top5[-1]}")
    #print(f"Top 10 documents for Query {quaries.loc[i]['query_id']}: {top10[-1]}")
    i+=1  
# Calculate Precision@1, Precision@5, and Precision@10 for each query
precision_at_1 = [precision_at_k(quaries.loc[query_id]['query_id'], z[:1]) for query_id, z in enumerate(top5)]
precision_at_5 = [precision_at_k(quaries.loc[query_id]['query_id'], x) for query_id, x in enumerate(top5)]
precision_at_10 = [precision_at_k(quaries.loc[query_id]['query_id'], y) for query_id, y in enumerate(top10)]
# Average Precision@k over all queries
avg_precision_at_1 = sum(precision_at_1) / len(precision_at_1)
avg_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
avg_precision_at_10 = sum(precision_at_10) / len(precision_at_10)

# Report Precision@k scores
print(f"Avg Precision@1: {avg_precision_at_1}")
print(f"Avg Precision@5: {avg_precision_at_5}")
print(f"Avg Precision@10: {avg_precision_at_10}")

Avg Precision@1: 0.57
Avg Precision@5: 0.81
Avg Precision@10: 0.83


### using lemmatization

In [26]:
# By stemming the tokens
print("The vocabulary size before Lemmatization: ",vectorizer.get_feature_names_out().shape)

stemmer = PorterStemmer()
token = spacy.load("en_core_web_sm")
tokenized_docs = []
all_tokens = []

for i, row in document.iterrows():
    doc_text = row['doc_text']
    doc_tokens = [tok.lemma_ for tok in token(doc_text) if not tok.is_stop and tok.is_alpha]
    tokenized_docs.append(doc_tokens)
    all_tokens.extend(doc_tokens)

#countinf the frequency of tokens
word_freq = Counter(all_tokens)
# Filter out tokens
min_freq = 5
doc_freq = 0.85

filtered_tokens=[]
for token, count in word_freq.items():
    if min_freq <= count <= len(document) * doc_freq:
        filtered_tokens.append(token)

# Tokenize documents and queries using the filtered vocabulary
tokenized_docs = []
token = spacy.load("en_core_web_sm")
for doc_text in document['doc_text']:
    doc_tokens = [tok.lemma_ for tok in token(doc_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []

for query_text in quaries['query_text']:
    query_tokens = [tok.lemma_ for tok in token(query_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_queries.append(query_tokens)

vectorizer1 = TfidfVectorizer()
matrix = vectorizer1.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])
doc_matrix = matrix[:len(document)]
queries_matrix = matrix[len(document):]

# get the vocabulary size
print("The vocabulary size After Lemmatization : ",vectorizer1.get_feature_names_out().shape)

The vocabulary size before Lemmatization:  (2129,)
The vocabulary size After Lemmatization :  (1380,)


In [19]:
# Performance After Stemming and lemmitizations
# 2.4 find the cosine similarity of each query
cosine_similarities = cosine_similarity(queries_matrix, doc_matrix)
# for each query find the top 5 and top 10 queries from document
i=0
top5=[]
top10=[]
for similarities in cosine_similarities:
    top5.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:5]])
    top10.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:10]])
    #print(f"\nTop 5 documents for Query {quaries.loc[i]['query_id']}: {top5[-1]}")
    #print(f"Top 10 documents for Query {quaries.loc[i]['query_id']}: {top10[-1]}")
    i+=1  
# Calculate Precision@1, Precision@5, and Precision@10 for each query
precision_at_1 = [precision_at_k(quaries.loc[query_id]['query_id'], z[:1]) for query_id, z in enumerate(top5)]
precision_at_5 = [precision_at_k(quaries.loc[query_id]['query_id'], x) for query_id, x in enumerate(top5)]
precision_at_10 = [precision_at_k(quaries.loc[query_id]['query_id'], y) for query_id, y in enumerate(top10)]
# Average Precision@k over all queries
avg_precision_at_1 = sum(precision_at_1) / len(precision_at_1)
avg_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
avg_precision_at_10 = sum(precision_at_10) / len(precision_at_10)

# Report Precision@k scores
print(f"Avg Precision@1: {avg_precision_at_1}")
print(f"Avg Precision@5: {avg_precision_at_5}")
print(f"Avg Precision@10: {avg_precision_at_10}")

Avg Precision@1: 0.34
Avg Precision@5: 0.56
Avg Precision@10: 0.64


# Task 3

*   Improve the model from Task 2.2 further with Named Entity Recognition (NER) and Parts-Of-Speech (POS) tagging using spaCy.
*   For each query and document vector, give more weightage to some important words. In essence, for each of the tf-idf vectors, multiply 2 along the dimensions which contain nouns, and multiply 4 for the named entities.
*   Report the performance metrics 

In [25]:
print("Task3: ")
# extract_E_P is a function to extract named entities and POS tags
token = spacy.load("en_core_web_sm")
special_case = [{"ORTH": "id", "NORM": "id"}]
special_case_wed = [{"ORTH": "wed", "NORM": "wed"}]
token.tokenizer.add_special_case("id", special_case)
token.tokenizer.add_special_case("wed", special_case_wed)
def entity_tags(text):
    doc = token(text)
    entity = [ent.text for ent in doc.ents]
    tags = [tok.pos_ for tok in doc]
    return entity, tags

# find tags and enties for both documents and quaries
document_entities, document_tags = zip(*document['doc_text'].apply(entity_tags))
quaries_entities, quaries_tags = zip(*quaries['query_text'].apply(entity_tags))

# Append Named Entity Recognition (NER) and Parts-Of-Speech features to TF-IDF matrix
k=vectorizer1.get_feature_names_out()
print("\n The vocabulary size : ",k.shape)
x,y=entity_tags(' '.join(k))

# modify the tf_idf vector as per given in assignment
def modify(tfidf_vector, pos_tags, entities):
    noun_indices = [i for i, pos_tag in enumerate(pos_tags) if 'NOUN' in pos_tag]
    tfidf_vector[:, noun_indices] *= 2 
    entity_indices = [i for i,j in enumerate(k) if j in entities]
    tfidf_vector[:, entity_indices] *= 4

    return tfidf_vector
#do the modifications in doc and queries matrix
doc_matrix_modify = modify(doc_matrix, y,x)
queries_matrix_modify = modify(queries_matrix, y, x)

# Calculating cosine similarity 
cosine_similarities = cosine_similarity(queries_matrix_modify, doc_matrix_modify)
#print(cosine_similarities)
# for each query find the top 5 and top 10 queries from document
i=0
top5=[]
top10=[]
for similarities in cosine_similarities:
    top5.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:5]])
    top10.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:10]])
    #print(f"\nTop 5 documents for Query {quaries.loc[i]['query_id']}: {top5[-1]}")
    #print(f"Top 10 documents for Query {quaries.loc[i]['query_id']}: {top10[-1]}")
    i+=1  
# Calculate Precision@1, Precision@5, and Precision@10 for each query
precision_at_1 = [precision_at_k(quaries.loc[query_id]['query_id'], a[:1]) for query_id, a in enumerate(top5)]
precision_at_5 = [precision_at_k(quaries.loc[query_id]['query_id'], b) for query_id, b in enumerate(top5)]
precision_at_10 = [precision_at_k(quaries.loc[query_id]['query_id'], c) for query_id, c in enumerate(top10)]
# Average Precision@k over all queries
avg_precision_at_1 = sum(precision_at_1) / len(precision_at_1)
avg_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
avg_precision_at_10 = sum(precision_at_10) / len(precision_at_10)

# Report Precision@k scores
print(f"Avg Precision@1: {avg_precision_at_1}")
print(f"Avg Precision@5: {avg_precision_at_5}")
print(f"Avg Precision@10: {avg_precision_at_10}")

Task3: 

 The vocabulary size :  (1380,)
Avg Precision@1: 0.35
Avg Precision@5: 0.55
Avg Precision@10: 0.59


## Task 4:

- Performing the both lemmatization and stemming both to improve the performace 

In [27]:
# By stemming the tokens
print("The vocabulary size before stemming along with Lemmatization: ",vectorizer.get_feature_names_out().shape)

stemmer = PorterStemmer()
token = spacy.load("en_core_web_sm")
tokenized_docs = []
all_tokens = []

for i, row in document.iterrows():
    doc_text = row['doc_text']
    doc_tokens = [stemmer.stem(tok.lemma_) for tok in token(doc_text) if not tok.is_stop and tok.is_alpha]
    tokenized_docs.append(doc_tokens)
    all_tokens.extend(doc_tokens)

#countinf the frequency of tokens
word_freq = Counter(all_tokens)
# Filter out tokens
min_freq = 5
doc_freq = 0.85

filtered_tokens=[]
for token, count in word_freq.items():
    if min_freq <= count <= len(document) * doc_freq:
        filtered_tokens.append(token)

# Tokenize documents and queries using the filtered vocabulary
tokenized_docs = []
token = spacy.load("en_core_web_sm")
for doc_text in document['doc_text']:
    doc_tokens = [stemmer.stem(tok.lemma_) for tok in token(doc_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []

for query_text in quaries['query_text']:
    query_tokens = [stemmer.stem(tok.lemma_) for tok in token(query_text) if stemmer.stem(tok.lemma_) in filtered_tokens]
    tokenized_queries.append(query_tokens)

vectorizer1 = TfidfVectorizer()
matrix = vectorizer1.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])
doc_matrix = matrix[:len(document)]
queries_matrix = matrix[len(document):]
# get the vocabulary size
print("The vocabulary size After Stemming along with Lemmatization : ",vectorizer1.get_feature_names_out().shape)


 Improve the performance of Task1 by stemming the tokens 

The vocabulary size before stemming:  (2129,)
The vocabulary size After Stemming :  (1912,)


In [28]:
# Performance After Stemming and lemmitizations
# 2.4 find the cosine similarity of each query
cosine_similarities = cosine_similarity(queries_matrix, doc_matrix)
# for each query find the top 5 and top 10 queries from document
i=0
top5=[]
top10=[]
for similarities in cosine_similarities:
    top5.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:5]])
    top10.append([document.loc[i]['doc_id'] for i in np.argsort(similarities)[::-1][:10]])
    #print(f"\nTop 5 documents for Query {quaries.loc[i]['query_id']}: {top5[-1]}")
    #print(f"Top 10 documents for Query {quaries.loc[i]['query_id']}: {top10[-1]}")
    i+=1  
# Calculate Precision@1, Precision@5, and Precision@10 for each query
precision_at_1 = [precision_at_k(quaries.loc[query_id]['query_id'], z[:1]) for query_id, z in enumerate(top5)]
precision_at_5 = [precision_at_k(quaries.loc[query_id]['query_id'], x) for query_id, x in enumerate(top5)]
precision_at_10 = [precision_at_k(quaries.loc[query_id]['query_id'], y) for query_id, y in enumerate(top10)]
# Average Precision@k over all queries
avg_precision_at_1 = sum(precision_at_1) / len(precision_at_1)
avg_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
avg_precision_at_10 = sum(precision_at_10) / len(precision_at_10)

# Report Precision@k scores
print(f"Avg Precision@1: {avg_precision_at_1}")
print(f"Avg Precision@5: {avg_precision_at_5}")
print(f"Avg Precision@10: {avg_precision_at_10}")

Avg Precision@1: 0.57
Avg Precision@5: 0.82
Avg Precision@10: 0.83
