## Assignment 2

Creating an inverted index for P1 Q1

In [7]:
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction

# Toy corpus for P1 Q1: document id -> raw text.
docs = {"d1": "Herbivores are typically plant eaters and not meat eaters",
"d2": "Carnivores are typically meat eaters and not plant eaters",
"d3": "Deers eat grass and leaves"}

# Stopwords for this question; they are compared against the tokens that
# come out of the inflection reducer, i.e. after the whole pipeline has run.
stopwords = set(["are", "and", "not"])

# NOTE(review): these three helpers are project classes not shown in this
# notebook. The code below relies on reduce() returning an iterable of
# segments, each of which is an iterable of word strings -- confirm.
segmenter = SentenceSegmentation()
tokenizer = Tokenization()
inflectionReducer = InflectionReduction()

print("IR preprocessing method")
print(" ")
for key in docs:
    docs[key] = segmenter.punkt(docs[key])
    docs[key] = tokenizer.pennTreeBank(docs[key])
    docs[key] = inflectionReducer.reduce(docs[key])
    # Drop stopwords segment by segment. The previous version appended the
    # flattened *whole document* once per segment, which duplicated every
    # word as soon as a document had more than one segment.
    docs[key] = [[word for word in segment if word not in stopwords]
                 for segment in docs[key]]

# Deriving unique words
words = list(set([word for doc in docs.values() for segment in doc for word in segment]))

# term -> list of document ids containing it. Each document is listed at
# most once, even when the term occurs in several of its segments.
inverted_index_ir = {}

for word in words:
    inverted_index_ir[word] = [doc for doc in docs
                               if any(word in segment for segment in docs[doc])]

for key, value in inverted_index_ir.items():
    print(f"{key}: {value}")
    
print(" ")
print("Question specific method")
print(" ")


# Restart from the raw sentences: this variant indexes plain whitespace
# tokens (no stemming, no case folding) and only removes the stopwords.
docs = {"d1": "Herbivores are typically plant eaters and not meat eaters",
"d2": "Carnivores are typically meat eaters and not plant eaters",
"d3": "Deers eat grass and leaves"}

# Whitespace tokens of every document, keyed by document id.
tokens_by_doc = {doc_id: raw_text.split() for doc_id, raw_text in docs.items()}

# Vocabulary over the whole corpus, minus the stopwords.
vocabulary = set(" ".join(docs.values()).split())
words = list(vocabulary - stopwords)

# term -> ids of the documents whose token list contains the term
inverted_index_qs = {
    word: [doc for doc in docs if word in tokens_by_doc[doc]]
    for word in words
}

for term, postings in inverted_index_qs.items():
    print(f"{term}: {postings}")

IR preprocessing method
 
eat: ['d3']
herbivor: ['d1']
meat: ['d1', 'd2']
leav: ['d3']
eater: ['d1', 'd2']
plant: ['d1', 'd2']
grass: ['d3']
carnivor: ['d2']
typic: ['d1', 'd2']
deer: ['d3']
 
Question specific method
 
eat: ['d3']
typically: ['d1', 'd2']
meat: ['d1', 'd2']
eaters: ['d1', 'd2']
plant: ['d1', 'd2']
grass: ['d3']
Herbivores: ['d1']
Deers: ['d3']
Carnivores: ['d2']
leaves: ['d3']


P1 Q2: TF-IDF term-document matrix

In [8]:
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
import numpy as np 
import pandas as pd

# Toy corpus for P1 Q2: document id -> raw text.
docs = {"d1": "Herbivores are typically plant eaters and not meat eaters",
"d2": "Carnivores are typically meat eaters and not plant eaters",
"d3": "Deers eat grass and leaves"}

# Stopwords for this question; they are compared against the tokens that
# come out of the inflection reducer, i.e. after the whole pipeline has run.
stopwords = set(["are", "and", "not"])

# NOTE(review): these three helpers are project classes not shown in this
# notebook. The code below relies on reduce() returning an iterable of
# segments, each of which is an iterable of word strings -- confirm.
segmenter = SentenceSegmentation()
tokenizer = Tokenization()
inflectionReducer = InflectionReduction()

print("IR preprocessing method")
print(" ")
for key in docs:
    docs[key] = segmenter.punkt(docs[key])
    docs[key] = tokenizer.pennTreeBank(docs[key])
    docs[key] = inflectionReducer.reduce(docs[key])
    # Drop stopwords segment by segment. The previous version appended the
    # flattened *whole document* once per segment, which duplicated every
    # word as soon as a document had more than one segment.
    docs[key] = [[word for word in segment if word not in stopwords]
                 for segment in docs[key]]

# Deriving unique words
words = list(set([word for doc in docs.values() for segment in doc for word in segment]))

# term -> list of document ids containing it. Each document is listed at
# most once, even when the term occurs in several of its segments.
inverted_index_ir = {}

for word in words:
    inverted_index_ir[word] = [doc for doc in docs
                               if any(word in segment for segment in docs[doc])]

for key, value in inverted_index_ir.items():
    print(f"{key}: {value}")
    
#Let the rows of the term-document matrix be the unique words and the columns be the documents. 
# Flatten each document into one token list a single time, in the same order
# as the columns (dict order of `docs`), instead of once per (term, doc) pair.
doc_word_lists = [[word for segment in doc for word in segment]
                  for doc in docs.values()]

tf_idf_matrix = [[0]*len(docs) for _ in range(len(words))]

for i, term in enumerate(words):
    # IDF depends on the term only, so it is computed once per row.
    # set() guards against a posting list that names a document twice.
    idf = np.log(len(docs) / len(set(inverted_index_ir[term])))
    for j, doc_words in enumerate(doc_word_lists):
        # Term frequency normalised by document length; empty documents get 0.
        tf = doc_words.count(term) / len(doc_words) if len(doc_words) > 0 else 0
        tf_idf_matrix[i][j] = tf * idf

print(" ")
print("TF-IDF Matrix:")
print(" ")
data = pd.DataFrame(tf_idf_matrix, index=words, columns=docs.keys())
data

IR preprocessing method
 
eat: ['d3']
herbivor: ['d1']
meat: ['d1', 'd2']
leav: ['d3']
eater: ['d1', 'd2']
plant: ['d1', 'd2']
grass: ['d3']
carnivor: ['d2']
typic: ['d1', 'd2']
deer: ['d3']
 
TF-IDF Matrix:
 


Unnamed: 0,d1,d2,d3
eat,0.0,0.0,0.274653
herbivor,0.183102,0.0,0.0
meat,0.067578,0.067578,0.0
leav,0.0,0.0,0.274653
eater,0.135155,0.135155,0.0
plant,0.067578,0.067578,0.0
grass,0.0,0.0,0.274653
carnivor,0.0,0.183102,0.0
typic,0.067578,0.067578,0.0
deer,0.0,0.0,0.274653


P1 Q4: ranking the documents for a query by cosine similarity

In [26]:
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus for P1 Q4: document id -> raw text.
docs = {"d1": "Herbivores are typically plant eaters and not meat eaters",
"d2": "Carnivores are typically meat eaters and not plant eaters",
"d3": "Deers eat grass and leaves"}

# Stopwords for this question; they are compared against the tokens that
# come out of the inflection reducer, i.e. after the whole pipeline has run.
stopwords = set(["are", "and", "not"])

# NOTE(review): these three helpers are project classes not shown in this
# notebook. The code below relies on reduce() returning an iterable of
# segments, each of which is an iterable of word strings -- confirm.
segmenter = SentenceSegmentation()
tokenizer = Tokenization()
inflectionReducer = InflectionReduction()

print("IR preprocessing method")
print(" ")
for key in docs:
    docs[key] = segmenter.punkt(docs[key])
    docs[key] = tokenizer.pennTreeBank(docs[key])
    docs[key] = inflectionReducer.reduce(docs[key])
    # Drop stopwords segment by segment. The previous version appended the
    # flattened *whole document* once per segment, which duplicated every
    # word as soon as a document had more than one segment.
    docs[key] = [[word for word in segment if word not in stopwords]
                 for segment in docs[key]]

# Deriving unique words
words = list(set([word for doc in docs.values() for segment in doc for word in segment]))

# term -> list of document ids containing it. Each document is listed at
# most once, even when the term occurs in several of its segments.
inverted_index_ir = {}

for word in words:
    inverted_index_ir[word] = [doc for doc in docs
                               if any(word in segment for segment in docs[doc])]

for key, value in inverted_index_ir.items():
    print(f"{key}: {value}")
    
#Let the rows of the term-document matrix be the unique words and the columns be the documents. 
# Flatten each document into one token list a single time, in the same order
# as the columns (dict order of `docs`).
doc_word_lists = [[word for segment in doc for word in segment]
                  for doc in docs.values()]

tf_idf_matrix = [[0]*len(docs) for _ in range(len(words))]
# idf_list[i] is the IDF of words[i]: exactly one entry per term. It used to
# be appended inside the document loop, which produced len(words)*len(docs)
# entries that no longer lined up with `words`.
idf_list = []
for i, term in enumerate(words):
    # set() guards against a posting list that names a document twice.
    idf = np.log(len(docs) / len(set(inverted_index_ir[term])))
    idf_list.append(idf)
    for j, doc_words in enumerate(doc_word_lists):
        # Term frequency normalised by document length; empty documents get 0.
        tf = doc_words.count(term) / len(doc_words) if len(doc_words) > 0 else 0
        tf_idf_matrix[i][j] = tf * idf

print(" ")
print("TF-IDF Matrix:")
print(" ")
data = pd.DataFrame(tf_idf_matrix, index=words, columns=docs.keys())
print(np.array(tf_idf_matrix))
print(" ")
print(data)
print(" ")
# Raw query text; it is pushed through the document pipeline further down.
query = "plant eaters"
print(f"Query: {query}")
print(" ")

def cosine_similarity_matrix(doc_matrix, query_vector):
    """Cosine similarity between the query and every column of ``doc_matrix``.

    doc_matrix   -- 2-D numpy array, one column per document (norms are taken
                    along axis 0 and the transpose is multiplied by the query).
    query_vector -- 1-D sequence with one entry per row of ``doc_matrix``.

    Returns a 1-D array holding one similarity per document column. Zero norms
    are replaced by 1e-10 before dividing, so an all-zero column or an
    all-zero query gives 0 rather than a division by zero.
    """
    tiny = 1e-10

    column_lengths = np.linalg.norm(doc_matrix, axis=0)
    column_lengths = np.where(column_lengths == 0, tiny, column_lengths)

    query_length = np.linalg.norm(query_vector)
    if query_length == 0:
        query_length = tiny

    dot_products = doc_matrix.T @ query_vector
    return dot_products / (column_lengths * query_length)

print("Cosine Similarity: ")
print(" ")
# Run the query through the same steps as the documents so that its tokens
# are comparable with the entries of `words`.
query = segmenter.punkt(query)
query = tokenizer.pennTreeBank(query)
query = inflectionReducer.reduce(query)
# Filter stopwords per segment (the earlier loop appended the flattened whole
# query once per segment, duplicating tokens for multi-sentence queries).
query = [[word for word in segment if word not in stopwords] for segment in query]

# All query tokens in one flat list, built once rather than once per term.
query_words = [word for sentence in query for word in sentence]

# TF-IDF weight of the query for every vocabulary term, in `words` order.
tf_idf_query = []
for term in words:
    tf = query_words.count(term) / len(query_words) if len(query_words) > 0 else 0
    # IDF comes straight from the inverted index so that every term is
    # weighted with its own document frequency.
    idf = np.log(len(docs) / len(set(inverted_index_ir[term])))
    tf_idf_query.append(tf * idf)
print(tf_idf_query)

cosine_similarities = cosine_similarity_matrix(np.array(tf_idf_matrix), tf_idf_query)
print(cosine_similarities)
# argsort is ascending; reverse it to list the most similar document first.
rankings = np.argsort(cosine_similarities)[::-1]
print("Ranked documents:")
print(" ")
doc_ids = list(docs.keys())
for i in rankings:
    print(f"Document {doc_ids[i]}: {cosine_similarities[i]}")

IR preprocessing method
 
eat: ['d3']
herbivor: ['d1']
meat: ['d1', 'd2']
leav: ['d3']
eater: ['d1', 'd2']
plant: ['d1', 'd2']
grass: ['d3']
carnivor: ['d2']
typic: ['d1', 'd2']
deer: ['d3']
 
TF-IDF Matrix:
 
[[0.         0.         0.27465307]
 [0.18310205 0.         0.        ]
 [0.06757752 0.06757752 0.        ]
 [0.         0.         0.27465307]
 [0.13515504 0.13515504 0.        ]
 [0.06757752 0.06757752 0.        ]
 [0.         0.         0.27465307]
 [0.         0.18310205 0.        ]
 [0.06757752 0.06757752 0.        ]
 [0.         0.         0.27465307]]
 
                d1        d2        d3
eat       0.000000  0.000000  0.274653
herbivor  0.183102  0.000000  0.000000
meat      0.067578  0.067578  0.000000
leav      0.000000  0.000000  0.274653
eater     0.135155  0.135155  0.000000
plant     0.067578  0.067578  0.000000
grass     0.000000  0.000000  0.274653
carnivor  0.000000  0.183102  0.000000
typic     0.067578  0.067578  0.000000
deer      0.000000  0.000000  0.27465

## Assignment 1

In [23]:
def segment_sentences(text, exceptions, punctuations=".?!;"):
    """Split ``text`` into sentences with a simple word-level heuristic.

    The text is split on single spaces. A word closes the current sentence
    when its last character is in ``punctuations`` and either
      * none of the ``exceptions`` occurs in it, or
      * it contains a closing bracket ``)``, ``}`` or ``]``.

    text         -- raw input string.
    exceptions   -- abbreviation fragments that suppress a sentence break.
                    They are matched as plain substrings, so a word such as
                    "now." also counts as an exception because it contains
                    "no".
    punctuations -- characters regarded as sentence terminators.

    Returns the sentences as a list of strings, each re-joined with single
    spaces. Words left over after the last terminator form a final sentence
    instead of being dropped.
    """
    closing_brackets = ")}]"
    sentences = []
    pending = []
    for word in text.split(" "):
        if word == "":
            # produced by consecutive spaces
            continue
        pending.append(word)
        if word[-1] not in punctuations:
            continue
        looks_abbreviated = any(exception in word for exception in exceptions)
        has_closing_bracket = any(bracket in word for bracket in closing_brackets)
        if has_closing_bracket or not looks_abbreviated:
            sentences.append(" ".join(pending))
            pending = []
    if pending:
        sentences.append(" ".join(pending))
    return sentences


text = "1. Mr. Johnson and Mrs. Smith arrived at 10 a.m. to discuss the project. He is working for X.Y.Z company. 2. Dr. Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.]. They also reviewed various strategies, including market analysis, customer feedback. Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion."
# A comma was missing between "Ltd" and "e.g", which silently merged them
# into the single entry "Ltde.g".
exceptions = ["Mr", "Mrs", "Ms", "Dr", "Prof", "Inc", "no", "rev", "Ltd", "e.g", "i.e", "a.m", "p.m"]
# All ten digits (range(9) stopped at 8), so "1." or "3.14" never end a sentence.
exceptions += [str(digit) for digit in range(10)]
punctuations = ".?!;"
segmentedText = segment_sentences(text, exceptions, punctuations)
for t in segmentedText:
    print(t)

1. Mr. Johnson and Mrs. Smith arrived at 10 a.m. to discuss the project.
He is working for X.Y.Z company.
2. Dr. Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.].
They also reviewed various strategies, including market analysis, customer feedback.
Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion.


In [39]:
import nltk 

# Baseline: NLTK's module-level sent_tokenize on a text full of abbreviations.
# In the recorded output below, "Mr.", "Mrs.", "a.m." and "e.g." do not end a
# sentence, while "Inc. ." and "Wait... Dr. Williams" stay inside one sentence.
sentence_tokenizer = nltk.tokenize.sent_tokenize
text = "Mr. Johnson and Mrs. Smith arrived at 10 a.m. to discuss the project. He is working for X.Y.Z Inc. . Wait... Dr. Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.]. They also reviewed various strategies, including market analysis, customer feedback. Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion."
result = sentence_tokenizer(text)
# One sentence per output line.
for res in result:
    print(res)

Mr. Johnson and Mrs. Smith arrived at 10 a.m. to discuss the project.
He is working for X.Y.Z Inc. .
Wait... Dr. Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.].
They also reviewed various strategies, including market analysis, customer feedback.
Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion.


In [59]:
import nltk 
from nltk.tokenize.punkt import PunktSentenceTokenizer
# Same text, but with a PunktSentenceTokenizer constructed here without any
# arguments. In the recorded output below it breaks after "Mr.", "Mrs.",
# "a.m.", "Inc." and "Dr.", unlike the sent_tokenize run in the cell above.
sentence_tokenizer = PunktSentenceTokenizer()
text = "Mr. Johnson and Mrs. Smith arrived at 10 a.m. to discuss the project. He is working for X.Y.Z Inc. . Wait... Dr. Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.]. They also reviewed various strategies, including market analysis, customer feedback. Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion."
result = sentence_tokenizer.tokenize(text)
# One sentence per output line.
for res in result:
    print(res)

Mr.
Johnson and Mrs.
Smith arrived at 10 a.m.
to discuss the project.
He is working for X.Y.Z Inc.
.
Wait... Dr.
Williams presented the latest findings, e.g., the new experimental results, which showed promising outcomes [Time now is 10 a.m.].
They also reviewed various strategies, including market analysis, customer feedback.
Despite the lengthy meeting, Mr.
and Mrs.
Brown remained engaged throughout the discussion.


In [27]:
from nltk.tokenize import word_tokenize
 
# Word-level tokenization of a single sentence. In the recorded output the
# abbreviations "Mr." and "Mrs." keep their dot, while the comma and the final
# period become separate tokens.
text = "Despite the lengthy meeting, Mr. and Mrs. Brown remained engaged throughout the discussion."
# Last expression of the cell: the token list is shown as the cell output.
word_tokenize(text)

['Despite',
 'the',
 'lengthy',
 'meeting',
 ',',
 'Mr.',
 'and',
 'Mrs.',
 'Brown',
 'remained',
 'engaged',
 'throughout',
 'the',
 'discussion',
 '.']

In [53]:
from nltk.tokenize import word_tokenize
 
# Multi-line input without sentence punctuation. In the recorded output "$"
# and "3.88" are separate tokens and the newlines leave no token behind.
# NOTE: `text` is a notebook-level name that a later comparison cell reads.
text = '''Good muffins cost $3.88\nin New York Please buy me\ntwo of them\nThanks'''
# Last expression of the cell: the token list is shown as the cell output.
word_tokenize(text)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 'Thanks']

In [78]:
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
# Round trip through the Treebank tokenizer and detokenizer on a harder text
# (stray opening quote, "p.m.", hyphenated words, a signed decimal).
# In the recorded output "Thanks.", "Lab-2." and "expensive." keep their
# trailing period, and the opening quote is emitted as the token '``'.
s = '''Good muffins cost $3.88\nin New York  Please buy me\ntwo of them\nThanks. "Meet DrX at 5 p.m. in Lab-2. State-of-the-art models are expensive. +3.14 is pi'''
d = TreebankWordDetokenizer()
t = TreebankWordTokenizer()
# NOTE: `toks` is a notebook-level name; the comparison cell below reads it.
toks = t.tokenize(s)
print(toks)
# Last expression of the cell: the detokenized string is the cell output.
d.detokenize(toks)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks.', '``', 'Meet', 'DrX', 'at', '5', 'p.m.', 'in', 'Lab-2.', 'State-of-the-art', 'models', 'are', 'expensive.', '+3.14', 'is', 'pi']


'Good muffins cost $3.88 in New York Please buy me two of them Thanks. "Meet DrX at 5 p.m. in Lab-2. State-of-the-art models are expensive. +3.14 is pi'

In [57]:
# Do word_tokenize and a bare TreebankWordTokenizer agree on the same input?
# Both sides are computed from `text` here. The earlier form compared against
# the notebook-level `toks`, which in file order is built from a different
# string (`s`), so its result depended on the order the cells were executed in.
word_tokenize(text) == TreebankWordTokenizer().tokenize(text)

True

In [79]:
def _starts_dotted_abbreviation(segment, word_start, dot_index, exceptions):
    """Return True if a dotted exception (e.g. "a.m") spans the dot at ``dot_index``.

    The exception has to start at ``word_start``, extend past the dot, and be
    followed by a non-alphanumeric character or the end of ``segment``.
    """
    for exception in exceptions:
        end = word_start + len(exception)
        if (
            end > dot_index
            and segment.startswith(exception, word_start)
            and not segment[end:end + 1].isalnum()
        ):
            return True
    return False


def tokenize_segment(segment, exceptions, punctuations, symbols="+-/*^&@#%"):
    """Split ``segment`` into word, number, punctuation and symbol tokens.

    Alphanumeric characters and apostrophes accumulate into the current word.
    A "." stays attached to the current word when
      * the word is one of ``exceptions`` ("Mr" -> "Mr.", "3" -> "3."),
      * the word ends in a digit and a digit follows ("13.5"), or
      * the dot lies inside a dotted exception such as "e.g" or "p.m".
    Any other non-word character ends the current word; it is emitted as its
    own token if it is in ``punctuations`` or ``symbols`` and dropped
    otherwise (whitespace, for instance).

    Returns the list of tokens in reading order.
    """
    tokens = []
    current_word = ""
    for index, char in enumerate(segment):
        if char.isalnum() or char == "'":
            current_word += char
            continue
        if char == "." and current_word:
            next_char = segment[index + 1:index + 2]
            word_start = index - len(current_word)
            if (
                current_word in exceptions
                or (current_word[-1].isdigit() and next_char.isdigit())
                or _starts_dotted_abbreviation(segment, word_start, index, exceptions)
            ):
                current_word += char
                continue
        if current_word:
            tokens.append(current_word)
            current_word = ""
        if char in punctuations or char in symbols:
            tokens.append(char)
    if current_word:
        tokens.append(current_word)
    return tokens


segment = '''It's John's car. Good muffins cost $3.88\nin New York /static/ Please buy me\ntwo of them\nThanks. Mr. and Mrs. Let's have some 3. +3.14 is Pi. State-of-the-art model'''
punctuations = ".,!?;:'\"()[]{}$"
exceptions = ["Mr", "Mrs", "Ms", "Dr", "Prof", "Inc", "no", "rev", "Ltd", "e.g", "i.e", "a.m", "p.m"]
exceptions += [str(i) for i in range(10)]

tokens = tokenize_segment(segment, exceptions, punctuations)

print(tokens)


["It's", "John's", 'car', '.', 'Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '/', 'static', '/', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks', '.', 'Mr.', 'and', 'Mrs.', "Let's", 'have', 'some', '3.', '+', '3.14', 'is', 'Pi', '.', 'State', '-', 'of', '-', 'the', '-', 'art', 'model']


In [2]:

import nltk
from nltk.corpus import stopwords

# Print the English stopword list from the NLTK corpus reader imported above.
# NOTE(review): that import binds the notebook-level name `stopwords`, which
# the Assignment 2 cells also use for a plain set; those cells reassign it
# before use, but running cells out of order can pick up the wrong object.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she