In [1]:
sentence_one = "I am a cow"
sentence_two = "Cow is what I am"
sentence_three = "Today is Tuesday"

### Get unique words from all three sentences

In [2]:
tokens = []
for word in sentence_one.lower().split():
    if word not in tokens:
        tokens.append(word)
        
for word in sentence_two.lower().split():
    if word not in tokens:
        tokens.append(word)

for word in sentence_three.lower().split():
    if word not in tokens:
        tokens.append(word)

In [3]:
tokens

['i', 'am', 'a', 'cow', 'is', 'what', 'today', 'tuesday']

### Generate vector representations for all three sentences

In [4]:
vector_one = []
for token in tokens:
    if token in sentence_one.lower().split():
            vector_one.append(1)
    else:
            vector_one.append(0)

In [5]:
vector_one

[1, 1, 1, 1, 0, 0, 0, 0]

In [6]:
vector_two = []
for token in tokens:
    if token in sentence_two.lower().split():
            vector_two.append(1)
    else:
            vector_two.append(0)

In [7]:
vector_two

[1, 1, 0, 1, 1, 1, 0, 0]

In [8]:
vector_three = []
for token in tokens:
    if token in sentence_three.lower().split():
            vector_three.append(1)
    else:
            vector_three.append(0)

In [9]:
vector_three

[0, 0, 0, 0, 1, 0, 1, 1]

### Generate tabular data for analysis

In [10]:
import pandas as pd

In [11]:
dataFrame = pd.DataFrame(columns=['words','sentence_one', 'sentence_two','sentence_three'])

In [12]:
dataFrame['words'] = tokens
dataFrame['sentence_one'] = vector_one
dataFrame['sentence_two'] = vector_two
dataFrame['sentence_three'] = vector_three

In [13]:
print(sentence_one + '\n' + sentence_two + '\n' + sentence_three)
dataFrame

I am a cow
Cow is what I am
Today is Tuesday


Unnamed: 0,words,sentence_one,sentence_two,sentence_three
0,i,1,1,0
1,am,1,1,0
2,a,1,0,0
3,cow,1,1,0
4,is,0,1,1
5,what,0,1,0
6,today,0,0,1
7,tuesday,0,0,1


### Observations and Analysis

## Which of the sentences contain cow but not tuesday?

In [14]:
cow_vector = [1,1,0]
print(f'Vector for cow : {cow_vector}')

Vector for cow : [1, 1, 0]


In [15]:
tuesday_vector = [0,0,1]
print(f'Vector for tuesday : {tuesday_vector}')

Vector for tuesday : [0, 0, 1]


In [16]:
not_tuesday_vector = [1,1,0]

In [17]:
result_vector = []
for i in range(len(cow_vector)):
    # For 1 & 1
    if (cow_vector[i] == 1 and not_tuesday_vector[i] == 1):
        result_vector.append(1)
    # For 1 & 0 or 0 & 1
    elif (cow_vector[i] == 1 and not_tuesday_vector[i] == 0) or (cow_vector[i] == 0 and not_tuesday_vector[1] == 1):
        result_vector.append(0)
    else:
        result_vector.append(0)
print(f'Final vector post BITWISE AND operator : {result_vector}')

Final vector post BITWISE AND operator : [1, 1, 0]


##### Hence, we conclude that, sentence one and sentence two contain the term 'cow' but not 'tuesday'

## Are there any other observations which can be made?

In [18]:
import numpy as np

In [19]:
print(f'Dot Product of sentence one and two is -> {np.dot(vector_one, vector_two)}')

Dot Product of sentence one and two is -> 3


In [20]:
print(f'Dot Product of sentence one and three is -> {np.dot(vector_one, vector_three)}')

Dot Product of sentence one and three is -> 0


In [21]:
print(f'Dot Product of sentence two and three is -> {np.dot(vector_two, vector_three)}')

Dot Product of sentence two and three is -> 1


##### Hence, we conclude that, sentence one and sentence two are quite similar to each other.

## Inverted Index 

In [22]:
sorted_tokens = sorted(tokens)

In [23]:
sorted_tokens

['a', 'am', 'cow', 'i', 'is', 'today', 'tuesday', 'what']

In [24]:
token_frequency = []
for token in sorted_tokens:
    token_frq = 0
    for word in sentence_one.lower().split():
        if word == token:
            token_frq += 1
    for word in sentence_two.lower().split():
        if word == token:
            token_frq += 1
    for word in sentence_three.lower().split():
        if word == token:
            token_frq += 1
    token_frequency.append(token_frq)
            

In [25]:
token_frequency

[1, 2, 2, 2, 2, 1, 1, 1]

In [26]:
posting_list = []
for token in sorted_tokens:
    p_list = ""
    for word in sentence_one.lower().split():
        if word == token and  "1 -->" not in p_list:
            p_list += "1 -->"
    for word in sentence_two.lower().split():
        if word == token and  "2 -->" not in p_list:
            p_list += "2 -->"
    for word in sentence_three.lower().split():
        if word == token and  "3 -->" not in p_list:
            p_list += "3 -->"
    posting_list.append(p_list)            

In [27]:
posting_list

['1 -->',
 '1 -->2 -->',
 '1 -->2 -->',
 '1 -->2 -->',
 '2 -->3 -->',
 '3 -->',
 '3 -->',
 '2 -->']

In [28]:
posting_list_df = pd.DataFrame(columns = ['words','term_frequency','posting_list'])

In [29]:
posting_list_df['words'] = sorted_tokens
posting_list_df['term_frequency'] = token_frequency
posting_list_df['posting_list'] = posting_list

In [30]:
print(sentence_one + '\n' + sentence_two + '\n' + sentence_three)
posting_list_df

I am a cow
Cow is what I am
Today is Tuesday


Unnamed: 0,words,term_frequency,posting_list
0,a,1,1 -->
1,am,2,1 -->2 -->
2,cow,2,1 -->2 -->
3,i,2,1 -->2 -->
4,is,2,2 -->3 -->
5,today,1,3 -->
6,tuesday,1,3 -->
7,what,1,2 -->


In [31]:
input_search = input("Enter the search string : ")

Enter the search string : what cow


In [32]:
input_search

'what cow'

In [33]:
filter_from_posting_list = []
for token in input_search.split():
    filter_from_posting_list.append(posting_list_df[posting_list_df.words == token]['posting_list'].item())

In [34]:
filter_from_posting_list

['2 -->', '1 -->2 -->']

In [35]:
searched_list = []
for x in filter_from_posting_list:
    for y in x.split(' -->'):
        if y is not '':
            searched_list.append(y)

  if y is not '':


In [36]:
print(f'Documents which are retrived based on the input strings are - {set(sorted(searched_list))}')

Documents which are retrived based on the input strings are - {'1', '2'}
