### count vectorization example

In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import InputExample, SentenceTransformer, models, losses, evaluation, util
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Two differently worded questions that ask for the same fact.
question1, question2 = (
    'What is the most populous state in the USA?',
    'Which state in the United States has the most people?',
)

In [None]:
# The two questions form the corpus that the vectorizer is fitted on.
sample = [question1, question2]
# Default CountVectorizer; it is fitted on `sample` in a later cell.
vectorizer = CountVectorizer()

In [None]:
def f(number):
    """Convert ``number`` to a built-in ``float`` and return it."""
    as_float = float(number)
    return as_float

In [None]:
# Fit the vocabulary on both questions and get one dense count row per question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row gives a nested list here, hence the [0]. Each count is
# converted with f() in a single pass, so vector1/vector2 are bound once and
# no exhausted map iterators are left behind in the notebook namespace.
vector1 = [f(count) for count in fit[0].tolist()[0]]
vector2 = [f(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [None]:
# Show the vocabulary followed by the two count vectors, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 2.0, 0.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0]


In [None]:
# Cosine similarity between the two raw count vectors; the result is printed
# as a 1x1 tensor.
similarity1 = util.cos_sim(vector1, vector2)
print(similarity1)

tensor([[0.6093]])


In [None]:
# Noun extraction: load the spaCy pipeline whose part-of-speech tags are read
# by get_noun() below.
import spacy
nlp = spacy.load("en_core_web_sm")

def get_noun(s: str) -> int:
    """Return 1 if the first token of ``s`` is tagged ``'NOUN'``, otherwise 0.

    Only the first token produced by the ``nlp`` pipeline is inspected, so
    ``s`` is expected to be a single word. Input that produces no tokens
    (for example an empty string) returns 0 instead of raising IndexError.
    """
    doc = nlp(s)
    # Guard against an empty Doc: doc[0] would raise IndexError.
    if len(doc) == 0:
        return 0
    return int(doc[0].pos_ == 'NOUN')

In [None]:
# Copy the count vectors first: plain assignment would make vector3/vector4
# aliases of vector1/vector2, so zeroing entries below would also overwrite
# the original vectors and change the result of re-running earlier cells.
vector3 = list(vector1)
vector4 = list(vector2)
for j, word in enumerate(words):
    # Zero every position whose vocabulary word is not tagged as a noun.
    if get_noun(word) == 0:
        vector3[j] = 0.0
        vector4[j] = 0.0

In [None]:
# Show the vocabulary followed by the two noun-filtered vectors, one per line.
print(words, vector3, vector4, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# Cosine similarity of the noun-only count vectors (vector3 and vector4).
similarity2 = util.cos_sim(vector3, vector4)
print(similarity2)

tensor([[0.5774]])


### influence of typo

In [None]:
# Typo experiment: question3 spells "state" as "stete"; question4 is unchanged.
question3 = 'What is the most populous stete in the USA?'
question4 = 'Which state in the United States has the most people?'
sample = [question3, question4]
# Fresh vectorizer so the vocabulary is rebuilt from this pair of questions.
vectorizer = CountVectorizer()

In [None]:
# Fit the vocabulary on the typo pair and get one dense count row per question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row gives a nested list here, hence the [0]. Each count is
# converted with f() in a single pass, so vector1/vector2 are bound once and
# no exhausted map iterators are left behind in the notebook namespace.
vector1 = [f(count) for count in fit[0].tolist()[0]]
vector2 = [f(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [None]:
# Show the vocabulary followed by the two count vectors, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'stete' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0]


In [None]:
# Cosine similarity for the typo experiment (count vectors of question3 and
# question4).
similarity3 = util.cos_sim(vector1, vector2)
print(similarity3)

tensor([[0.5222]])


### the performance on abbreviation

In [None]:
# Abbreviation experiment: question6 says "USA" where the earlier version of
# this question said "United States".
question5 = 'What is the most populous state in the USA?'
question6 = 'Which state in the USA has the most people?'
sample = [question5, question6]
# Fresh vectorizer so the vocabulary is rebuilt from this pair of questions.
vectorizer = CountVectorizer()

In [None]:
# Fit the vocabulary on the abbreviation pair and get one dense count row per
# question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row gives a nested list here, hence the [0]. Each count is
# converted with f() in a single pass, so vector1/vector2 are bound once and
# no exhausted map iterators are left behind in the notebook namespace.
vector1 = [f(count) for count in fit[0].tolist()[0]]
vector2 = [f(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [None]:
# Show the vocabulary followed by the two count vectors, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'the' 'usa' 'what'
 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 2.0, 1.0, 0.0, 1.0]


In [None]:
# Cosine similarity for the abbreviation experiment. Stored under its own name
# so the typo-experiment result held in similarity3 is not overwritten and
# both values stay available for comparison.
similarity4 = util.cos_sim(vector1, vector2)
print(similarity4)

tensor([[0.7273]])
