### count vectorization example

In [1]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 11.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 39.5 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 42.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 34.1 MB/s 
Building wheels for collected 

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import InputExample, SentenceTransformer, models, losses, evaluation, util
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Two paraphrases of the same question, used as the baseline pair.
question1 = "What is the most populous state in the USA?"
question2 = "Which state in the United States has the most people?"

In [4]:
# The two questions form the whole corpus that the vectorizer is fitted on.
sample = [question1, question2]
vectorizer = CountVectorizer()

In [5]:
def f(number) -> float:
    """Convert ``number`` to a built-in ``float`` and return it."""
    as_float = float(number)
    return as_float

In [6]:
# Fit the vocabulary on the sample and build one row of token counts per question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row yields a nested list, so [0] takes the single inner row;
# the counts are converted to plain Python floats.
vector1 = [float(count) for count in fit[0].tolist()[0]]
vector2 = [float(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [7]:
# Show the vocabulary followed by each question's count vector, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 2.0, 0.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0]


In [8]:
# Cosine similarity between the two raw count vectors (baseline score).
similarity1 = util.cos_sim(vector1, vector2)
print(similarity1)

tensor([[0.6093]])


In [9]:
# Noun extraction: load spaCy's small English pipeline, which get_noun uses
# to read part-of-speech tags.
# NOTE(review): assumes the en_core_web_sm model is already installed; no
# install step for it is visible in this notebook.
import spacy
nlp = spacy.load("en_core_web_sm")

def get_noun(s: str) -> int:
    """Return 1 if the first token of ``s`` is tagged as a noun, else 0.

    Only the first token produced by the ``nlp`` pipeline is inspected, and
    only the coarse tag ``'NOUN'`` counts as a noun.

    Args:
        s: Text to tag; the notebook passes single vocabulary words.

    Returns:
        1 when the first token's ``pos_`` equals ``'NOUN'``; 0 otherwise,
        including when ``s`` produces no tokens at all (e.g. an empty string).
    """
    doc = nlp(s)
    # An empty document has no first token; treat it as "not a noun"
    # instead of letting doc[0] raise IndexError.
    if len(doc) == 0:
        return 0
    return 1 if doc[0].pos_ == 'NOUN' else 0

In [10]:
# Work on copies: a plain assignment would only alias vector1/vector2, so
# zeroing entries below would silently overwrite the original count vectors too.
vector3 = list(vector1)
vector4 = list(vector2)
# Keep only the columns whose vocabulary word is tagged as a noun.
for j, word in enumerate(words):
    if get_noun(word) == 0:
        vector3[j] = 0.0
        vector4[j] = 0.0

In [11]:
# Show the vocabulary followed by the two noun-only vectors, one per line.
print(words, vector3, vector4, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [12]:
# Compute the cosine similarity on the noun-only vectors (vector3, vector4).
similarity2 = util.cos_sim(vector3, vector4)
print(similarity2)

tensor([[0.5774]])


### influence of typo

In [13]:
# Typo experiment: question3 misspells "state" as "stete"; question4 is unchanged.
vectorizer = CountVectorizer()
question3 = "What is the most populous stete in the USA?"
question4 = "Which state in the United States has the most people?"
sample = [question3, question4]

In [14]:
# Fit the vocabulary on the sample and build one row of token counts per question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row yields a nested list, so [0] takes the single inner row;
# the counts are converted to plain Python floats.
vector1 = [float(count) for count in fit[0].tolist()[0]]
vector2 = [float(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [15]:
# Show the vocabulary followed by each question's count vector, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'states' 'stete' 'the'
 'united' 'usa' 'what' 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0]


In [16]:
# Cosine similarity for the pair in which question3 contains the typo "stete".
similarity3 = util.cos_sim(vector1, vector2)
print(similarity3)

tensor([[0.5222]])


### the performance on abbreviation

In [17]:
# Abbreviation experiment: question6 says "USA" instead of "United States".
vectorizer = CountVectorizer()
question5 = "What is the most populous state in the USA?"
question6 = "Which state in the USA has the most people?"
sample = [question5, question6]

In [18]:
# Fit the vocabulary on the sample and build one row of token counts per question.
fit = vectorizer.fit_transform(sample).todense()
# tolist() on a row yields a nested list, so [0] takes the single inner row;
# the counts are converted to plain Python floats.
vector1 = [float(count) for count in fit[0].tolist()[0]]
vector2 = [float(count) for count in fit[1].tolist()[0]]
# Vocabulary terms, in the same column order as the count vectors.
words = vectorizer.get_feature_names_out()

In [19]:
# Show the vocabulary followed by each question's count vector, one per line.
print(words, vector1, vector2, sep='\n')

['has' 'in' 'is' 'most' 'people' 'populous' 'state' 'the' 'usa' 'what'
 'which']
[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 2.0, 1.0, 0.0, 1.0]


In [20]:
# Cosine similarity for the pair in which both questions use the abbreviation "USA".
# Stored under its own name so the typo experiment's similarity3 is not overwritten.
similarity4 = util.cos_sim(vector1, vector2)
print(similarity4)

tensor([[0.7273]])
