# Cosine Simlirity

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
sentence_1 = "I love flying with the airline"

sentence_2 = "This airline is my favorite"

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform([sentence_1, sentence_2])

cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

print(cos_sim)

[[0.11234278]]


In [3]:
sentence_1 = "I like to eat early in the morning"

sentence_2 = "Eating late is what i like"

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform([sentence_1, sentence_2])

cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

print(cos_sim)

[[0.09349477]]


# Web Scrapping and installation and NLTK Vs Spacy

In [4]:
import urllib3
from bs4 import BeautifulSoup

In [5]:
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

http = urllib3.PoolManager()

response = http.request('GET', url)



In [6]:
soup = BeautifulSoup(response.data, 'html.parser')

text = soup.get_text()

with open('nlp_article.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Article saved to nlp_article.txt")

Article saved to nlp_article.txt


In [7]:
import nltk, time

In [8]:
!pip install nltk



In [9]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [10]:
with open('nlp_article.txt', encoding='utf-8') as f:
    text = f.read()

start = time.time()

tokens_nltk = nltk.word_tokenize(text)

pos_tags_nltk = nltk.pos_tag(tokens_nltk)

end = time.time()

print("Time taken by NLTK: ", round(end - start), 'seconds')
print("Number of tokens: ", len(tokens_nltk))
print("Number of POS tags: ", len(pos_tags_nltk))

Time taken by NLTK:  0 seconds
Number of tokens:  9078
Number of POS tags:  9078


In [12]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 452.7 kB/s eta 0:00:28
     - ------------------------------------- 0.5/12.8 MB 452.7 kB/s eta 0:00:28
     - ------------------------------------- 0.5/12.8 MB 452.7 kB/s eta 0:00:28
     - ------------------------------------- 0.5/12.8 MB 452.7 kB/s eta 0:00:28
     - ------------------------------------- 0.5/12.8 MB 452.7 kB/s eta 

In [15]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

with open('nlp_article.txt', encoding='utf-8') as f:
    text = f.read()

start = time.time()

doc = nlp(text)

tokens_spacy = [token.text for token in doc]

pos_tags_spacy = [(token.text, token.pos_) for token in doc]

end = time.time()

In [16]:
print("Time taken by Spacy: ", round(end - start), 'seconds')
print("Number of tokens: ", len(tokens_spacy))
print("Number of POS tags: ", len(pos_tags_spacy))

Time taken by Spacy:  1 seconds
Number of tokens:  10030
Number of POS tags:  10030


# Tokenization

In [17]:
s = 'Hi Everyone ! hola gr8'

print(s.split())

['Hi', 'Everyone', '!', 'hola', 'gr8']


In [18]:
from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

print('Word tokenize : ', word_tokenize(s))
print("-------------------------------------")
print('Regexp tokenize : ', regexp_tokenize(s, pattern = '\w+'))
print("-------------------------------------")
print('Regexp tokenize : ', regexp_tokenize(s, pattern = '\d+'))
print("-------------------------------------")
print('Word punct tokenize : ', wordpunct_tokenize(s))
print("-------------------------------------")
print('Blank line tokenize : ', blankline_tokenize(s))


Word tokenize :  ['Hi', 'Everyone', '!', 'hola', 'gr8']
-------------------------------------
Regexp tokenize :  ['Hi', 'Everyone', 'hola', 'gr8']
-------------------------------------
Regexp tokenize :  ['8']
-------------------------------------
Word punct tokenize :  ['Hi', 'Everyone', '!', 'hola', 'gr8']
-------------------------------------
Blank line tokenize :  ['Hi Everyone ! hola gr8']


  print('Regexp tokenize : ', regexp_tokenize(s, pattern = '\w+'))
  print('Regexp tokenize : ', regexp_tokenize(s, pattern = '\d+'))


In [19]:
print(regexp_tokenize(s, pattern = '\w+'))
print(regexp_tokenize(s, pattern = '\d+'))

['Hi', 'Everyone', 'hola', 'gr8']
['8']


  print(regexp_tokenize(s, pattern = '\w+'))
  print(regexp_tokenize(s, pattern = '\d+'))


In [20]:
print(wordpunct_tokenize(s))

['Hi', 'Everyone', '!', 'hola', 'gr8']


In [21]:
s = '123456789'

print(regexp_tokenize(s, pattern = '\d+'))

['123456789']


  print(regexp_tokenize(s, pattern = '\d+'))


In [22]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp(u'I am flying to San Francisco.')

print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'San', 'Francisco', '.']


In [23]:
print("spaCy Tokenization : ", [token.text for token in doc])

spaCy Tokenization :  ['I', 'am', 'flying', 'to', 'San', 'Francisco', '.']


In [24]:
for token in doc:

    print(f"{token.text:<12} | POS : {token.pos_:<10} | Lemma : {token.lemma_}")

I            | POS : PRON       | Lemma : I
am           | POS : AUX        | Lemma : be
flying       | POS : VERB       | Lemma : fly
to           | POS : ADP        | Lemma : to
San          | POS : PROPN      | Lemma : San
Francisco    | POS : PROPN      | Lemma : Francisco
.            | POS : PUNCT      | Lemma : .
