In [42]:
# Two-sentence sample text used by the tokenising, stemming, tagging and embedding cells below.
# NOTE(review): "liguistics" looks like a typo for "linguistics"; it is reproduced
# unchanged here because the recorded outputs below were generated from this exact text.
example = (
    "Natural language processing (NLP) is a subfield of computer science and liguistics.\n"
    "It's concerned with giving computers the ability to support and manipulate human language."
)

In [43]:
import warnings

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the 'punkt_tab' resource for the NLTK tokenizers imported above.
nltk.download('punkt_tab')

# From this point on, suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [44]:
# Split the sample text into sentences (two for this text, as the output below shows).
sentences = sent_tokenize(example)

In [45]:
# Display the resulting list of sentence strings.
sentences

['Natural language processing (NLP) is a subfield of computer science and liguistics.',
 "It's concerned with giving computers the ability to support and manipulate human language."]

In [46]:
# Number of sentences found by sent_tokenize.
len(sentences)

2

In [47]:
# Split the sample text into word-level tokens; brackets, full stops and the
# clitic "'s" come out as separate tokens (see the listing below).
words=word_tokenize(example)

In [48]:
# Display the token list.
words

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'computer',
 'science',
 'and',
 'liguistics',
 '.',
 'It',
 "'s",
 'concerned',
 'with',
 'giving',
 'computers',
 'the',
 'ability',
 'to',
 'support',
 'and',
 'manipulate',
 'human',
 'language',
 '.']

In [49]:
# Token count, punctuation tokens included.
len(words)

30

In [50]:
from nltk.corpus import stopwords

In [51]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words(...) reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
# Keep the English stop words in a set so the membership test in the filtering
# cell is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [53]:
# Display the full stop-word set.
# NOTE(review): this prints every entry; showing a short sample such as
# sorted(stop_words)[:20] would keep the notebook output compact.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [54]:
# Drop stop words. The comparison uses the case-folded token, so capitalised
# tokens such as 'It' are removed too; punctuation and "'s" are not in the
# stop-word set and therefore remain in the result.
filtered_list = [
    token
    for token in words
    if token.casefold() not in stop_words
]

In [55]:
# Display the tokens that survived stop-word removal.
filtered_list

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'subfield',
 'computer',
 'science',
 'liguistics',
 '.',
 "'s",
 'concerned',
 'giving',
 'computers',
 'ability',
 'support',
 'manipulate',
 'human',
 'language',
 '.']

In [56]:
from nltk.stem import PorterStemmer
# Porter stemmer instance bound to the shared name `stemmer`.
# NOTE(review): the name `stemmer` is rebound to a SnowballStemmer further down,
# so any cell that reads `stemmer` depends on execution order.
stemmer=PorterStemmer()

In [57]:
# Stem every token with the Porter algorithm.
# A dedicated instance is created here instead of relying on the shared name
# `stemmer`: that name is rebound to a SnowballStemmer in a later cell, so using
# it would make this cell's result depend on which cell ran last.
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(word) for word in words]

In [58]:
# Display the Porter stems (lower-cased and truncated, e.g. 'natur', 'languag').
stemmed_words

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'comput',
 'scienc',
 'and',
 'liguist',
 '.',
 'it',
 "'s",
 'concern',
 'with',
 'give',
 'comput',
 'the',
 'abil',
 'to',
 'support',
 'and',
 'manipul',
 'human',
 'languag',
 '.']

In [59]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer. This rebinds `stemmer`, which until now referred to
# the PorterStemmer created earlier.
stemmer=SnowballStemmer('english')

In [60]:
# Stem every token again, this time with the Snowball stemmer bound to `stemmer`
# by the previous cell (assuming the cells are run in order).
stemmed_words2 = [stemmer.stem(word) for word in words]

In [61]:
# Display the Snowball stems; for this text they match the Porter result above.
stemmed_words2

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'comput',
 'scienc',
 'and',
 'liguist',
 '.',
 'it',
 "'s",
 'concern',
 'with',
 'give',
 'comput',
 'the',
 'abil',
 'to',
 'support',
 'and',
 'manipul',
 'human',
 'languag',
 '.']

In [62]:
from nltk.stem import WordNetLemmatizer

In [63]:
# WordNet-based lemmatizer used by the lemmatize(...) calls below.
# NOTE(review): the variable name is spelled "lematizer" (single "m"); it is
# left as-is because later cells refer to it by this name.
lematizer=WordNetLemmatizer()

In [64]:
# Fetch the 'wordnet' corpus before the first lemmatize call.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
# Irregular plural: 'thieves' is reduced to 'thief'.
lematizer.lemmatize('thieves')

'thief'

In [66]:
# With no pos argument the superlative comes back unchanged ('worst').
lematizer.lemmatize('worst')

'worst'

In [67]:
# Passing pos='a' (adjective) lets the lemmatizer map 'worst' to 'bad'.
lematizer.lemmatize('worst', pos='a')

'bad'

In [68]:
# nltk was already imported near the top; the repeated import is harmless.
import nltk
# Fetch the English averaged-perceptron tagger data — presumably the model used
# by nltk.pos_tag in the next cell.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [69]:
# Part-of-speech tagging: returns a list of (token, tag) pairs such as ('Natural', 'JJ').
nltk.pos_tag(words)

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('subfield', 'NN'),
 ('of', 'IN'),
 ('computer', 'NN'),
 ('science', 'NN'),
 ('and', 'CC'),
 ('liguistics', 'NNS'),
 ('.', '.'),
 ('It', 'PRP'),
 ("'s", 'VBZ'),
 ('concerned', 'VBN'),
 ('with', 'IN'),
 ('giving', 'VBG'),
 ('computers', 'NNS'),
 ('the', 'DT'),
 ('ability', 'NN'),
 ('to', 'TO'),
 ('support', 'VB'),
 ('and', 'CC'),
 ('manipulate', 'VB'),
 ('human', 'JJ'),
 ('language', 'NN'),
 ('.', '.')]

In [70]:
pip install sentence-transformers



In [71]:
from sentence_transformers import SentenceTransformer

In [72]:
# Load the 'all-MiniLM-L6-v2' model; its encode() output has 384 dimensions (see len(embedding) below).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [73]:
# Encode the whole sample text as one embedding vector.
embedding = model.encode(example)

In [74]:
# Print the raw embedding values.
# NOTE(review): this dumps all 384 numbers; a slice such as embedding[:8] would be easier to read.
print(embedding)

[ 7.63483951e-03 -2.36063749e-02  5.47219552e-02 -1.71165857e-02
  3.10756341e-02 -1.75687801e-02  3.06161549e-02  2.50275619e-02
 -1.83027908e-02  3.55548449e-02 -1.18125752e-02 -2.19016522e-03
 -8.74949328e-05  1.07853748e-02  6.95303902e-02  8.55788067e-02
 -9.09885019e-03 -3.43363211e-02 -6.09980784e-02 -4.36937399e-02
  2.71246191e-02  8.74860138e-02 -8.61152932e-02 -4.70523126e-02
  4.03631441e-02  8.86770263e-02 -3.70660760e-02 -5.63942641e-02
  4.81498092e-02  2.91466359e-02  5.00762078e-04  1.36128636e-02
  3.46877314e-02  1.01392016e-01 -3.01228836e-02  6.57725781e-02
  4.27579827e-04  1.64930604e-03 -2.77404878e-02 -2.12131217e-02
 -8.86597559e-02 -2.40936112e-02 -5.53331226e-02  3.82887735e-03
  1.16902128e-01  7.38398451e-03 -1.21722750e-01  2.84699760e-02
 -9.28271264e-02 -1.24198589e-02 -1.14037119e-01  3.08112055e-02
  5.90500608e-02  8.94933641e-02 -6.63039833e-02  2.94000506e-02
  1.28335701e-02 -2.28534155e-02 -2.03745961e-02 -1.02535233e-01
 -3.49764414e-02 -4.64759

In [75]:
# Dimensionality of the embedding vector.
len(embedding)

384

In [76]:
import numpy as np

In [77]:
# Words whose embeddings are compared pairwise below.
# NOTE(review): this rebinds `words`, which earlier held the token list produced by
# word_tokenize(example); re-running the filtering, stemming or tagging cells after
# this one would operate on these five words instead.
words = [
    'computer',
    'laptop',
    'car',
    'motorbike',
    'flower',
]

In [78]:
# Encode the list in one call; the result holds one embedding per word (5 here).
embeddings = model.encode(words)

In [79]:
# Number of embeddings returned — one per input word.
len(embeddings)

5

In [80]:
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
import itertools

In [82]:
# Compute the full pairwise cosine-similarity matrix once, instead of calling
# cosine_similarity separately for every pair of words.
similarity_matrix = cosine_similarity(embeddings)

# Report each unordered pair of words exactly once, in the same order as before.
# ("podobienstwo" in the printed line is Polish for "similarity".)
for (i, word1), (j, word2) in itertools.combinations(enumerate(words), 2):
    sim = similarity_matrix[i, j]
    print(f"{word1:10} {word2:10}, podobienstwo {sim: .4F}")

computer   laptop    , podobienstwo  0.7140
computer   car       , podobienstwo  0.5321
computer   motorbike , podobienstwo  0.3913
computer   flower    , podobienstwo  0.3406
laptop     car       , podobienstwo  0.4516
laptop     motorbike , podobienstwo  0.4111
laptop     flower    , podobienstwo  0.2536
car        motorbike , podobienstwo  0.5413
car        flower    , podobienstwo  0.3884
motorbike  flower    , podobienstwo  0.2705
