# Подготовка

In [None]:
pip install 'spacy>=3.0.0'

In [None]:
import nltk
nltk.download(['gutenberg'])

In [None]:
ls /root/nltk_data/corpora/gutenberg

In [None]:
import re
import nltk
import math
import numpy as np
import pandas as pd

# NLTK

## Токенизация

In [None]:
some_text = """We produce about two million dollars for each hour we work.  The
fifty hours is one conservative estimate for how long it we take
to get any etext selected, entered, proofread, edited, copyright
searched and analyzed, the copyright letters written, etc.  This
projected audience is one hundred million readers.  If our value
per text is nominally estimated at one dollar, then we produce 2
million dollars per hour this year we, will have to do four text
files per month:  thus upping our productivity from one million.
The Goal of Project Gutenberg is to Give Away One Trillion Etext
Files by the December 31, 2001.  [10,000 x 100,000,000=Trillion]
This is ten thousand titles each to one hundred million readers,
which is 10% of the expected number of computer users by the end
of the year 2001."""

In [None]:
nltk.download('punkt')

In [None]:
sentences = nltk.sent_tokenize(some_text)
sentences

In [None]:
words = [nltk.word_tokenize(s) for s in sentences]
words

## Лемматизация и стемминг слова

In [None]:
nltk.download('wordnet')

In [None]:
!python -m spacy download ru_core_news_sm

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet

In [None]:
stemmer = PorterStemmer()
snowball_en = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [None]:
word = 'dogs'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word))

In [None]:
word = 'walked'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
word = 'drove'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
word = 'seen'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
import spacy
snowball_ru = SnowballStemmer('russian')
nlp = spacy.load("ru_core_news_sm")

In [None]:
word = 'собаки'
print(snowball_ru.stem(word))
for token in nlp(word):
  print(nlp.get_pipe("lemmatizer").lookup_lemmatize(token))
  print(nlp.get_pipe("lemmatizer").rule_lemmatize(token))

In [None]:
word = 'собаками'
print(snowball_ru.stem(word))
for token in nlp(word):
  print(nlp.get_pipe("lemmatizer").lookup_lemmatize(token))
  print(nlp.get_pipe("lemmatizer").rule_lemmatize(token))

In [None]:
word = 'ходил'
print(snowball_ru.stem(word))
for token in nlp(word):
  print(nlp.get_pipe("lemmatizer").lookup_lemmatize(token))
  print(nlp.get_pipe("lemmatizer").rule_lemmatize(token))

In [None]:
word = 'прохаживался'
print(snowball_ru.stem(word))
for token in nlp(word):
  print(nlp.get_pipe("lemmatizer").lookup_lemmatize(token))
  print(nlp.get_pipe("lemmatizer").rule_lemmatize(token))

## Стоп-слова

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# The stop-word list is lowercase, so compare tokens case-insensitively;
# otherwise capitalised tokens such as "We" or "The" are never filtered out.
stop_words = set(stopwords.words("english"))
without_stop_words = [word for word in words[0] if word.lower() not in stop_words]
words[0], without_stop_words

In [None]:
print(stopwords.raw('russian')[:30])

## Мешок слов

In [None]:
reviews = [
           'This pasta is very tasty and affordable.',
           'This pasta is not tasty and is affordable.',
           'This pasta is delicious and cheap.',
           'Pasta is tasty and pasta tastes good.',
]

In [None]:
from itertools import chain

words = chain(*[nltk.word_tokenize(r) for r in reviews])
unique_words = set([w.lower() for w in words])
unique_words, len(unique_words)

In [None]:
keys = list(unique_words)
values = range(len(unique_words))
pairs = list(zip(keys, values))
pairs

In [None]:
lower_reviews = [r.lower() for r in reviews]
lower_reviews

In [None]:
lower_reviews[0]

In [None]:
review_words = [nltk.word_tokenize(r) for r in lower_reviews]
review_words[0]

In [None]:
[(v, review_words[0].count(k)) for k, v in pairs]

In [None]:
def word_frequencies(words, vocabulary):
    """Count how often each vocabulary term occurs in a token sequence.

    Args:
        words: Sequence of tokens; only its ``count`` method is used.
        vocabulary: Iterable of ``(term, index)`` pairs.

    Returns:
        A list of ``(index, occurrences)`` tuples, in vocabulary order.
    """
    counts = []
    for term, index in vocabulary:
        counts.append((index, words.count(term)))
    return counts

word_frequencies(review_words[0], pairs)

In [None]:
freqs = [word_frequencies(r_w, pairs) for r_w in review_words]
freqs

In [None]:
feature_names, _ = list(zip(*pairs))
vectors = [list(zip(*f))[1] for f in freqs]

In [None]:
df = pd.DataFrame(vectors, columns=feature_names)
df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
vectorizer.vocabulary_

In [None]:
df = df.reindex(sorted(df.columns), axis=1)
df

In [None]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

## N-граммы

In [None]:
review_words[0]

In [None]:
from nltk import ngrams

bigrams = ngrams(review_words[0], 2)
trigrams = ngrams(review_words[0], 3)
fourgrams = ngrams(review_words[0], 4)

bigrams, trigrams, fourgrams

In [None]:
list(bigrams), list(trigrams), list(fourgrams)

In [None]:
from collections import Counter
alice_words = nltk.word_tokenize(nltk.corpus.gutenberg.raw("carroll-alice.txt"))
ng = ngrams(alice_words, 2)
Counter(ng).most_common(30)

# TF-IDF

In [None]:
reviews

In [None]:
docs = [r_w[:-1] for r_w in review_words]
docs

In [None]:
docs[0]

In [None]:
def tf(word, doc):
    """Return the raw term frequency of ``word`` in ``doc``.

    The count is not normalised by the document length.

    Args:
        word: Term to count.
        doc: Token sequence; only its ``count`` method is used.

    Returns:
        Number of occurrences of ``word`` in ``doc``.
    """
    occurrences = doc.count(word)
    return occurrences

tf('pasta', docs[0]), len(docs[0])

In [None]:
def df(word, docs):
    """Return the document frequency of ``word``.

    Args:
        word: Term to look up.
        docs: Iterable of documents, each supporting the ``in`` operator.

    Returns:
        Number of documents that contain ``word``.
    """
    # NOTE(review): this name rebinds the ``df`` DataFrame built in the
    # bag-of-words cells earlier in the notebook.
    containing = [doc for doc in docs if word in doc]
    return len(containing)

df('pasta', docs)

In [None]:
def idf(word, docs, smooth=False):
    """Return the inverse document frequency of ``word`` over ``docs``.

    Computes ``ln((s + N) / (s + df)) + 1`` where ``N`` is the number of
    documents, ``df`` is the number of documents containing ``word`` and
    ``s`` is 1 when ``smooth`` is true, otherwise 0. The default
    (``smooth=False``) reproduces the previous hard-coded ``0 +`` offsets.

    Args:
        word: Term to weigh.
        docs: Sized collection of documents (``len`` is taken of it).
        smooth: Add one to numerator and denominator, as if one extra
            document contained every term; this avoids division by zero.

    Returns:
        The IDF weight as a float.

    Raises:
        ZeroDivisionError: If ``smooth`` is false and no document
            contains ``word``.
    """
    offset = 1 if smooth else 0
    total = len(docs)
    return math.log((offset + total) / (offset + df(word, docs))) + 1

idf('pasta', docs) 

In [None]:
def tf_idf(word, doc, docs):
    """Return the TF-IDF weight of ``word`` in ``doc`` within ``docs``.

    Args:
        word: Term to weigh.
        doc: The document the term frequency is taken from.
        docs: The document collection the IDF is taken over.

    Returns:
        Product of ``tf(word, doc)`` and ``idf(word, docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_document_frequency = idf(word, docs)
    return term_frequency * inverse_document_frequency

In [None]:
[tf_idf(w, docs[0], docs) for w in docs[0]]

In [None]:
list(zip(docs[0], [tf_idf(w, docs[0], docs) for w in docs[0]]))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(smooth_idf=False, norm=None)
values = tfidf_vectorizer.fit_transform(reviews)

feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(values.toarray(), columns = feature_names)

## Извлечение ключевых слов

In [None]:
names = nltk.corpus.gutenberg.fileids()
names

In [None]:
texts = [nltk.corpus.gutenberg.raw(n) for n in names]

In [None]:
corpus = pd.DataFrame({'Name': names, 'Text': texts})
corpus

In [None]:
corpus['Text'] = corpus['Text'].apply(lambda t: t.lower())

In [None]:
corpus

Unnamed: 0,Name,Text
0,austen-emma.txt,[emma by jane austen 1816]\n\nvolume i\n\nchap...
1,austen-persuasion.txt,[persuasion by jane austen 1818]\n\n\nchapter ...
2,austen-sense.txt,[sense and sensibility by jane austen 1811]\n\...
3,bible-kjv.txt,[the king james bible]\n\nthe old testament of...
4,blake-poems.txt,[poems by william blake 1789]\n\n \nsongs of i...
5,bryant-stories.txt,[stories to tell to children by sara cone brya...
6,burgess-busterbrown.txt,[the adventures of buster bear by thornton w. ...
7,carroll-alice.txt,[alice's adventures in wonderland by lewis car...
8,chesterton-ball.txt,[the ball and the cross by g.k. chesterton 190...
9,chesterton-brown.txt,[the wisdom of father brown by g. k. chesterto...


In [None]:
vectorizer=CountVectorizer()
vectors = vectorizer.fit_transform(corpus['Text'])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer().fit(vectors)

In [None]:
feature_names = vectorizer.get_feature_names_out()
feature_names[2000:2010]

array(['annexed', 'annexment', 'annie', 'annihilate', 'annihilated',
       'annihilating', 'annihilation', 'anno', 'annoint', 'annotations'],
      dtype=object)

In [None]:
doc = corpus["Text"][0]

In [None]:
tf_idf_vector=tfidf.transform(vectorizer.transform([doc]))
tf_idf_vector

<1x42063 sparse matrix of type '<class 'numpy.float64'>'
	with 7239 stored elements in Compressed Sparse Row format>

In [None]:
from scipy.sparse import coo_matrix
from typing import Dict

def vector_to_dict(vector: coo_matrix) -> Dict[int, float]:
    """Map the column index of every stored entry to its value.

    Only ``vector.col`` and ``vector.data`` are read, so row indices are
    ignored: entries that share a column overwrite one another, with the
    last one winning.

    Args:
        vector: Sparse matrix in COO format.

    Returns:
        Dictionary from column index to stored value.
    """
    return dict(zip(vector.col, vector.data))


In [None]:
token_scores = vector_to_dict(tf_idf_vector.tocoo())
token_scores = pd.DataFrame(token_scores.items(), columns=["word_id", "score"])
token_scores = token_scores.sort_values("score", ascending=False)
token_scores

Unnamed: 0,word_id,score
703,37449,0.364045
778,36954,0.361405
6760,1938,0.340211
2763,25584,0.298171
3516,20241,0.175665
...,...,...
4603,13982,0.000077
5565,9205,0.000073
6302,4984,0.000073
4271,15834,0.000069


In [None]:
token_scores['word'] = np.array(feature_names)[token_scores.word_id]
token_scores.head(10)

Unnamed: 0,word_id,score,word
703,37449,0.364045,to
778,36954,0.361405,the
6760,1938,0.340211,and
2763,25584,0.298171,of
3516,20241,0.175665,it
4001,17684,0.171565,her
4945,12467,0.171053,emma
229,40697,0.166631,was
1410,32894,0.162601,she
3793,19055,0.152039,in


In [None]:
token_scores.tail(10)

Unnamed: 0,word_id,score,word
3983,17818,7.7e-05,hid
1374,33226,7.7e-05,shoulder
1679,31073,7.7e-05,ring
4562,14135,7.7e-05,feet
1185,34399,7.7e-05,sounded
4603,13982,7.7e-05,fat
5565,9205,7.3e-05,cry
6302,4984,7.3e-05,bound
4271,15834,6.9e-05,gently
1941,30011,6.9e-05,red


In [None]:
token_scores.head()[['word', 'score']].values

array([['to', 0.36404524901817875],
       ['the', 0.3614047223026432],
       ['and', 0.34021102103321305],
       ['of', 0.29817105622008117],
       ['it', 0.17566451412826034]], dtype=object)

In [None]:
def get_keywords(text, n=10, tfidf=tfidf, vectorizer=vectorizer):
    """Return the ``n`` highest-scoring TF-IDF terms of ``text``.

    Note that the ``tfidf`` and ``vectorizer`` defaults are bound when this
    function is defined; rebinding those module-level names later does not
    affect calls that rely on the defaults.

    Args:
        text: Raw document text.
        n: Number of top-scoring terms to keep.
        tfidf: Transformer whose ``transform`` turns counts into TF-IDF weights.
        vectorizer: Vectorizer providing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        Dictionary mapping each term to its score rounded to three decimals,
        inserted in descending score order.
    """
    # Generate the TF-IDF weights for the given document.
    weights = tfidf.transform(vectorizer.transform([text])).tocoo()
    scored = pd.DataFrame(
        vector_to_dict(weights).items(),
        columns=["word_id", "score"],
    )
    # Translate column indices back into the vocabulary terms.
    vocabulary = np.array(vectorizer.get_feature_names_out())
    scored["word"] = vocabulary[scored.word_id]
    best = scored.sort_values("score", ascending=False).head(n)
    best["score"] = np.round(best["score"], 3)
    return dict(best[["word", "score"]].values)

In [None]:
keywords = get_keywords(corpus["Text"][7])

In [None]:
for k in keywords:
    print(k, keywords[k])

the 0.582
alice 0.361
and 0.309
to 0.259
it 0.211
she 0.196
of 0.182
said 0.164
you 0.146
in 0.131


In [None]:
corpus['Keywords'] = corpus["Text"].map(get_keywords)
corpus.Keywords[7]

{'alice': 0.361,
 'and': 0.309,
 'in': 0.131,
 'it': 0.211,
 'of': 0.182,
 'said': 0.164,
 'she': 0.196,
 'the': 0.582,
 'to': 0.259,
 'you': 0.146}

In [None]:
corpus['kw'] = corpus["Keywords"].map(lambda d: " ".join(d.keys()))
corpus.kw[7]

'the alice and to it she of said you in'

In [None]:
corpus[["Name", "kw"]]

Unnamed: 0,Name,kw
0,austen-emma.txt,to the and of it her emma was she in
1,austen-persuasion.txt,the to and of in was her had she it
2,austen-sense.txt,to the of and her elinor in was it she
3,bible-kjv.txt,the and of unto to that in he shall lord
4,blake-poems.txt,the and of in to my with thee his he
5,bryant-stories.txt,the and to he of was in it little his
6,burgess-busterbrown.txt,he the buster and to of that it was joe
7,carroll-alice.txt,the alice and to it she of said you in
8,chesterton-ball.txt,the and of turnbull to in macian he that it
9,chesterton-brown.txt,the and of to he in was it his that


In [None]:
pd.set_option("max_colwidth", 200)

In [None]:
corpus[["Name", "Keywords"]]

Unnamed: 0,Name,Keywords
0,austen-emma.txt,"{'to': 0.364, 'the': 0.361, 'and': 0.34, 'of': 0.298, 'it': 0.176, 'her': 0.172, 'emma': 0.171, 'was': 0.167, 'she': 0.163, 'in': 0.152}"
1,austen-persuasion.txt,"{'the': 0.43, 'to': 0.363, 'and': 0.362, 'of': 0.332, 'in': 0.18, 'was': 0.173, 'her': 0.156, 'had': 0.153, 'she': 0.148, 'it': 0.134}"
2,austen-sense.txt,"{'to': 0.376, 'the': 0.375, 'of': 0.326, 'and': 0.319, 'her': 0.233, 'elinor': 0.203, 'in': 0.181, 'was': 0.17, 'it': 0.16, 'she': 0.147}"
3,bible-kjv.txt,"{'the': 0.619, 'and': 0.5, 'of': 0.335, 'unto': 0.174, 'to': 0.131, 'that': 0.125, 'in': 0.123, 'he': 0.101, 'shall': 0.1, 'lord': 0.09}"
4,blake-poems.txt,"{'the': 0.638, 'and': 0.506, 'of': 0.212, 'in': 0.205, 'to': 0.161, 'my': 0.121, 'with': 0.096, 'thee': 0.089, 'his': 0.083, 'he': 0.081}"
5,bryant-stories.txt,"{'the': 0.686, 'and': 0.417, 'to': 0.235, 'he': 0.202, 'of': 0.163, 'was': 0.142, 'in': 0.127, 'it': 0.122, 'little': 0.119, 'his': 0.11}"
6,burgess-busterbrown.txt,"{'he': 0.389, 'the': 0.378, 'buster': 0.374, 'and': 0.296, 'to': 0.25, 'of': 0.196, 'that': 0.177, 'it': 0.172, 'was': 0.157, 'joe': 0.15}"
7,carroll-alice.txt,"{'the': 0.582, 'alice': 0.361, 'and': 0.309, 'to': 0.259, 'it': 0.211, 'she': 0.196, 'of': 0.182, 'said': 0.164, 'you': 0.146, 'in': 0.131}"
8,chesterton-ball.txt,"{'the': 0.619, 'and': 0.333, 'of': 0.319, 'turnbull': 0.221, 'to': 0.197, 'in': 0.176, 'macian': 0.173, 'he': 0.166, 'that': 0.146, 'it': 0.142}"
9,chesterton-brown.txt,"{'the': 0.674, 'and': 0.32, 'of': 0.302, 'to': 0.201, 'he': 0.196, 'in': 0.181, 'was': 0.165, 'it': 0.142, 'his': 0.139, 'that': 0.134}"


In [None]:
pd.set_option("max_colwidth", 80)

## Сходство документов

In [None]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('count', vectorizer), ('idf', tfidf)])
tf_idf_vector = pipe.transform(corpus.Text)
tf_idf_vector

<18x42063 sparse matrix of type '<class 'numpy.float64'>'
	with 121698 stored elements in Compressed Sparse Row format>

In [None]:
from scipy.spatial import distance
print(distance.euclidean([10, 10], [13, 14]))


5.0


In [None]:
# toarray() on one sparse row gives a 2-D (1, n_features) array, while
# distance.euclidean is documented for 1-D vectors, so flatten both rows.
distance.euclidean(tf_idf_vector[7].toarray().ravel(), tf_idf_vector[8].toarray().ravel())

0.6122816888239713

In [None]:
corpus

In [None]:
a = corpus[['Name']].reset_index()
cross = a.merge(a, how='cross')
cross

In [None]:
from itertools import product

product_ = pd.DataFrame(product(corpus.index, corpus.index), columns=['id1', 'id2'])
product_

In [None]:
corpus.Name.loc[product_.id1]

In [None]:
product_['Name1'] = corpus.Name.loc[product_.id1].values
product_['Name2'] = corpus.Name.loc[product_.id2].values
product_

In [None]:
def euclidean_distance(id1, id2, tf_idf_vector=tf_idf_vector):
    """Return the Euclidean distance between two rows of a TF-IDF matrix.

    The ``tf_idf_vector`` default is bound when this function is defined.

    Args:
        id1: Row index of the first document.
        id2: Row index of the second document.
        tf_idf_vector: Sparse matrix with one row per document.

    Returns:
        The Euclidean distance between the two dense row vectors.
    """
    # toarray() on one sparse row gives a 2-D (1, n_features) array, while
    # distance.euclidean is documented for 1-D vectors, so flatten both rows.
    first = tf_idf_vector[id1].toarray().ravel()
    second = tf_idf_vector[id2].toarray().ravel()
    return distance.euclidean(first, second)

product_['Distance'] = product_.apply(lambda x: euclidean_distance(x.id1, x.id2), axis=1)
product_

In [None]:
result = product_.sort_values(by=['Distance'])
result

In [None]:
result[result['Distance'] > 0].head(10)

# Задание
1. Извлечь ключевые слова, устранив стоп-слова (к текущей реализации добавить исключение стоп-слов);
2. Посмотреть, как поменялись расстояния между текстами после устранения стоп-слов;
3. Найти ключевые триграммы для текстов в корпусе Гутенберга;
4. Отсортировать документы по близости векторов TF-IDF на основе триграмм, используя [косинусное расстояние](https://en.wikipedia.org/wiki/Cosine_similarity).