# Подготовка

In [1]:
import nltk
nltk.download(['gutenberg'])

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [2]:
ls /root/nltk_data/corpora/gutenberg

austen-emma.txt          carroll-alice.txt        README
austen-persuasion.txt    chesterton-ball.txt      shakespeare-caesar.txt
austen-sense.txt         chesterton-brown.txt     shakespeare-hamlet.txt
bible-kjv.txt            chesterton-thursday.txt  shakespeare-macbeth.txt
blake-poems.txt          edgeworth-parents.txt    whitman-leaves.txt
bryant-stories.txt       melville-moby_dick.txt
burgess-busterbrown.txt  milton-paradise.txt


In [3]:
import re
import nltk
import math
import numpy as np
import pandas as pd

# Некоторые функции NLTK

## Токенизация

In [4]:
some_text = """We produce about two million dollars for each hour we work.  The
fifty hours is one conservative estimate for how long it we take
to get any etext selected, entered, proofread, edited, copyright
searched and analyzed, the copyright letters written, etc.  This
projected audience is one hundred million readers.  If our value
per text is nominally estimated at one dollar, then we produce 2
million dollars per hour this year we, will have to do four text
files per month:  thus upping our productivity from one million.
The Goal of Project Gutenberg is to Give Away One Trillion Etext
Files by the December 31, 2001.  [10,000 x 100,000,000=Trillion]
This is ten thousand titles each to one hundred million readers,
which is 10% of the expected number of computer users by the end
of the year 2001."""

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
sentences = nltk.sent_tokenize(some_text)
sentences

['We produce about two million dollars for each hour we work.',
 'The\nfifty hours is one conservative estimate for how long it we take\nto get any etext selected, entered, proofread, edited, copyright\nsearched and analyzed, the copyright letters written, etc.',
 'This\nprojected audience is one hundred million readers.',
 'If our value\nper text is nominally estimated at one dollar, then we produce 2\nmillion dollars per hour this year we, will have to do four text\nfiles per month:  thus upping our productivity from one million.',
 'The Goal of Project Gutenberg is to Give Away One Trillion Etext\nFiles by the December 31, 2001.',
 '[10,000 x 100,000,000=Trillion]\nThis is ten thousand titles each to one hundred million readers,\nwhich is 10% of the expected number of computer users by the end\nof the year 2001.']

In [7]:
words = [nltk.word_tokenize(s) for s in sentences]
words

[['We',
  'produce',
  'about',
  'two',
  'million',
  'dollars',
  'for',
  'each',
  'hour',
  'we',
  'work',
  '.'],
 ['The',
  'fifty',
  'hours',
  'is',
  'one',
  'conservative',
  'estimate',
  'for',
  'how',
  'long',
  'it',
  'we',
  'take',
  'to',
  'get',
  'any',
  'etext',
  'selected',
  ',',
  'entered',
  ',',
  'proofread',
  ',',
  'edited',
  ',',
  'copyright',
  'searched',
  'and',
  'analyzed',
  ',',
  'the',
  'copyright',
  'letters',
  'written',
  ',',
  'etc',
  '.'],
 ['This',
  'projected',
  'audience',
  'is',
  'one',
  'hundred',
  'million',
  'readers',
  '.'],
 ['If',
  'our',
  'value',
  'per',
  'text',
  'is',
  'nominally',
  'estimated',
  'at',
  'one',
  'dollar',
  ',',
  'then',
  'we',
  'produce',
  '2',
  'million',
  'dollars',
  'per',
  'hour',
  'this',
  'year',
  'we',
  ',',
  'will',
  'have',
  'to',
  'do',
  'four',
  'text',
  'files',
  'per',
  'month',
  ':',
  'thus',
  'upping',
  'our',
  'productivity',
  'from

## Лемматизация и стемминг слова

In [8]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
!python -m spacy download ru_core_news_md

2022-12-30 18:10:34.885243: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-md==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0-py3-none-any.whl (41.9 MB)
[K     |████████████████████████████████| 41.9 MB 1.8 MB/s 
Collecting pymorphy2>=0.9
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.9 MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 41.8 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
Building

In [10]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [11]:
stemmer = PorterStemmer()
snowball_en = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [12]:
word = 'dogs'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word))

dog
dog
dog


In [13]:
word = 'walked'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

walk
walk
walk


In [14]:
word = 'drove'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

drove
drove
drive


In [15]:
word = 'seen'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

seen
seen
see


In [16]:
word = 'домами'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(WordNetLemmatizer().lemmatize(word))

домами
домами
домами


In [17]:
import spacy
snowball_ru = SnowballStemmer('russian')
model = spacy.load("ru_core_news_md")



In [18]:
word = 'собаки'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

собак
собака


In [19]:
word = 'собаками'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

собак
собака


In [20]:
word = 'ходил'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

ход
ходить


In [21]:
word = 'прохаживал'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

прохажива
прохаживать


In [22]:
word = 'прохаживался'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

прохажива
прохаживаться


## Стоп-слова

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
filtered_words = [word for word in words[0] if not word.lower() in stop_words]
filtered_words_2 = list(filter(lambda s: s.lower() not in stop_words, words[0]))
print(words[0])
print(filtered_words)
print(filtered_words_2)

['We', 'produce', 'about', 'two', 'million', 'dollars', 'for', 'each', 'hour', 'we', 'work', '.']
['produce', 'two', 'million', 'dollars', 'hour', 'work', '.']
['produce', 'two', 'million', 'dollars', 'hour', 'work', '.']


In [25]:
print(stopwords.raw('russian')[:30])

и
в
во
не
что
он
на
я
с
со
как


# Мешок слов
По сути, это мультимножество (счётчик слов), используемое как представление текста.

In [26]:
reviews = [
           'This pasta is very tasty and affordable.',
           'This pasta is not tasty and is affordable.',
           'This pasta is delicious and cheap.',
           'Pasta is tasty and pasta tastes good.',
]

In [27]:
from itertools import chain

# Use a dedicated name for the flattened token stream: rebinding `words` here
# would replace the per-sentence token lists built in the tokenization section
# with a one-shot iterator, so earlier cells that index `words[0]` could not be
# re-run.
review_tokens = chain.from_iterable(map(nltk.word_tokenize, reviews))
# Lower-case every token and keep the distinct ones as the vocabulary.
unique_words = set(map(str.lower, review_tokens))
unique_words

{'.',
 'affordable',
 'and',
 'cheap',
 'delicious',
 'good',
 'is',
 'not',
 'pasta',
 'tastes',
 'tasty',
 'this',
 'very'}

In [28]:
import pandas as pd
vocabulary = pd.Series(list(unique_words))
vocabulary

0           very
1           this
2          pasta
3          tasty
4          cheap
5      delicious
6            not
7              .
8            and
9     affordable
10        tastes
11          good
12            is
dtype: object

In [29]:
lookup = pd.Series({v: k for k, v in vocabulary.items()})
lookup

very           0
this           1
pasta          2
tasty          3
cheap          4
delicious      5
not            6
.              7
and            8
affordable     9
tastes        10
good          11
is            12
dtype: int64

In [30]:
review_words = [nltk.word_tokenize(r) for r in reviews]
review_words[0]

['This', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable', '.']

In [31]:
[[w.lower() for w in review] for review in review_words]

[['this', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable', '.'],
 ['this', 'pasta', 'is', 'not', 'tasty', 'and', 'is', 'affordable', '.'],
 ['this', 'pasta', 'is', 'delicious', 'and', 'cheap', '.'],
 ['pasta', 'is', 'tasty', 'and', 'pasta', 'tastes', 'good', '.']]

In [32]:
[[lookup[w.lower()] for w in review] for review in review_words]


[[1, 2, 12, 0, 3, 8, 9, 7],
 [1, 2, 12, 6, 3, 8, 12, 9, 7],
 [1, 2, 12, 5, 8, 4, 7],
 [2, 12, 3, 8, 2, 10, 11, 7]]

In [33]:
from collections import Counter
[Counter(lookup[w.lower()] for w in review) for review in review_words]


[Counter({1: 1, 2: 1, 12: 1, 0: 1, 3: 1, 8: 1, 9: 1, 7: 1}),
 Counter({1: 1, 2: 1, 12: 2, 6: 1, 3: 1, 8: 1, 9: 1, 7: 1}),
 Counter({1: 1, 2: 1, 12: 1, 5: 1, 8: 1, 4: 1, 7: 1}),
 Counter({2: 2, 12: 1, 3: 1, 8: 1, 10: 1, 11: 1, 7: 1})]

In [34]:
def word_frequencies(words, lookup):
  """Count how often each vocabulary entry occurs in ``words``.

  Args:
    words: iterable of tokens; each token is lower-cased before lookup.
    lookup: pandas Series mapping a lower-cased token to its vocabulary id.

  Returns:
    A pandas Series indexed by the ids in ``lookup.values``; each value is
    the number of occurrences of the corresponding token (0 if absent).

  Raises:
    KeyError: if a lower-cased token is not present in ``lookup``.
  """
  counters = pd.Series(0, index=lookup.values)
  for token in (w.lower() for w in words):
    token_id = lookup[token]
    counters[token_id] += 1
  return counters

word_frequencies(review_words[0], lookup)

0     1
1     1
2     1
3     1
4     0
5     0
6     0
7     1
8     1
9     1
10    0
11    0
12    1
dtype: int64

In [35]:
freqs = pd.DataFrame([word_frequencies(r_w, lookup) for r_w in review_words])
freqs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,1,1,1,0,0,0,1,1,1,0,0,1
1,0,1,1,1,0,0,1,1,1,1,0,0,2
2,0,1,1,0,1,1,0,1,1,0,0,0,1
3,0,0,2,1,0,0,0,1,1,0,1,1,1


In [36]:
freqs.columns = lookup.index
freqs

Unnamed: 0,very,this,pasta,tasty,cheap,delicious,not,.,and,affordable,tastes,good,is
0,1,1,1,1,0,0,0,1,1,1,0,0,1
1,0,1,1,1,0,0,1,1,1,1,0,0,2
2,0,1,1,0,1,1,0,1,1,0,0,0,1
3,0,0,2,1,0,0,0,1,1,0,1,1,1


In [37]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [38]:
vectorizer.vocabulary_

{'this': 10,
 'pasta': 7,
 'is': 5,
 'very': 11,
 'tasty': 9,
 'and': 1,
 'affordable': 0,
 'not': 6,
 'delicious': 3,
 'cheap': 2,
 'tastes': 8,
 'good': 4}

In [39]:
df = freqs.reindex(sorted(freqs.columns), axis=1).drop(columns='.')
df

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [40]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,affordable,cheap,delicious,good,pasta,tastes,tasty
0,1,0,0,0,1,0,1
1,1,0,0,0,1,0,1
2,0,1,1,0,1,0,0
3,0,0,0,1,2,1,1


# N-граммы

In [41]:
review_words[0]

['This', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable', '.']

In [42]:
from nltk import ngrams

bigrams = ngrams(review_words[0], 2)
trigrams = ngrams(review_words[0], 3)
fourgrams = ngrams(review_words[0], 4)

bigrams, trigrams, fourgrams

(<zip at 0x7f905d734180>, <zip at 0x7f905d64c280>, <zip at 0x7f905d64cb40>)

In [43]:
list(bigrams), list(trigrams), list(fourgrams)

([('This', 'pasta'),
  ('pasta', 'is'),
  ('is', 'very'),
  ('very', 'tasty'),
  ('tasty', 'and'),
  ('and', 'affordable'),
  ('affordable', '.')],
 [('This', 'pasta', 'is'),
  ('pasta', 'is', 'very'),
  ('is', 'very', 'tasty'),
  ('very', 'tasty', 'and'),
  ('tasty', 'and', 'affordable'),
  ('and', 'affordable', '.')],
 [('This', 'pasta', 'is', 'very'),
  ('pasta', 'is', 'very', 'tasty'),
  ('is', 'very', 'tasty', 'and'),
  ('very', 'tasty', 'and', 'affordable'),
  ('tasty', 'and', 'affordable', '.')])

In [44]:
from collections import Counter
alice_words = nltk.word_tokenize(nltk.corpus.gutenberg.raw("carroll-alice.txt"))
ng = ngrams(alice_words, 2)
Counter(ng).most_common(30)

[((',', 'and'), 450),
 ((',', "'"), 429),
 (("'", 'said'), 329),
 (('!', "'"), 283),
 (('.', "'"), 262),
 (('said', 'the'), 206),
 (("'", 'I'), 169),
 (('?', "'"), 157),
 (('of', 'the'), 127),
 (('said', 'Alice'), 115),
 (('in', 'a'), 95),
 ((',', 'I'), 81),
 (("'", 'the'), 81),
 (('Alice', ','), 78),
 (('in', 'the'), 76),
 (('and', 'the'), 72),
 (('to', 'the'), 69),
 (('it', 'was'), 62),
 (('the', 'Queen'), 62),
 ((',', 'as'), 61),
 ((',', 'but'), 60),
 (('at', 'the'), 60),
 (('it', ','), 57),
 (('*', '*'), 57),
 (('as', 'she'), 56),
 (('a', 'little'), 56),
 (("'", 'Alice'), 56),
 (('she', 'had'), 55),
 (('the', 'King'), 55),
 (('Mock', 'Turtle'), 55)]

In [45]:
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform([nltk.corpus.gutenberg.raw("carroll-alice.txt")])
X

<1x14587 sparse matrix of type '<class 'numpy.int64'>'
	with 14587 stored elements in Compressed Sparse Row format>

In [46]:
pd.Series(X.toarray()[0], index=vectorizer.get_feature_names_out()).sort_values(ascending=False).head(20)


said the       210
of the         133
said alice     116
and the         82
in the          80
it was          76
the queen       72
to the          69
the king        62
as she          61
at the          60
she had         60
and she         56
mock turtle     56
she was         55
the mock        53
the gryphon     53
the hatter      52
to be           52
went on         48
dtype: int64

# TF-IDF

In [47]:
reviews

['This pasta is very tasty and affordable.',
 'This pasta is not tasty and is affordable.',
 'This pasta is delicious and cheap.',
 'Pasta is tasty and pasta tastes good.']

In [48]:
docs = [r_w[:-1] for r_w in review_words]
docs

[['This', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable'],
 ['This', 'pasta', 'is', 'not', 'tasty', 'and', 'is', 'affordable'],
 ['This', 'pasta', 'is', 'delicious', 'and', 'cheap'],
 ['Pasta', 'is', 'tasty', 'and', 'pasta', 'tastes', 'good']]

In [49]:
docs[0]

['This', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable']

In [50]:
def tf(word, doc):
    """Raw term frequency of ``word`` in ``doc``.

    Args:
        word: the token to count.
        doc: the tokenized document (any object exposing ``.count``).

    Returns:
        The value of ``doc.count(word)``; it is not normalised by the
        document length.
    """
    occurrences = doc.count(word)
    return occurrences

tf('pasta', docs[0]), len(docs[0])

(1, 7)

In [51]:
def df(word, docs):
    """Document frequency: the number of documents that contain ``word``.

    Args:
        word: the token to look for.
        docs: iterable of documents; membership is tested with ``in``.

    Returns:
        How many entries of ``docs`` contain ``word`` (0 for an empty corpus).
    """
    # Each membership test yields a bool; summing the bools counts the hits.
    return sum(word in tokens for tokens in docs)

df('pasta', docs)

4

In [52]:
def idf(word, docs):
    """Smoothed inverse document frequency of ``word`` over ``docs``.

    Computes ``ln((1 + N) / (1 + df)) + 1`` where ``N`` is ``len(docs)`` and
    ``df`` is the result of ``df(word, docs)``. The added ones keep the ratio
    defined when ``df`` is zero.

    Args:
        word: the token to score.
        docs: the collection of documents.

    Returns:
        The smoothed IDF as a float.
    """
    total_docs = len(docs)
    containing_docs = df(word, docs)
    ratio = (1 + total_docs) / (1 + containing_docs)
    return 1 + math.log(ratio)

idf('pasta', docs) 

1.0

In [53]:
def tf_idf(word, doc, docs):
    """TF-IDF weight of ``word`` in ``doc`` relative to the corpus ``docs``.

    Args:
        word: the token to score.
        doc: the document in which the term frequency is measured.
        docs: the corpus over which the inverse document frequency is measured.

    Returns:
        ``tf(word, doc)`` multiplied by ``idf(word, docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_document_frequency = idf(word, docs)
    return term_frequency * inverse_document_frequency

In [54]:
[tf_idf(w, docs[0], docs) for w in docs[0]]

[1.2231435513142097,
 1.0,
 1.0,
 1.916290731874155,
 1.2231435513142097,
 1.0,
 1.5108256237659907]

In [55]:
v1 = list(zip(docs[0], [tf_idf(w, docs[0], docs) for w in docs[0]]))
v1

[('This', 1.2231435513142097),
 ('pasta', 1.0),
 ('is', 1.0),
 ('very', 1.916290731874155),
 ('tasty', 1.2231435513142097),
 ('and', 1.0),
 ('affordable', 1.5108256237659907)]

In [56]:
import math
norm = math.sqrt(sum(v**2 for _, v in v1))
[(k, v/norm) for k, v in v1]

[('This', 0.35387458240192243),
 ('pasta', 0.28931565883800064),
 ('is', 0.28931565883800064),
 ('very', 0.5544129156173256),
 ('tasty', 0.35387458240192243),
 ('and', 0.28931565883800064),
 ('affordable', 0.4371055107291909)]

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(reviews)

feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,0.437106,0.289316,0.0,0.0,0.0,0.289316,0.0,0.289316,0.0,0.353875,0.353875,0.554413
1,0.390785,0.258657,0.0,0.0,0.0,0.517314,0.495662,0.258657,0.0,0.316375,0.316375,0.0
2,0.0,0.290614,0.556901,0.556901,0.0,0.290614,0.0,0.290614,0.0,0.0,0.355463,0.0
3,0.0,0.259583,0.0,0.0,0.497437,0.259583,0.0,0.519167,0.497437,0.317508,0.0,0.0


## Извлечение ключевых слов

In [58]:
names = nltk.corpus.gutenberg.fileids()
names

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [59]:
texts = [nltk.corpus.gutenberg.raw(n) for n in names]

In [60]:
corpus = pd.DataFrame({'Name': names, 'Text': texts})
corpus

Unnamed: 0,Name,Text
0,austen-emma.txt,[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAP...
1,austen-persuasion.txt,[Persuasion by Jane Austen 1818]\n\n\nChapter ...
2,austen-sense.txt,[Sense and Sensibility by Jane Austen 1811]\n\...
3,bible-kjv.txt,[The King James Bible]\n\nThe Old Testament of...
4,blake-poems.txt,[Poems by William Blake 1789]\n\n \nSONGS OF I...
5,bryant-stories.txt,[Stories to Tell to Children by Sara Cone Brya...
6,burgess-busterbrown.txt,[The Adventures of Buster Bear by Thornton W. ...
7,carroll-alice.txt,[Alice's Adventures in Wonderland by Lewis Car...
8,chesterton-ball.txt,[The Ball and The Cross by G.K. Chesterton 190...
9,chesterton-brown.txt,[The Wisdom of Father Brown by G. K. Chesterto...


In [61]:
# corpus['Text'] = corpus['Text'].apply(lambda t: t.lower())

In [62]:
corpus

Unnamed: 0,Name,Text
0,austen-emma.txt,[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAP...
1,austen-persuasion.txt,[Persuasion by Jane Austen 1818]\n\n\nChapter ...
2,austen-sense.txt,[Sense and Sensibility by Jane Austen 1811]\n\...
3,bible-kjv.txt,[The King James Bible]\n\nThe Old Testament of...
4,blake-poems.txt,[Poems by William Blake 1789]\n\n \nSONGS OF I...
5,bryant-stories.txt,[Stories to Tell to Children by Sara Cone Brya...
6,burgess-busterbrown.txt,[The Adventures of Buster Bear by Thornton W. ...
7,carroll-alice.txt,[Alice's Adventures in Wonderland by Lewis Car...
8,chesterton-ball.txt,[The Ball and The Cross by G.K. Chesterton 190...
9,chesterton-brown.txt,[The Wisdom of Father Brown by G. K. Chesterto...


In [63]:
vectorizer=CountVectorizer()
vectors = vectorizer.fit_transform(corpus['Text'])
vectors

<18x42063 sparse matrix of type '<class 'numpy.int64'>'
	with 121698 stored elements in Compressed Sparse Row format>

In [64]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer().fit(vectors)

In [65]:
feature_names = vectorizer.get_feature_names_out()
feature_names[2000:2010]

array(['annexed', 'annexment', 'annie', 'annihilate', 'annihilated',
       'annihilating', 'annihilation', 'anno', 'annoint', 'annotations'],
      dtype=object)

In [66]:
doc = corpus["Text"][0]

In [67]:
tf_idf_vector=tfidf.transform(vectorizer.transform([doc]))
tf_idf_vector

<1x42063 sparse matrix of type '<class 'numpy.float64'>'
	with 7239 stored elements in Compressed Sparse Row format>

In [68]:
from scipy.sparse import coo_matrix
from typing import Dict

def vector_to_dict(vector: coo_matrix) -> Dict[int, float]:
    """Map the column index of every stored entry of ``vector`` to its value.

    Args:
        vector: a sparse matrix in COO format; only its ``col`` and ``data``
            arrays are read.

    Returns:
        A dict pairing each element of ``vector.col`` with the element of
        ``vector.data`` at the same position. If a column index occurs more
        than once, the last value wins.
    """
    column_ids = vector.col
    stored_values = vector.data
    return dict(zip(column_ids, stored_values))


In [69]:
token_scores = vector_to_dict(tf_idf_vector.tocoo())
token_scores = pd.DataFrame(token_scores.items(), columns=["word_id", "score"])
token_scores = token_scores.sort_values("score", ascending=False)
token_scores

Unnamed: 0,word_id,score
703,37449,0.364045
778,36954,0.361405
6760,1938,0.340211
2763,25584,0.298171
3516,20241,0.175665
...,...,...
4603,13982,0.000077
5565,9205,0.000073
6302,4984,0.000073
4271,15834,0.000069


In [70]:
token_scores['word'] = np.array(feature_names)[token_scores.word_id]
token_scores.head(10)

Unnamed: 0,word_id,score,word
703,37449,0.364045,to
778,36954,0.361405,the
6760,1938,0.340211,and
2763,25584,0.298171,of
3516,20241,0.175665,it
4001,17684,0.171565,her
4945,12467,0.171053,emma
229,40697,0.166631,was
1410,32894,0.162601,she
3793,19055,0.152039,in


In [71]:
token_scores.tail(10)

Unnamed: 0,word_id,score,word
3983,17818,7.7e-05,hid
1374,33226,7.7e-05,shoulder
1679,31073,7.7e-05,ring
4562,14135,7.7e-05,feet
1185,34399,7.7e-05,sounded
4603,13982,7.7e-05,fat
5565,9205,7.3e-05,cry
6302,4984,7.3e-05,bound
4271,15834,6.9e-05,gently
1941,30011,6.9e-05,red


In [72]:
token_scores.head()[['word', 'score']].values

array([['to', 0.36404524901817875],
       ['the', 0.3614047223026432],
       ['and', 0.34021102103321305],
       ['of', 0.29817105622008117],
       ['it', 0.17566451412826034]], dtype=object)

In [73]:
def get_keywords(text, n=10, tfidf=tfidf, vectorizer=vectorizer):
    """Return the ``n`` highest-scoring TF-IDF terms of ``text``.

    Args:
        text: the raw document; it is passed to ``vectorizer.transform``.
        n: how many top terms to keep (slice semantics, like ``head(n)``).
        tfidf: object whose ``transform`` turns the count matrix into TF-IDF
            weights. The default is bound when this function is defined, so
            rebinding the global ``tfidf`` later does not affect it.
        vectorizer: object providing ``transform`` and
            ``get_feature_names_out``. The default is bound at definition
            time as well.

    Returns:
        A dict mapping each selected term to its score rounded to 3 decimals,
        ordered from the highest score to the lowest. Empty when the
        transformed row has no stored entries.
    """
    # Single-row sparse result; COO exposes parallel `col` / `data` arrays.
    row = tfidf.transform(vectorizer.transform([text])).tocoo()

    # Stable sort on the negated scores: descending by score, and equal scores
    # keep the order in which they are stored in the row.
    top = np.argsort(-row.data, kind="stable")[:n]

    feature_names = np.asarray(vectorizer.get_feature_names_out())
    words = feature_names[row.col[top]]
    scores = np.round(row.data[top], 3)
    return {str(word): float(score) for word, score in zip(words, scores)}

In [74]:
keywords = get_keywords(corpus["Text"][7])

In [75]:
for k in keywords:
    print(k, keywords[k])

the 0.582
alice 0.361
and 0.309
to 0.259
it 0.211
she 0.196
of 0.182
said 0.164
you 0.146
in 0.131


In [76]:
corpus['Keywords'] = corpus["Text"].map(get_keywords)
corpus.Keywords[7]

{'the': 0.582,
 'alice': 0.361,
 'and': 0.309,
 'to': 0.259,
 'it': 0.211,
 'she': 0.196,
 'of': 0.182,
 'said': 0.164,
 'you': 0.146,
 'in': 0.131}

In [77]:
corpus['kw'] = corpus["Keywords"].map(lambda d: " ".join(d.keys()))
corpus.kw[7]

'the alice and to it she of said you in'

In [78]:
corpus[["Name", "kw"]]

Unnamed: 0,Name,kw
0,austen-emma.txt,to the and of it her emma was she in
1,austen-persuasion.txt,the to and of in was her had she it
2,austen-sense.txt,to the of and her elinor in was it she
3,bible-kjv.txt,the and of unto to that in he shall lord
4,blake-poems.txt,the and of in to my with thee his he
5,bryant-stories.txt,the and to he of was in it little his
6,burgess-busterbrown.txt,he the buster and to of that it was joe
7,carroll-alice.txt,the alice and to it she of said you in
8,chesterton-ball.txt,the and of turnbull to in macian he that it
9,chesterton-brown.txt,the and of to he in was it his that


In [79]:
pd.set_option("max_colwidth", 200)

In [80]:
corpus[["Name", "Keywords"]]

Unnamed: 0,Name,Keywords
0,austen-emma.txt,"{'to': 0.364, 'the': 0.361, 'and': 0.34, 'of': 0.298, 'it': 0.176, 'her': 0.172, 'emma': 0.171, 'was': 0.167, 'she': 0.163, 'in': 0.152}"
1,austen-persuasion.txt,"{'the': 0.43, 'to': 0.363, 'and': 0.362, 'of': 0.332, 'in': 0.18, 'was': 0.173, 'her': 0.156, 'had': 0.153, 'she': 0.148, 'it': 0.134}"
2,austen-sense.txt,"{'to': 0.376, 'the': 0.375, 'of': 0.326, 'and': 0.319, 'her': 0.233, 'elinor': 0.203, 'in': 0.181, 'was': 0.17, 'it': 0.16, 'she': 0.147}"
3,bible-kjv.txt,"{'the': 0.619, 'and': 0.5, 'of': 0.335, 'unto': 0.174, 'to': 0.131, 'that': 0.125, 'in': 0.123, 'he': 0.101, 'shall': 0.1, 'lord': 0.09}"
4,blake-poems.txt,"{'the': 0.638, 'and': 0.506, 'of': 0.212, 'in': 0.205, 'to': 0.161, 'my': 0.121, 'with': 0.096, 'thee': 0.089, 'his': 0.083, 'he': 0.081}"
5,bryant-stories.txt,"{'the': 0.686, 'and': 0.417, 'to': 0.235, 'he': 0.202, 'of': 0.163, 'was': 0.142, 'in': 0.127, 'it': 0.122, 'little': 0.119, 'his': 0.11}"
6,burgess-busterbrown.txt,"{'he': 0.389, 'the': 0.378, 'buster': 0.374, 'and': 0.296, 'to': 0.25, 'of': 0.196, 'that': 0.177, 'it': 0.172, 'was': 0.157, 'joe': 0.15}"
7,carroll-alice.txt,"{'the': 0.582, 'alice': 0.361, 'and': 0.309, 'to': 0.259, 'it': 0.211, 'she': 0.196, 'of': 0.182, 'said': 0.164, 'you': 0.146, 'in': 0.131}"
8,chesterton-ball.txt,"{'the': 0.619, 'and': 0.333, 'of': 0.319, 'turnbull': 0.221, 'to': 0.197, 'in': 0.176, 'macian': 0.173, 'he': 0.166, 'that': 0.146, 'it': 0.142}"
9,chesterton-brown.txt,"{'the': 0.674, 'and': 0.32, 'of': 0.302, 'to': 0.201, 'he': 0.196, 'in': 0.181, 'was': 0.165, 'it': 0.142, 'his': 0.139, 'that': 0.134}"


In [81]:
pd.set_option("max_colwidth", 80)

## Сходство документов

In [82]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('count', vectorizer), ('idf', tfidf)])
tf_idf_vector = pipe.transform(corpus.Text)
tf_idf_vector

<18x42063 sparse matrix of type '<class 'numpy.float64'>'
	with 121698 stored elements in Compressed Sparse Row format>

In [83]:
from scipy.spatial import distance
print(distance.euclidean([10, 10], [13, 14]))


5.0


In [84]:
# A sparse row's .toarray() is 2-D with shape (1, n_features), while
# distance.euclidean is documented for 1-D vectors, so flatten both rows first.
distance.euclidean(tf_idf_vector[7].toarray().ravel(), tf_idf_vector[8].toarray().ravel())

0.6122816888239713

In [85]:
corpus

Unnamed: 0,Name,Text,Keywords,kw
0,austen-emma.txt,"[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, han...","{'to': 0.364, 'the': 0.361, 'and': 0.34, 'of': 0.298, 'it': 0.176, 'her': 0....",to the and of it her emma was she in
1,austen-persuasion.txt,"[Persuasion by Jane Austen 1818]\n\n\nChapter 1\n\n\nSir Walter Elliot, of K...","{'the': 0.43, 'to': 0.363, 'and': 0.362, 'of': 0.332, 'in': 0.18, 'was': 0.1...",the to and of in was her had she it
2,austen-sense.txt,[Sense and Sensibility by Jane Austen 1811]\n\nCHAPTER 1\n\n\nThe family of ...,"{'to': 0.376, 'the': 0.375, 'of': 0.326, 'and': 0.319, 'her': 0.233, 'elinor...",to the of and her elinor in was it she
3,bible-kjv.txt,[The King James Bible]\n\nThe Old Testament of the King James Bible\n\nThe F...,"{'the': 0.619, 'and': 0.5, 'of': 0.335, 'unto': 0.174, 'to': 0.131, 'that': ...",the and of unto to that in he shall lord
4,blake-poems.txt,[Poems by William Blake 1789]\n\n \nSONGS OF INNOCENCE AND OF EXPERIENCE\nan...,"{'the': 0.638, 'and': 0.506, 'of': 0.212, 'in': 0.205, 'to': 0.161, 'my': 0....",the and of in to my with thee his he
5,bryant-stories.txt,[Stories to Tell to Children by Sara Cone Bryant 1918] \r\n\r\n\r\nTWO LITTL...,"{'the': 0.686, 'and': 0.417, 'to': 0.235, 'he': 0.202, 'of': 0.163, 'was': 0...",the and to he of was in it little his
6,burgess-busterbrown.txt,[The Adventures of Buster Bear by Thornton W. Burgess 1920]\r\n\r\nI\r\n\r\n...,"{'he': 0.389, 'the': 0.378, 'buster': 0.374, 'and': 0.296, 'to': 0.25, 'of':...",he the buster and to of that it was joe
7,carroll-alice.txt,[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down ...,"{'the': 0.582, 'alice': 0.361, 'and': 0.309, 'to': 0.259, 'it': 0.211, 'she'...",the alice and to it she of said you in
8,chesterton-ball.txt,[The Ball and The Cross by G.K. Chesterton 1909]\n\n\nI. A DISCUSSION SOMEWH...,"{'the': 0.619, 'and': 0.333, 'of': 0.319, 'turnbull': 0.221, 'to': 0.197, 'i...",the and of turnbull to in macian he that it
9,chesterton-brown.txt,[The Wisdom of Father Brown by G. K. Chesterton 1914]\n\n\nI. The Absence of...,"{'the': 0.674, 'and': 0.32, 'of': 0.302, 'to': 0.201, 'he': 0.196, 'in': 0.1...",the and of to he in was it his that


In [86]:
a = corpus[['Name']].reset_index()
cross = a.merge(a, how='cross')
cross

Unnamed: 0,index_x,Name_x,index_y,Name_y
0,0,austen-emma.txt,0,austen-emma.txt
1,0,austen-emma.txt,1,austen-persuasion.txt
2,0,austen-emma.txt,2,austen-sense.txt
3,0,austen-emma.txt,3,bible-kjv.txt
4,0,austen-emma.txt,4,blake-poems.txt
...,...,...,...,...
319,17,whitman-leaves.txt,13,milton-paradise.txt
320,17,whitman-leaves.txt,14,shakespeare-caesar.txt
321,17,whitman-leaves.txt,15,shakespeare-hamlet.txt
322,17,whitman-leaves.txt,16,shakespeare-macbeth.txt


In [87]:
from itertools import product

product_ = pd.DataFrame(product(corpus.index, corpus.index), columns=['id1', 'id2'])
product_

Unnamed: 0,id1,id2
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
319,17,13
320,17,14
321,17,15
322,17,16


In [88]:
corpus.Name.loc[product_.id1]

0        austen-emma.txt
0        austen-emma.txt
0        austen-emma.txt
0        austen-emma.txt
0        austen-emma.txt
             ...        
17    whitman-leaves.txt
17    whitman-leaves.txt
17    whitman-leaves.txt
17    whitman-leaves.txt
17    whitman-leaves.txt
Name: Name, Length: 324, dtype: object

In [89]:
product_['Name1'] = corpus.Name.loc[product_.id1].values
product_['Name2'] = corpus.Name.loc[product_.id2].values
product_

Unnamed: 0,id1,id2,Name1,Name2
0,0,0,austen-emma.txt,austen-emma.txt
1,0,1,austen-emma.txt,austen-persuasion.txt
2,0,2,austen-emma.txt,austen-sense.txt
3,0,3,austen-emma.txt,bible-kjv.txt
4,0,4,austen-emma.txt,blake-poems.txt
...,...,...,...,...
319,17,13,whitman-leaves.txt,milton-paradise.txt
320,17,14,whitman-leaves.txt,shakespeare-caesar.txt
321,17,15,whitman-leaves.txt,shakespeare-hamlet.txt
322,17,16,whitman-leaves.txt,shakespeare-macbeth.txt


In [90]:
def euclidean_distance(id1, id2, tf_idf_vector=tf_idf_vector):
    """Return the Euclidean distance between two rows of a TF-IDF matrix.

    Parameters
    ----------
    id1, id2
        Row positions of the two documents in ``tf_idf_vector``.
    tf_idf_vector
        Matrix whose indexed rows expose ``.toarray()`` (presumably a SciPy
        sparse matrix -- confirm where it is built). The default is the
        notebook-level ``tf_idf_vector`` as bound when this function is defined.

    Returns
    -------
    The value computed by ``distance.euclidean`` for the two rows.
    """
    # Flatten each densified row before handing it to SciPy: a row slice turned
    # into an array is presumably 2-D with shape (1, n), while
    # distance.euclidean is documented for 1-D vectors.
    first_row = tf_idf_vector[id1].toarray().ravel()
    second_row = tf_idf_vector[id2].toarray().ravel()
    return distance.euclidean(first_row, second_row)

# Distance between the TF-IDF rows of the two documents in every pair.
product_['Distance'] = [
    euclidean_distance(first_id, second_id)
    for first_id, second_id in zip(product_['id1'], product_['id2'])
]
product_

Unnamed: 0,id1,id2,Name1,Name2,Distance
0,0,0,austen-emma.txt,austen-emma.txt,0.000000
1,0,1,austen-emma.txt,austen-persuasion.txt,0.431002
2,0,2,austen-emma.txt,austen-sense.txt,0.458749
3,0,3,austen-emma.txt,bible-kjv.txt,0.740704
4,0,4,austen-emma.txt,blake-poems.txt,0.753086
...,...,...,...,...,...
319,17,13,whitman-leaves.txt,milton-paradise.txt,0.532938
320,17,14,whitman-leaves.txt,shakespeare-caesar.txt,0.850927
321,17,15,whitman-leaves.txt,shakespeare-hamlet.txt,0.727884
322,17,16,whitman-leaves.txt,shakespeare-macbeth.txt,0.724485


In [91]:
# Order the pairs from the smallest distance to the largest.
result = product_.sort_values('Distance', ascending=True)
result

Unnamed: 0,id1,id2,Name1,Name2,Distance
0,0,0,austen-emma.txt,austen-emma.txt,0.000000
57,3,3,bible-kjv.txt,bible-kjv.txt,0.000000
76,4,4,blake-poems.txt,blake-poems.txt,0.000000
95,5,5,bryant-stories.txt,bryant-stories.txt,0.000000
114,6,6,burgess-busterbrown.txt,burgess-busterbrown.txt,0.000000
...,...,...,...,...,...
259,14,7,shakespeare-caesar.txt,carroll-alice.txt,0.916476
124,6,16,burgess-busterbrown.txt,shakespeare-macbeth.txt,0.918573
294,16,6,shakespeare-macbeth.txt,burgess-busterbrown.txt,0.918573
122,6,14,burgess-busterbrown.txt,shakespeare-caesar.txt,0.967108


In [92]:
# Skip the zero-distance rows and show the ten closest remaining rows.
# Every pair is listed twice, once as (a, b) and once as (b, a).
result.loc[result['Distance'].gt(0)].head(10)

Unnamed: 0,id1,id2,Name1,Name2,Distance
225,12,9,melville-moby_dick.txt,chesterton-brown.txt,0.314317
174,9,12,chesterton-brown.txt,melville-moby_dick.txt,0.314317
99,5,9,bryant-stories.txt,chesterton-brown.txt,0.336497
167,9,5,chesterton-brown.txt,bryant-stories.txt,0.336497
153,8,9,chesterton-ball.txt,chesterton-brown.txt,0.345937
170,9,8,chesterton-brown.txt,chesterton-ball.txt,0.345937
233,12,17,melville-moby_dick.txt,whitman-leaves.txt,0.356927
318,17,12,whitman-leaves.txt,melville-moby_dick.txt,0.356927
172,9,10,chesterton-brown.txt,chesterton-thursday.txt,0.366997
189,10,9,chesterton-thursday.txt,chesterton-brown.txt,0.366997


# Задание
1. Извлечь ключевые слова из всех текстов в корпусе после устранения стоп-слов
2. Найти ключевые триграммы для текстов (без устранения стоп-слов)


## 1. Ключевые слова (после устранения стоп-слов)

In [93]:
# Rebuild the corpus table from the raw file names and texts.
corpus = pd.DataFrame({'Name': names, 'Text': texts})
# Show up to 200 characters per cell so the start of each text is readable.
# The option is addressed by its full name: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds a similar option.
pd.set_option("display.max_colwidth", 200)
corpus

Unnamed: 0,Name,Text
0,austen-emma.txt,"[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof ..."
1,austen-persuasion.txt,"[Persuasion by Jane Austen 1818]\n\n\nChapter 1\n\n\nSir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,\nfor his own amusement, never took up any book but the Baronetage;\nthere..."
2,austen-sense.txt,"[Sense and Sensibility by Jane Austen 1811]\n\nCHAPTER 1\n\n\nThe family of Dashwood had long been settled in Sussex.\nTheir estate was large, and their residence was at Norland Park,\nin the cent..."
3,bible-kjv.txt,[The King James Bible]\n\nThe Old Testament of the King James Bible\n\nThe First Book of Moses: Called Genesis\n\n\n1:1 In the beginning God created the heaven and the earth.\n\n1:2 And the earth...
4,blake-poems.txt,"[Poems by William Blake 1789]\n\n \nSONGS OF INNOCENCE AND OF EXPERIENCE\nand THE BOOK of THEL\n\n\n SONGS OF INNOCENCE\n \n \n INTRODUCTION\n \n Piping down the valleys wild,\n Piping songs of ..."
5,bryant-stories.txt,"[Stories to Tell to Children by Sara Cone Bryant 1918] \r\n\r\n\r\nTWO LITTLE RIDDLES IN RHYME\r\n\r\n\r\n There's a garden that I ken,\r\n Full of little gentlemen;\r\n Little caps of..."
6,burgess-busterbrown.txt,[The Adventures of Buster Bear by Thornton W. Burgess 1920]\r\n\r\nI\r\n\r\nBUSTER BEAR GOES FISHING\r\n\r\n\r\nBuster Bear yawned as he lay on his comfortable bed of leaves and\r\nwatched the fir...
7,carroll-alice.txt,"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to..."
8,chesterton-ball.txt,[The Ball and The Cross by G.K. Chesterton 1909]\n\n\nI. A DISCUSSION SOMEWHAT IN THE AIR\n\nThe flying ship of Professor Lucifer sang through the skies like\na silver arrow; the bleak white steel...
9,chesterton-brown.txt,"[The Wisdom of Father Brown by G. K. Chesterton 1914]\n\n\nI. The Absence of Mr Glass\n\n\nTHE consulting-rooms of Dr Orion Hood, the eminent criminologist\nand specialist in certain moral disorde..."


In [94]:
# Term counts over the raw texts, with the vectorizer's built-in English
# stop-word list applied.
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(texts)

# Fit the TF-IDF weighting on those counts; fit() hands back the transformer.
tfidf = TfidfTransformer()
tfidf = tfidf.fit(vectors)

In [95]:
# One keyword dict per document, produced by get_keywords with a limit of 10
# and the fitted tfidf / vectorizer pair.
keywords = list(map(lambda text: get_keywords(text, 10, tfidf, vectorizer), corpus["Text"]))
keywords

[{'emma': 0.465,
  'mr': 0.337,
  'harriet': 0.311,
  'weston': 0.27,
  'knightley': 0.239,
  'elton': 0.237,
  'mrs': 0.231,
  'woodhouse': 0.192,
  'fairfax': 0.148,
  'miss': 0.148},
 {'elliot': 0.402,
  'anne': 0.396,
  'wentworth': 0.303,
  'mrs': 0.217,
  'captain': 0.213,
  'musgrove': 0.181,
  'russell': 0.18,
  'mr': 0.169,
  'charles': 0.166,
  'walter': 0.141},
 {'elinor': 0.589,
  'marianne': 0.426,
  'mrs': 0.245,
  'dashwood': 0.217,
  'jennings': 0.198,
  'willoughby': 0.163,
  'lucy': 0.14,
  'edward': 0.139,
  'brandon': 0.124,
  'ferrars': 0.112},
 {'unto': 0.52,
  'shall': 0.3,
  'lord': 0.27,
  'thou': 0.207,
  'thy': 0.164,
  'thee': 0.161,
  'israel': 0.16,
  'god': 0.144,
  'ye': 0.135,
  '12': 0.122},
 {'thee': 0.265,
  'thel': 0.225,
  'weep': 0.223,
  'thou': 0.197,
  'little': 0.194,
  'thy': 0.166,
  'like': 0.151,
  'love': 0.147,
  'joy': 0.141,
  'infant': 0.129},
 {'little': 0.491,
  'said': 0.373,
  'margery': 0.195,
  'jackal': 0.187,
  'came': 0.157,


In [96]:
# Store each document's keyword dict next to it and show only the relevant columns.
corpus['Keywords'] = keywords
corpus.loc[:, ["Name", "Keywords"]]

Unnamed: 0,Name,Keywords
0,austen-emma.txt,"{'emma': 0.465, 'mr': 0.337, 'harriet': 0.311, 'weston': 0.27, 'knightley': 0.239, 'elton': 0.237, 'mrs': 0.231, 'woodhouse': 0.192, 'fairfax': 0.148, 'miss': 0.148}"
1,austen-persuasion.txt,"{'elliot': 0.402, 'anne': 0.396, 'wentworth': 0.303, 'mrs': 0.217, 'captain': 0.213, 'musgrove': 0.181, 'russell': 0.18, 'mr': 0.169, 'charles': 0.166, 'walter': 0.141}"
2,austen-sense.txt,"{'elinor': 0.589, 'marianne': 0.426, 'mrs': 0.245, 'dashwood': 0.217, 'jennings': 0.198, 'willoughby': 0.163, 'lucy': 0.14, 'edward': 0.139, 'brandon': 0.124, 'ferrars': 0.112}"
3,bible-kjv.txt,"{'unto': 0.52, 'shall': 0.3, 'lord': 0.27, 'thou': 0.207, 'thy': 0.164, 'thee': 0.161, 'israel': 0.16, 'god': 0.144, 'ye': 0.135, '12': 0.122}"
4,blake-poems.txt,"{'thee': 0.265, 'thel': 0.225, 'weep': 0.223, 'thou': 0.197, 'little': 0.194, 'thy': 0.166, 'like': 0.151, 'love': 0.147, 'joy': 0.141, 'infant': 0.129}"
5,bryant-stories.txt,"{'little': 0.491, 'said': 0.373, 'margery': 0.195, 'jackal': 0.187, 'came': 0.157, 'king': 0.136, 'nightingale': 0.112, 'brahmin': 0.103, 'went': 0.1, 'big': 0.099}"
6,burgess-busterbrown.txt,"{'buster': 0.683, 'joe': 0.274, 'little': 0.217, 'blacky': 0.177, 'farmer': 0.172, 'bear': 0.168, 'otter': 0.137, 'billy': 0.133, 'brown': 0.132, 'sammy': 0.125}"
7,carroll-alice.txt,"{'alice': 0.795, 'said': 0.361, 'gryphon': 0.122, 'hatter': 0.112, 'duchess': 0.107, 'dormouse': 0.102, 'little': 0.1, 'turtle': 0.099, 'rabbit': 0.08, 'know': 0.069}"
8,chesterton-ball.txt,"{'turnbull': 0.66, 'macian': 0.516, 'said': 0.243, 'evan': 0.171, 'man': 0.126, 'like': 0.121, 'really': 0.06, 'god': 0.058, 'quite': 0.057, 'did': 0.054}"
9,chesterton-brown.txt,"{'flambeau': 0.301, 'said': 0.296, 'brown': 0.256, 'like': 0.236, 'man': 0.219, 'father': 0.173, 'priest': 0.148, 'boulnois': 0.12, 'muscari': 0.114, 'mr': 0.111}"


## 2. Ключевые триграммы (без устранения стоп-слов)

In [97]:
# Lower-case every text so trigrams that differ only in letter case are merged.
# The vectorised .str accessor replaces a per-element Python lambda.
corpus['Text'] = corpus['Text'].str.lower()
corpus[["Name", "Text"]]

Unnamed: 0,Name,Text
0,austen-emma.txt,"[emma by jane austen 1816]\n\nvolume i\n\nchapter i\n\n\nemma woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof ..."
1,austen-persuasion.txt,"[persuasion by jane austen 1818]\n\n\nchapter 1\n\n\nsir walter elliot, of kellynch hall, in somersetshire, was a man who,\nfor his own amusement, never took up any book but the baronetage;\nthere..."
2,austen-sense.txt,"[sense and sensibility by jane austen 1811]\n\nchapter 1\n\n\nthe family of dashwood had long been settled in sussex.\ntheir estate was large, and their residence was at norland park,\nin the cent..."
3,bible-kjv.txt,[the king james bible]\n\nthe old testament of the king james bible\n\nthe first book of moses: called genesis\n\n\n1:1 in the beginning god created the heaven and the earth.\n\n1:2 and the earth...
4,blake-poems.txt,"[poems by william blake 1789]\n\n \nsongs of innocence and of experience\nand the book of thel\n\n\n songs of innocence\n \n \n introduction\n \n piping down the valleys wild,\n piping songs of ..."
5,bryant-stories.txt,"[stories to tell to children by sara cone bryant 1918] \r\n\r\n\r\ntwo little riddles in rhyme\r\n\r\n\r\n there's a garden that i ken,\r\n full of little gentlemen;\r\n little caps of..."
6,burgess-busterbrown.txt,[the adventures of buster bear by thornton w. burgess 1920]\r\n\r\ni\r\n\r\nbuster bear goes fishing\r\n\r\n\r\nbuster bear yawned as he lay on his comfortable bed of leaves and\r\nwatched the fir...
7,carroll-alice.txt,"[alice's adventures in wonderland by lewis carroll 1865]\n\nchapter i. down the rabbit-hole\n\nalice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to..."
8,chesterton-ball.txt,[the ball and the cross by g.k. chesterton 1909]\n\n\ni. a discussion somewhat in the air\n\nthe flying ship of professor lucifer sang through the skies like\na silver arrow; the bleak white steel...
9,chesterton-brown.txt,"[the wisdom of father brown by g. k. chesterton 1914]\n\n\ni. the absence of mr glass\n\n\nthe consulting-rooms of dr orion hood, the eminent criminologist\nand specialist in certain moral disorde..."


In [98]:
# Split every document on whitespace and collect its word trigrams as tuples
# (punctuation stays attached to the words).
texts_list = [list(nltk.ngrams(document.lower().split(), 3)) for document in corpus['Text'].tolist()]

# The documents are already lists of trigrams, so the analyzer returns them as-is.
vectorizer = CountVectorizer(analyzer=lambda x: x)
vectors = vectorizer.fit_transform(texts_list)

# Column labels come from the fitted vocabulary, ordered by feature index.
# get_feature_names() was removed in scikit-learn 1.2; reading vocabulary_
# works on every version and keeps each trigram as a tuple.
pd.DataFrame(
    vectors.toarray(),
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)



Unnamed: 0,"("", ""a, promise?"")","("", ""i, never)","("", ""oh,, shut)","("", ""what, sort)","("", he, set)","("", said, hal,)","("", so, genuine)","("", the, professor)","("", the, younger)","("", there, was)",...,"(}, you, lingering)","(}, youth,, day,)","(}[i], the, pilot)","(}[ii], had, i)","(}[iii], you, tides)","(}[iv], last, of)","(}[v], and, yet)","(}[vi], proudly, the)","(}[vii], by, that)","(}[viii], then, last)"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
# Fit the TF-IDF weighting on the trigram counts.
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit(vectors)

# Trigram for each feature index, as a list ordered by index.
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping (term -> index) gives the same list on every version and keeps each
# trigram as a tuple.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)



In [100]:
def sort(coo_matrix):
    """Return the (column, value) pairs of ``coo_matrix`` in descending order.

    Pairs are ordered by value first and by column index second, both from
    largest to smallest.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def top_in_vector(feature_names, sorted_items, top_n=10):
    """Map the leading (feature index, score) pairs to ``{feature: score}``.

    Parameters
    ----------
    feature_names
        Sequence indexed by feature position.
    sorted_items
        Sequence of ``(feature_index, score)`` pairs, already in the order
        in which they should be reported.
    top_n
        Number of leading pairs to keep; defaults to 10, the limit that was
        previously hard-coded.

    Returns
    -------
    dict
        Feature name -> score rounded to 3 decimals, in input order.
    """
    return {
        feature_names[index]: round(score, 3)
        for index, score in sorted_items[:top_n]
    }

In [101]:
# TF-IDF weight of every trigram in every document. `vectors` already holds the
# trigram counts for texts_list, so it is reused instead of vectorising the
# whole corpus a second time.
tfidf_vector = tfidf.transform(vectors)

# One {trigram: score} dict per document, limited to the highest-weighted
# entries by top_in_vector. The comprehension keeps the per-document
# temporaries local, so the notebook-level `keywords` list is not overwritten.
results = [
    top_in_vector(feature_names, sort(tfidf_vector[row].tocoo()))
    for row in range(tfidf_vector.shape[0])
]

results

[{('i', 'do', 'not'): 0.083,
  ('she', 'could', 'not'): 0.082,
  ('emma', 'could', 'not'): 0.075,
  ('i', 'am', 'sure'): 0.067,
  ('would', 'have', 'been'): 0.065,
  ('she', 'had', 'been'): 0.063,
  ('mr.', 'and', 'mrs.'): 0.059,
  ('mr.', 'frank', 'churchill'): 0.055,
  ('a', 'great', 'deal'): 0.052,
  ('it', 'would', 'be'): 0.05},
 {('she', 'could', 'not'): 0.098,
  ('she', 'had', 'been'): 0.068,
  ('sir', 'walter', 'and'): 0.056,
  ('mr', 'and', 'mrs'): 0.053,
  ('captain', 'wentworth', 'was'): 0.05,
  ('anne', 'could', 'not'): 0.05,
  ('mr', 'elliot', 'was'): 0.046,
  ('a', 'great', 'deal'): 0.043,
  ('and', 'captain', 'wentworth'): 0.043,
  ('he', 'had', 'been'): 0.042},
 {('as', 'soon', 'as'): 0.066,
  ('she', 'could', 'not'): 0.058,
  ('i', 'am', 'sure'): 0.056,
  ('would', 'have', 'been'): 0.051,
  ('could', 'not', 'be'): 0.047,
  ('mrs.', 'jennings', 'was'): 0.043,
  ('elinor', 'could', 'not'): 0.041,
  ('the', 'miss', 'steeles'): 0.038,
  ('the', 'miss', 'dashwoods'): 0.038,


In [102]:
# Store each document's trigram dict next to it and show only the relevant columns.
corpus['Trigrams'] = results
corpus.loc[:, ["Name", "Trigrams"]]

Unnamed: 0,Name,Trigrams
0,austen-emma.txt,"{('i', 'do', 'not'): 0.083, ('she', 'could', 'not'): 0.082, ('emma', 'could', 'not'): 0.075, ('i', 'am', 'sure'): 0.067, ('would', 'have', 'been'): 0.065, ('she', 'had', 'been'): 0.063, ('mr.', 'a..."
1,austen-persuasion.txt,"{('she', 'could', 'not'): 0.098, ('she', 'had', 'been'): 0.068, ('sir', 'walter', 'and'): 0.056, ('mr', 'and', 'mrs'): 0.053, ('captain', 'wentworth', 'was'): 0.05, ('anne', 'could', 'not'): 0.05,..."
2,austen-sense.txt,"{('as', 'soon', 'as'): 0.066, ('she', 'could', 'not'): 0.058, ('i', 'am', 'sure'): 0.056, ('would', 'have', 'been'): 0.051, ('could', 'not', 'be'): 0.047, ('mrs.', 'jennings', 'was'): 0.043, ('eli..."
3,bible-kjv.txt,"{('the', 'son', 'of'): 0.242, ('the', 'children', 'of'): 0.242, ('of', 'the', 'lord'): 0.195, ('and', 'the', 'lord'): 0.176, ('said', 'unto', 'him,'): 0.137, ('of', 'the', 'lord,'): 0.134, ('thus'..."
4,blake-poems.txt,"{('the', 'human', 'form'): 0.048, ('never', 'can', 'it'): 0.048, ('can', 'it', 'be!'): 0.048, ('the', 'vales', 'of'): 0.038, ('welcome', 'in', 'the'): 0.036, ('vales', 'of', 'har,'): 0.036, ('to',..."
5,bryant-stories.txt,"{('the', 'little', 'jackal'): 0.144, ('said', 'the', 'little'): 0.121, ('the', 'little', 'red'): 0.114, ('the', 'little', 'fir'): 0.097, ('the', 'little', 'gingerbread'): 0.08, ('little', 'gingerb..."
6,burgess-busterbrown.txt,"{('farmer', 'brown's', 'boy'): 0.458, ('the', 'green', 'forest'): 0.184, ('little', 'joe', 'otter'): 0.161, ('the', 'green', 'forest,'): 0.119, ('in', 'the', 'green'): 0.094, ('the', 'laughing', '..."
7,carroll-alice.txt,"{('*', '*', '*'): 0.216, ('the', 'mock', 'turtle'): 0.173, ('said', 'the', 'mock'): 0.106, ('the', 'march', 'hare'): 0.084, ('said', 'the', 'caterpillar.'): 0.067, ('the', 'white', 'rabbit'): 0.06..."
8,chesterton-ball.txt,"{('a', 'sort', 'of'): 0.084, ('with', 'a', 'sort'): 0.052, ('a', 'kind', 'of'): 0.039, ('out', 'of', 'the'): 0.036, ('said', 'the', 'other,'): 0.036, ('said', 'turnbull,', 'with'): 0.034, ('that',..."
9,chesterton-brown.txt,"{('said', 'father', 'brown,'): 0.106, ('said', 'father', 'brown.'): 0.055, ('one', 'of', 'the'): 0.048, ('a', 'sort', 'of'): 0.044, ('the', 'little', 'priest'): 0.044, ('father', 'brown', 'was'): ..."
