## Предобработка текста

In [None]:
import nltk
import spacy
import re

### Токенизация

In [None]:
# Fetch the Punkt tokenizer data that word_tokenize relies on, then import
# the word-level tokenizer used in the next cell.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample sentence; it is lower-cased before tokenizing so that "All" and "all"
# end up as the same token.
data = "All work and no play makes jack a dull boy, all work and no play"
lowered_text = data.lower()
tokens = word_tokenize(lowered_text)
print(tokens)

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


### Удаление неинформативных слов

#### N-граммы

<img src="https://res.cloudinary.com/practicaldev/image/fetch/s--466CQV1q--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_66%2Cw_880/https://thepracticaldev.s3.amazonaws.com/i/78nf1vryed8h1tz05fim.gif" height=400>

In [None]:
# Build every 1-gram and 2-gram over the token sequence, then preview the
# first five of each.
unigram, bigram = (list(nltk.ngrams(tokens, size)) for size in (1, 2))
for grams in (unigram, bigram):
    print(grams[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]


In [None]:
from nltk import FreqDist

# Count how often each n-gram occurs and show the five most frequent ones
# for unigrams and bigrams in turn.
for label, grams in (
    ('Популярные униграммы: ', unigram),
    ('Популярные биграммы: ', bigram),
):
    top_five = FreqDist(grams).most_common(5)
    print(label, top_five)

Популярные униграммы:  [(('all',), 2), (('work',), 2), (('and',), 2), (('no',), 2), (('play',), 2)]
Популярные биграммы:  [(('all', 'work'), 2), (('work', 'and'), 2), (('and', 'no'), 2), (('no', 'play'), 2), (('play', 'makes'), 1)]


#### Стоп-слова

In [None]:
# Download NLTK's stop-word corpus, then import its corpus reader.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Collect the English stop-word list into a set so that the membership tests
# in the filtering step are constant-time.
english_stop_list = stopwords.words('english')
stopWords = set(english_stop_list)
print(stopWords)

{'can', 'his', 'again', 'any', 'if', "mustn't", 'she', 'had', 'them', 'under', 'doing', "doesn't", "needn't", 'y', 'are', 'down', 'that', 'been', 'wasn', 'both', 'mustn', 'off', 'until', 'needn', 'a', 'me', "it's", 'mightn', 'ain', 'i', 'ourselves', 'very', 'some', 'after', 'shan', 'did', 'we', 'aren', 'but', 'you', 'out', 'who', 'once', "you've", 'while', "hasn't", 'yours', 'through', 'an', 'here', 'shouldn', 'is', 'themselves', 'there', "hadn't", 're', 'ours', 'about', "wasn't", 'wouldn', 'itself', "you're", "don't", "you'd", 'its', 'm', 'haven', 'on', 'doesn', 'at', "that'll", "didn't", 'hasn', 'the', 'between', 'didn', 'by', 't', "shan't", 'has', "weren't", "she's", 'yourself', 'hers', "you'll", 'was', 'don', 'or', 'hadn', 'these', 'from', 'does', 'should', 'too', 'over', 'myself', 'her', 'll', "mightn't", 'himself', 'why', 'most', 'nor', 'it', 'isn', 'weren', 'such', 'with', 'for', 'not', 'ma', 'our', 'during', 'so', 'than', 'have', 'am', 'will', "should've", 'won', 'my', 'few', '

In [None]:
# Keep only the tokens that are not stop words (punctuation tokens such as
# ',' are not in the stop-word set and therefore remain).
content_tokens = [tok for tok in tokens if tok not in stopWords]
print(content_tokens)

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


### Стемминг
* процесс нахождения основы слова для заданного исходного слова

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer
# Sample inputs for the stemmers below: inflected English forms and a few
# Russian word forms.
words = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ['корова', 'мальчики', 'мужчины', 'столом', 'убежала']

In [None]:
# Porter stemmer: reduce each English sample word to its stem.
ps = PorterStemmer()
[ps.stem(word) for word in words]

['game', 'game', 'game', 'game', 'compact']

In [None]:
# Snowball stemmer configured for Russian, applied to each Russian sample word.
ss = SnowballStemmer(language='russian')
[ss.stem(word_ru) for word_ru in words_ru]

['коров', 'мальчик', 'мужчин', 'стол', 'убежа']

### Лемматизация
* процесс приведения словоформы к лемме — её нормальной (словарной) форме

In [None]:
# Sample multi-line passage that the following cells run through the spaCy
# pipeline for lemmatization.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [None]:
# Load the small English pipeline by its full package name: the 'en' shortcut
# link was removed in spaCy v3, while 'en_core_web_sm' loads in both v2 and v3
# (install it with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
doc = nlp(raw)
# Join the lemma (dictionary form) of every token back into a single string.
print(' '.join(token.lemma_ for token in doc))

denni : listen , strange woman lie in pond distribute sword 
 be no basis for a system of government .   Supreme executive power derive from 
 a mandate from the masse , not from some farcical aquatic ceremony .


In [None]:
# Pair each of the first seven tokens with its lemma and part-of-speech tag.
first_tokens = doc[:7]
[(tok.lemma_, tok.pos_) for tok in first_tokens]

[('denni', 'NOUN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

### Поиск шаблонов

#### Регулярные выражения

Исчерпывающий пост https://habr.com/ru/post/349860/

In [None]:
# Extract every run of digits. The pattern is a raw string so that `\d` reaches
# the regex engine unchanged instead of being an invalid string escape.
re.findall(r'\d+', 'There is some numbers: 49 and 432')

['49', '432']

In [None]:
# Replace common punctuation with spaces, then split on whitespace. Inside a
# character class '.' is already literal, so no backslash is needed; the
# pattern is a raw string to avoid invalid string escapes.
re.sub(r'[,.?!]', ' ', 'How, to? split. text!').split()

['How', 'to', 'split', 'text']

In [None]:
# Replace everything that is not an ASCII letter with a space, then split.
# The range must be written as A-Za-z: the single range A-z also spans the
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which would be kept as "letters".
re.sub(r'[^A-Za-z]', ' ', 'I 123 can 45 play 67 football').split()

['I', 'can', 'play', 'football']