Импорт библиотек 

In [347]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
import pandas as pd
import numpy as np
import spacy
import sklearn
import pymorphy2
from pymorphy2 import MorphAnalyzer
import string
import re
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

Инициализация глобальных переменных, а также наборов данных, таких как стоп-слова и знаки пунктуации

In [348]:
nltk.download('stopwords')
nltk.download('punkt')
rus = "russian"
sw = stopwords.words(rus)
morph = MorphAnalyzer()
patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language=rus)
vectorizer = CountVectorizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NitghtWay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NitghtWay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [349]:
#инициализация пути до набора данных
test_ds1_path = "C:\\course_work\\feature_extraction_NLP\\data\\test_ds1\\train.jsonl"

In [350]:
#выгрузка данных с помощью Pandas, а также просмотр
data = pd.read_json(test_ds1_path, lines=True)
data

Unnamed: 0,premise,label,hypothesis,verb,negation,genre,idx,no_negation
0,Сумма ущерба составила одну тысячу рублей. Уто...,entailment,Ранее местный житель совершал подобное правона...,судить,no_negation,kp,0,
1,"Перебрасываясь словечками, они скользят глазам...",contradiction,Они что-то понимают,смотреть,no_negation,fiction,1,
2,"— Разве что, — сказала она, — мы хотим где-ниб...",neutral,"Это “ Таганская ”, а не “ Тульская ”.",понять,no_negation,fiction,2,
3,"Зима, наконец, показала свой характер.",contradiction,У зимы есть свой характер.,показать,no_negation,kp,3,
4,ГуманностьБогаподобнымсценариемнепредполагаетс...,contradiction,Это должно быть просто.,сказать,no_negation,fiction,4,
...,...,...,...,...,...,...,...,...
433,Тверские спортсмены показали отличный результа...,entailment,У тверских спортсменов пять новых медалей.,суметь,no_negation,kp,433,
434,"Просто я не хочу с ним жить и, извини, больше ...",contradiction,Этого было вполне достаточно.,уверять,no_negation,fiction,434,
435,"Второй аргумент: все читаем и видим, что надви...",neutral,Надвигается вторая волна кризиса.,видеть,negation,interfax,435,
436,"— Да, сын. Здравствуй. — Ты можешь говорить, н...",neutral,Сын не занят,говорить,no_negation,fiction,436,


In [351]:
#Удаление не используемых столбцов
data = data.drop(['label', 'hypothesis', 'verb','negation'
,'genre','idx','no_negation'], axis=1)
data.head()

Unnamed: 0,premise
0,Сумма ущерба составила одну тысячу рублей. Уто...
1,"Перебрасываясь словечками, они скользят глазам..."
2,"— Разве что, — сказала она, — мы хотим где-ниб..."
3,"Зима, наконец, показала свой характер."
4,ГуманностьБогаподобнымсценариемнепредполагаетс...


Вспомогательные функции

In [352]:
#ф-я для вывода текстовых данных в удобном формате
def print_table(array):
    table_width = 5
    for i in range(0, len(array), table_width):
        row = array[i:i + table_width]
        formatted_row = " ".join(["{:<10}".format(str(elem)) for elem in row])
        print(formatted_row)

In [353]:
#ф-я для удаления символов пунктуации
def remove_punctuation(text):
    return ''.join([ch for ch in text if ch not in 
    string.punctuation])

In [354]:
#ф-я для удаления чисел
def remove_numbers(text):
    return ''.join([i if not i.isdigit() else ' ' for i in text]) 

In [355]:
#ф-я для удаления последовательностей
def remove_patterns(text):
    text = re.sub(patterns, ' ', text)
    return text

In [356]:
#ф-я для удаления не буквенных символов
def remove_notalpha(text):
    return ''.join(i if i.isalpha() else ' ' for i in text)

Стемминг с использованием Snowball

In [357]:
example_text = data['premise'][4] #Текст для примера
print(example_text) #Вывод текста для примера

ГуманностьБогаподобнымсценариемнепредполагается. Но Его благость остается в неприкосновенности. Непросто жить в таком мире, но кто сказал, что это должно быть просто?


In [358]:
#ф-я токенизации с использованием Snowball
def tokenize_snowball(text):
    text = remove_notalpha(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_patterns(text)
    sentences = sent_tokenize(text, language=rus)
    all_words = []
    for sent in sentences:
        words = word_tokenize(sent, language=rus)
        for w in words:
            if w in sw or len(w) >= 11:
                continue
            else:
                all_words.append(snowball.stem(w))
            
    return all_words

In [359]:
#Вывод результата с использованем ф-ии
print_table(tokenize_snowball(example_text)) 

но         ег         благост    оста       непрост   
жит        так        мир        сказа      эт        
должн      прост     


Стемминг с использованием алгоритма Lancaster

In [360]:
#ф-я токенизации с использованием Lancaster
def tokenize_lancaster(text):
    text = remove_notalpha(text)
    text = remove_numbers(text)
    text = remove_patterns(text)
    text = remove_punctuation(text)
    sentences = sent_tokenize(text, language=rus)
    all_words = []
    for sent in sentences:
        words = word_tokenize(sent, language=rus)
        for w in words:
            if w in sw or len(w) >= 11:
                continue
            else:
                all_words.append(lancaster.stem(w))
    return all_words

In [361]:
#вывод примера
print_table(tokenize_lancaster(example_text))

но         его        благость   остается   непросто  
жить       таком      мире       сказал     это       
должно     просто    


Лемматизация с помощью PyMorphy2

In [362]:
#ф-я токенизации с помощью PyMorphy2
def tokenize_morphy(text):
    text = remove_notalpha(text)
    text = remove_patterns(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    sentences = sent_tokenize(text, language=rus)
    all_words = []
    for sent in sentences:
        words = word_tokenize(sent, language=rus)
        for w in words:
            if w in sw or len(w) >= 11:
                continue
            else:
                all_words.append(morph.normal_forms(w)[0])
    if len(all_words) > 2:
        return all_words
    else:
        return None
    

In [363]:
#вывод примера
print_table(tokenize_morphy(example_text))

но         он         благость   оставаться непросто  
жить       такой      мир        сказать    это       
должный    просто    


Создание отформатированных наборов данных с помощью Snowball, Lancaster, PyMorphy2

In [364]:
#Формирование наборов данных
df_morphy = pd.DataFrame({'text':[tokenize_morphy(sample) for sample in data['premise']]})
df_snowball = pd.DataFrame({'text':[tokenize_snowball(sample) for sample in data['premise']]})
df_lancaster = pd.DataFrame({'text':[tokenize_lancaster(sample) for sample in data['premise']]})
#Очистка от пустот
df_lancaster = df_lancaster.dropna()
df_morphy = df_morphy.dropna()
df_snowball = df_snowball.dropna()

Функции для векторизации текста с использованием словаря Python

In [365]:
#ф-я формирования словаря
def vectorize(doc):
    features = defaultdict(int)
    for token in doc:
        features[token] += 1
    return features

In [366]:
#формирование словаря со всеми словами
def make_dictionary_words(df):
    dictionary_whole_words = defaultdict(int)
    for doc in df['text']:
        for token in doc:
            dictionary_whole_words[token] += 1
    return dictionary_whole_words

In [367]:
#ф-я конвертации предложения в вектор на основе общего словаря
def sentence_to_vector(sentence, dictionary_whole_words):
    sentence = np.array(sentence)
    vector = np.zeros(len(dictionary_whole_words))
    dict_values = np.array(list(dictionary_whole_words.values()))
    dict_keys = np.array(list(dictionary_whole_words.keys()))
    sorter = np.argsort(dict_keys)
    for idx in sorter[np.searchsorted(dict_keys, sentence, sorter = sorter)]:
        vector[idx] = dict_values[idx]
    return vector 

Пример использования векторизации с помощью словаря

In [368]:
whole_dict = make_dictionary_words(df_morphy) #общий словарь на основе всего набора данных
whole_dict

defaultdict(int,
            {'сумма': 4,
             'ущерб': 2,
             'составить': 3,
             'один': 22,
             'тысяча': 4,
             'рубль': 4,
             'уточняться': 3,
             'место': 16,
             'выехать': 3,
             'группа': 5,
             'который': 34,
             'установить': 8,
             'личность': 2,
             'они': 14,
             'оказаться': 14,
             'местный': 4,
             'житель': 7,
             'ранее': 7,
             'судить': 9,
             'подобный': 3,
             'словечко': 2,
             'скользить': 2,
             'глаз': 8,
             'мой': 25,
             'город': 6,
             'как': 26,
             'сметь': 5,
             'смотреть': 8,
             'понимать': 40,
             'разве': 3,
             'сказать': 122,
             'хотеть': 28,
             'выпить': 2,
             'кофе': 1,
             'я': 81,
             'казаться': 13,
             'на': 8,
       

In [369]:
#пример формирования вектора отдельного корпуса текста 
print(sentence_to_vector(df_morphy['text'][1], whole_dict)[20:])

[2. 2. 8. ... 0. 0. 0.]


Векторизация текста Sklearn

In [370]:
df_morphy.head()

Unnamed: 0,text
0,"[сумма, ущерб, составить, один, тысяча, рубль,..."
1,"[словечко, скользить, глаз, мой, город, как, с..."
2,"[разве, сказать, хотеть, выпить, кофе, я, каза..."
3,"[зима, показать, свой, характер]"
4,"[но, он, благость, оставаться, непросто, жить,..."


In [372]:
vec = CountVectorizer()
corpus = []
for sent in df_snowball['text']:
    corpus.append(' '.join(sent))
X = vec.fit_transform(corpus)
pd.DataFrame(vec.transform(corpus).toarray(), columns=sorted(vec.vocabulary_.keys()))

Unnamed: 0,авар,аварийн,авиасалон,автомобил,автор,агентств,агрессивн,адвокат,адрес,адск,...,яблон,явк,явля,ядерн,язык,якоб,январ,ярост,ясн,ясност
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [373]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
vec = CountVectorizer()
corpus = []
for sent in df_morphy['text']:
    corpus.append(' '.join(sent))
vec.fit(corpus)
whole_list_words = [w for w in sorted(vec.vocabulary_.keys())]
pd.DataFrame(vec.transform(corpus).toarray(), columns=sorted(vec.vocabulary_.keys()))

Unnamed: 0,аварийный,авария,авиасалон,автомобиль,автор,агентство,агрессивный,адвокат,адрес,адский,...,яблоня,явка,являться,ядерный,язык,якобы,январь,ярость,ясно,ясность
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#vectorized_input = vec.transform(X)
print(len(vec.get_feature_names_out()))

2532


In [None]:
vec = TfidfVectorizer()
vec.fit(corpus)
pd.DataFrame(vec.transform(corpus).toarray(), columns=sorted(vec.vocabulary_.keys()))

Unnamed: 0,аварийный,авария,авиасалон,автомобиль,автор,агентство,агрессивный,адвокат,адрес,адский,...,яблоня,явка,являться,ядерный,язык,якобы,январь,ярость,ясно,ясность
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(vec.get_feature_names_out())

['аварийный' 'авария' 'авиасалон' ... 'ярость' 'ясно' 'ясность']


In [None]:
df_morphy_copy = df_morphy['text'].apply(lambda col: ' '.join(col)).copy()

In [None]:
corpus = df_morphy_copy
vectors = vectorizer.fit_transform(corpus)
print(vectors)

  (0, 2156)	1
  (0, 2388)	1
  (0, 2049)	1
  (0, 1213)	1
  (0, 2271)	1
  (0, 1870)	1
  (0, 2370)	1
  (0, 924)	1
  (0, 273)	1
  (0, 381)	1
  (0, 798)	1
  (0, 2361)	1
  (0, 863)	1
  (0, 1232)	1
  (0, 1223)	1
  (0, 923)	1
  (0, 531)	1
  (0, 1799)	1
  (0, 2147)	1
  (0, 1450)	1
  (1, 1977)	1
  (1, 1954)	1
  (1, 330)	1
  (1, 949)	1
  (1, 353)	1
  :	:
  (434, 163)	1
  (434, 338)	2
  (434, 981)	1
  (434, 968)	1
  (434, 1352)	1
  (434, 255)	2
  (434, 213)	1
  (434, 24)	1
  (434, 2472)	1
  (434, 815)	1
  (434, 763)	1
  (435, 386)	1
  (435, 338)	1
  (435, 968)	1
  (435, 2270)	1
  (435, 2174)	1
  (435, 589)	1
  (435, 638)	1
  (436, 1944)	1
  (436, 921)	1
  (436, 2189)	1
  (436, 2082)	1
  (436, 2203)	1
  (436, 2332)	1
  (436, 19)	1


Векторизация Gensim

In [None]:
corpus = [doc for doc in df_morphy['text']]
id2word = gensim.corpora.Dictionary(corpus)
vectors = [id2word.doc2bow(doc) for doc in corpus]
for v in vectors:
    print(v)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
[(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]
[(29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1)]
[(46, 1), (47, 1), (48, 1), (49, 1)]
[(34, 1), (37, 1), (44, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)]
[(59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1)]
[(13, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1)]
[(11, 1), (34, 1), (72, 1), (73, 1), (80, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1)]
[(8, 1), (40, 1), (41, 1), (43, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98

Реализация BagOfWords

In [None]:
text_series = df_morphy['text']
allsentences = []
for sent in text_series:
    allsentences.append(' '.join(sent))
allsentences = pd.Series(allsentences)
print(allsentences)

0      сумма ущерб составить один тысяча рубль уточня...
1      словечко скользить глаз мой город как сметь см...
2      разве сказать хотеть выпить кофе я казаться ск...
3                            зима показать свой характер
4      но он благость оставаться непросто жить такой ...
                             ...                        
432    тверской спортсмен показать отличный результат...
433    просто хотеть жить извинить сказать ты нечего ...
434    второй аргумент читаемый видеть второй волна к...
435           да сын здравствуй ты мочь говорить занятый
436        там сказать мера спуск тело умерший альпинист
Length: 437, dtype: object
