In [1]:
import linora as la

In [2]:
# Toy corpus: four short English sentences that the cells below clean,
# tokenize and vectorize.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# filter punctuation

In [3]:
# Strip punctuation from every sentence; in the output below the trailing
# '.' and '?' are gone. The result overwrites `corpus`.
corpus = la.text.filter_punctuation(corpus)
corpus

['This is the first document',
 'This document is the second document',
 'And this is the third one',
 'Is this the first document']

In [4]:
# Tokenize each sentence on single spaces. Tokens keep their original case,
# so e.g. 'This' and 'this' remain distinct words in the cells that follow.
# The isinstance guard keeps this cell idempotent: `corpus` is overwritten
# here, so on a re-run the entries are already token lists and are passed
# through unchanged instead of raising AttributeError on list.split.
corpus = [doc.split(' ') if isinstance(doc, str) else doc for doc in corpus]
corpus

[['This', 'is', 'the', 'first', 'document'],
 ['This', 'document', 'is', 'the', 'second', 'document'],
 ['And', 'this', 'is', 'the', 'third', 'one'],
 ['Is', 'this', 'the', 'first', 'document']]

# CountVectorizer

In [5]:
# Build the bag-of-words count table from the tokenized corpus.
# Two values come back: `x` (the per-document count table) and `scale`
# (the list of vocabulary words, in the same order as the columns of `x`).
x, scale = la.text.CountVectorizer(corpus)

In [6]:
# Show the count table: one row per document, one `columns_<word>` column
# per vocabulary word. Matching is case-sensitive ('This' vs 'this').
x

Unnamed: 0,columns_this,columns_second,columns_Is,columns_one,columns_document,columns_third,columns_first,columns_the,columns_is,columns_This,columns_And
0,0,0,0,0,1,0,1,1,1,1,0
1,0,1,0,0,2,0,0,1,1,1,0
2,1,0,0,1,0,1,0,1,1,0,1
3,1,0,1,0,1,0,1,1,0,0,0


In [7]:
# Show the vocabulary list returned alongside the count table.
scale

['this',
 'second',
 'Is',
 'one',
 'document',
 'third',
 'first',
 'the',
 'is',
 'This',
 'And']

# TfidfVectorizer

In [8]:
# Turn the raw counts in `x` into TF-IDF weights (same rows and columns,
# float values). The result is only displayed, not stored.
la.text.TfidfVectorizer(x)

Unnamed: 0,columns_this,columns_second,columns_Is,columns_one,columns_document,columns_third,columns_first,columns_the,columns_is,columns_This,columns_And
0,0.0,0.0,0.0,0.0,0.418127,0.0,0.51647,0.341846,0.418127,0.51647,0.0
1,0.0,0.504371,0.0,0.0,0.643868,0.0,0.0,0.263202,0.321934,0.397652,0.0
2,0.380147,0.0,0.0,0.482169,0.0,0.482169,0.0,0.251616,0.307762,0.0,0.482169
3,0.461153,0.0,0.584914,0.0,0.373343,0.0,0.461153,0.305232,0.0,0.0,0.0


# word count

In [9]:
# Count how often each token occurs across all documents; the displayed
# result is a Counter mapping word -> frequency.
word_count_dict = la.text.word_count(corpus)
word_count_dict

Counter({'This': 2,
         'is': 3,
         'the': 4,
         'first': 2,
         'document': 4,
         'second': 1,
         'And': 1,
         'this': 2,
         'third': 1,
         'one': 1,
         'Is': 1})

# low-frequency words

In [10]:
# List the rare words: with threshold=1 the output contains exactly the
# words that occur once in the corpus.
la.text.word_low_freq(word_count_dict, threshold=1)

['second', 'And', 'third', 'one', 'Is']

# high-frequency words

In [11]:
# List the frequent words: with threshold=3 the output contains the words
# that occur three or more times in the corpus.
la.text.word_high_freq(word_count_dict, threshold=3)

['is', 'the', 'document']

# filter word

In [12]:
# Remove the once-only words from every document. The filtered corpus is
# only displayed; `corpus` itself is not reassigned here.
la.text.filter_word(
    corpus,
    la.text.word_low_freq(word_count_dict, threshold=1),
)

[['This', 'is', 'the', 'first', 'document'],
 ['This', 'document', 'is', 'the', 'document'],
 ['this', 'is', 'the'],
 ['this', 'the', 'first', 'document']]

# word to index

In [13]:
# Map every distinct token to an integer id. In the output the ids start
# at 1 (0 later appears as the padding value).
word_index_dict = la.text.word_to_index(corpus)
word_index_dict

{'this': 1,
 'second': 2,
 'Is': 3,
 'one': 4,
 'document': 5,
 'third': 6,
 'first': 7,
 'the': 8,
 'is': 9,
 'This': 10,
 'And': 11}

# word index sequence

In [14]:
# Replace each token with its integer id from `word_index_dict`,
# giving one list of ids per document.
word_index_sequence = la.text.word_index_sequence(corpus, word_index_dict)
word_index_sequence

[[10, 9, 8, 7, 5], [10, 5, 9, 8, 2, 5], [11, 1, 9, 8, 6, 4], [3, 1, 8, 7, 5]]

# select best length

In [15]:
# Pick a common sequence length for the corpus; for these four documents
# with sample_rate=0.7 the result is 6.
la.text.select_best_length(corpus, sample_rate=0.7)

6

# pad sequences

In [16]:
# Bring every id sequence to the selected length; in the output the
# shorter sequences get zeros added at the front.
word_index_sequence = la.text.pad_sequences(
    word_index_sequence,
    la.text.select_best_length(corpus, sample_rate=0.7),
)
word_index_sequence

[[0, 10, 9, 8, 7, 5],
 [10, 5, 9, 8, 2, 5],
 [11, 1, 9, 8, 6, 4],
 [0, 3, 1, 8, 7, 5]]